From a53c1c381d967ba463e0100c7ed02c13dcda1358 Mon Sep 17 00:00:00 2001
From: Jon Michael Aanes
Date: Sun, 25 Feb 2024 20:20:37 +0100
Subject: [PATCH] Improved deduplication

---
 personal_data/__main__.py             | 47 +++++++++++++++++++++++----
 personal_data/data.py                 |  9 +++--
 personal_data/fetchers/crunchyroll.py |  5 +--
 personal_data/fetchers/playstation.py |  5 +--
 personal_data/fetchers/psnprofiles.py |  5 +--
 5 files changed, 57 insertions(+), 14 deletions(-)

diff --git a/personal_data/__main__.py b/personal_data/__main__.py
index ccb053a..d5c2543 100644
--- a/personal_data/__main__.py
+++ b/personal_data/__main__.py
@@ -14,6 +14,10 @@ logger = logging.getLogger(__name__)
 import personal_data.fetchers.playstation
 import personal_data.fetchers.crunchyroll
 import personal_data.fetchers.psnprofiles
+import personal_data.data
+
+CSV_DIALECT = 'one_true_dialect'
+csv.register_dialect(CSV_DIALECT, lineterminator = '\n', skipinitialspace = True)
 
 def determine_scrapers():
     scrapers = []
@@ -22,12 +26,40 @@ def determine_scrapers():
     scrapers += personal_data.fetchers.psnprofiles.SCRAPERS
     return scrapers
 
-def extend_csv_file(filename, new_dicts, deduplicate = False):
+def try_value(fn, s: str) -> any:
+    try:
+        return fn(s)
+    except ValueError:
+        return None
+
+def to_value(s: str) -> any:
+    s = s.strip()
+    if len(s) == 0:
+        return None
+    if (v := try_value(int, s)) is not None:
+        return v
+    if v := try_value(datetime.date.fromisoformat, s):
+        return v
+    if v := try_value(datetime.datetime.fromisoformat, s):
+        return v
+    if s.lower() == 'false':
+        return False
+    if s.lower() == 'true':
+        return True
+    if s.lower() == 'none':
+        return None
+    return s
+
+def extend_csv_file(filename: str, new_dicts: list[dict], deduplicate_mode: personal_data.data.DeduplicateMode):
     dicts = []
     try:
         with open(filename, 'r') as csvfile:
-            reader = csv.DictReader(csvfile)
+            reader = csv.DictReader(csvfile, dialect = CSV_DIALECT)
             for row in reader:
+                for k in list(row.keys()):
+                    row[k] = to_value(row[k])
+                    if row[k] is None:
+                        del row[k]
                 dicts.append(frozendict(row))
             del csvfile
     except FileNotFoundError as e:
@@ -43,12 +75,15 @@
         for k in d.keys():
             if k not in fieldnames:
                 fieldnames.append(k)
+        del k
+    del d
 
-    if deduplicate:
-        dicts = sorted(set(dicts), key = lambda d: d[fieldnames[0]])
+    if deduplicate_mode != personal_data.data.DeduplicateMode.NONE:
+        dicts = set(dicts)
+        dicts = sorted(dicts, key = lambda d: tuple(str(d.get(fn, '')) for fn in fieldnames))
 
     csvfile_in_memory = io.StringIO()
-    writer = csv.DictWriter(csvfile_in_memory, fieldnames=fieldnames)
+    writer = csv.DictWriter(csvfile_in_memory, fieldnames=fieldnames, dialect = CSV_DIALECT)
     writer.writeheader()
     for d in dicts:
         writer.writerow(d)
@@ -83,7 +118,7 @@ def main():
             result_rows.append(result)
             del result
         extend_csv_file('output/'+scraper.dataset_name, result_rows,
-                deduplicate = scraper.deduplicate)
+                deduplicate_mode = scraper.deduplicate_mode)
         logger.warning('Scraper done: %s', scraper.dataset_name)
 
 if __name__ == '__main__':
diff --git a/personal_data/data.py b/personal_data/data.py
index 6a50f91..6c193db 100644
--- a/personal_data/data.py
+++ b/personal_data/data.py
@@ -1,10 +1,15 @@
-
 import dataclasses
+from enum import Enum
+
+class DeduplicateMode(Enum):
+    NONE = 0
+    BY_FIRST_COLUMN = 1
+    BY_ALL_COLUMNS = 2
 
 @dataclasses.dataclass
 class Scraper:
     scraper: object # TODO: Callable
     dataset_name: str
-    deduplicate: bool
+    deduplicate_mode: DeduplicateMode
     dataset_format: str = 'list-of-dicts'
 
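Review note: a quick doctest-style sketch of how the new to_value()
coercion above is expected to behave when rows are read back from an
existing CSV file (illustrative only, not part of the patch):

    >>> to_value('42')
    42
    >>> to_value('0')            # the "is not None" check keeps falsy ints
    0
    >>> to_value('2024-02-25')
    datetime.date(2024, 2, 25)
    >>> to_value(' true ')
    True
    >>> to_value('')             # empty cells become None and are dropped from the row
    >>> to_value('hello')
    'hello'
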
diff --git a/personal_data/fetchers/crunchyroll.py b/personal_data/fetchers/crunchyroll.py
index b69185a..4118c12 100644
--- a/personal_data/fetchers/crunchyroll.py
+++ b/personal_data/fetchers/crunchyroll.py
@@ -2,7 +2,7 @@ import secrets
 import functools
 import logging
 
-from personal_data.data import Scraper
+from personal_data.data import Scraper, DeduplicateMode
 
 logger = logging.getLogger(__name__)
 
@@ -65,5 +65,6 @@ def scrape_watched_last(session):
     }
 
 SCRAPERS = [
-    Scraper(scrape_watched_last, 'episodes_watched_crunchyroll', deduplicate = True)
+    Scraper(scrape_watched_last, 'episodes_watched_crunchyroll',
+            deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS)
 ]
diff --git a/personal_data/fetchers/playstation.py b/personal_data/fetchers/playstation.py
index 6187a42..1aefaf6 100644
--- a/personal_data/fetchers/playstation.py
+++ b/personal_data/fetchers/playstation.py
@@ -1,7 +1,7 @@
 import secrets
 import logging
 
-from personal_data.data import Scraper
+from personal_data.data import Scraper, DeduplicateMode
 
 logger = logging.getLogger(__name__)
 
@@ -58,6 +58,7 @@ def scrape_played_last(session):
     }
 
 SCRAPERS = [
-    Scraper(scrape_played_last, 'games_played_playstation', deduplicate = True)
+    Scraper(scrape_played_last, 'games_played_playstation',
+            deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS)
 ]
 
diff --git a/personal_data/fetchers/psnprofiles.py b/personal_data/fetchers/psnprofiles.py
index 4b4d711..dc60285 100644
--- a/personal_data/fetchers/psnprofiles.py
+++ b/personal_data/fetchers/psnprofiles.py
@@ -5,7 +5,7 @@ import logging
 import bs4
 import datetime
 
-from personal_data.data import Scraper
+from personal_data.data import Scraper, DeduplicateMode
 import personal_data.html_util
 import personal_data.parse_util
 
@@ -109,5 +109,6 @@ def scrape_personal_page(session):
         yield d
 
 SCRAPERS = [
-    Scraper(scrape_personal_page, 'games_played_playstation', deduplicate = True)
+    Scraper(scrape_personal_page, 'games_played_playstation',
+            deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS)
 ]
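
Review note: under the reworked Scraper API, a hypothetical new fetcher
module would register itself along these lines (the module, function,
dataset name, and row keys below are invented for illustration):

    import logging

    from personal_data.data import Scraper, DeduplicateMode

    logger = logging.getLogger(__name__)

    def scrape_example(session):
        # Yield one dict per output row; __main__.extend_csv_file() coerces
        # the cell values with to_value() and deduplicates rows across runs
        # according to deduplicate_mode.
        yield {'title': 'Example', 'last_played': '2024-02-25'}

    SCRAPERS = [
        Scraper(scrape_example, 'games_played_example',
                deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS)
    ]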