From 8feb3e2cde8f6aa0e7f693b40f9934f13d02e042 Mon Sep 17 00:00:00 2001 From: Jon Michael Aanes Date: Wed, 25 Jun 2025 00:07:09 +0200 Subject: [PATCH] Document scraper --- personal_data/data.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/personal_data/data.py b/personal_data/data.py index 16cc9ca..2d6a7e2 100644 --- a/personal_data/data.py +++ b/personal_data/data.py @@ -15,18 +15,25 @@ class DeduplicateMode(Enum): @dataclasses.dataclass(frozen=True) class Scraper(abc.ABC): + """Base scraper class.""" session: requests.Session @staticmethod def dataset_name() -> str: - pass + """Indicates the filename of the produced dataset. Must be overridden + by the implementation.""" @staticmethod def deduplicate_mode() -> DeduplicateMode: - pass + """Indicates how the rows should be deduplicated. Must be overridden + by the implementation.""" @staticmethod def deduplicate_ignore_columns() -> list[str]: + """Indicates columns which are not included in the deduplication check. + + SQL comparison: Columns not in this set are part of the primary key. + """ return [] @staticmethod @@ -35,8 +42,9 @@ class Scraper(abc.ABC): @staticmethod def requires_cfscrape() -> bool: + """Whether the scraper requires advanced CloudFlare circumvention.""" return False @abc.abstractmethod def scrape(self) -> Iterator[Mapping[str, object]]: - pass + """Implementation of the scraper."""