1
0

Document scraper

This commit is contained in:
Jon Michael Aanes 2025-06-25 00:07:09 +02:00
parent bd3fc56a57
commit 8feb3e2cde

View File

@ -15,18 +15,25 @@ class DeduplicateMode(Enum):
@dataclasses.dataclass(frozen=True) @dataclasses.dataclass(frozen=True)
class Scraper(abc.ABC): class Scraper(abc.ABC):
"""Base scraper class."""
session: requests.Session session: requests.Session
@staticmethod @staticmethod
def dataset_name() -> str: def dataset_name() -> str:
pass """Indicates the filename of the produced dataset. Must be overridden
by the implementation."""
@staticmethod @staticmethod
def deduplicate_mode() -> DeduplicateMode: def deduplicate_mode() -> DeduplicateMode:
pass """Indicates how the rows should be deduplicated. Must be overridden
by the implementation."""
@staticmethod @staticmethod
def deduplicate_ignore_columns() -> list[str]: def deduplicate_ignore_columns() -> list[str]:
"""Indicates columns which are not included in the deduplication check.
SQL comparison: Columns not in this list are part of the primary key.
"""
return [] return []
@staticmethod @staticmethod
@ -35,8 +42,9 @@ class Scraper(abc.ABC):
@staticmethod @staticmethod
def requires_cfscrape() -> bool: def requires_cfscrape() -> bool:
"""Whether the scraper requires advanced CloudFlare circumvention."""
return False return False
@abc.abstractmethod @abc.abstractmethod
def scrape(self) -> Iterator[Mapping[str, object]]: def scrape(self) -> Iterator[Mapping[str, object]]:
pass """Implementation of the scraper."""