Document scraper
This commit is contained in:
parent
bd3fc56a57
commit
8feb3e2cde
|
@ -15,18 +15,25 @@ class DeduplicateMode(Enum):
|
|||
|
||||
@dataclasses.dataclass(frozen=True)
|
||||
class Scraper(abc.ABC):
|
||||
"""Base scraper class."""
|
||||
session: requests.Session
|
||||
|
||||
@staticmethod
|
||||
def dataset_name() -> str:
|
||||
pass
|
||||
"""Indicates the filename of the produced dataset. Must be overridden
|
||||
by the implementation."""
|
||||
|
||||
@staticmethod
|
||||
def deduplicate_mode() -> DeduplicateMode:
|
||||
pass
|
||||
"""Indicates how the rows should be deduplicated. Must be overridden
|
||||
by the implementation."""
|
||||
|
||||
@staticmethod
|
||||
def deduplicate_ignore_columns() -> list[str]:
|
||||
"""Indicates columns which are not included in the deduplication check.
|
||||
|
||||
SQL comparison: Columns not in this set are part of the primary key.
|
||||
"""
|
||||
return []
|
||||
|
||||
@staticmethod
|
||||
|
@ -35,8 +42,9 @@ class Scraper(abc.ABC):
|
|||
|
||||
@staticmethod
|
||||
def requires_cfscrape() -> bool:
|
||||
"""Whether the scraper requires advanced CloudFlare circumvention."""
|
||||
return False
|
||||
|
||||
@abc.abstractmethod
|
||||
def scrape(self) -> Iterator[Mapping[str, object]]:
|
||||
pass
|
||||
"""Implementation of the scraper."""
|
||||
|
|
Loading…
Reference in New Issue
Block a user