2024-03-31 22:55:55 +00:00
|
|
|
import abc
|
2023-12-10 23:27:56 +00:00
|
|
|
import dataclasses
|
2024-05-15 22:47:42 +00:00
|
|
|
from collections.abc import Iterator, Mapping
|
2024-02-25 19:20:37 +00:00
|
|
|
from enum import Enum
|
2024-03-31 22:55:55 +00:00
|
|
|
|
|
|
|
import requests
|
|
|
|
|
2024-02-25 19:20:37 +00:00
|
|
|
|
|
|
|
class DeduplicateMode(Enum):
    """Deduplication strategies a scraper can declare for its dataset.

    A scraper reports one of these values from ``Scraper.deduplicate_mode``.
    How each strategy is actually applied is decided by the code that
    consumes the scraped rows, which is not visible in this module; the
    per-member notes below are therefore inferred from the names only.
    """

    # NOTE(review): presumably "keep every row as-is" -- confirm.
    NONE = 0

    # NOTE(review): presumably rows are keyed on their first column -- confirm.
    BY_FIRST_COLUMN = 1

    # NOTE(review): presumably rows are compared on all columns -- confirm.
    BY_ALL_COLUMNS = 2

    # NOTE(review): presumably only the most recent row is kept -- confirm.
    ONLY_LATEST = 3
|
2024-03-31 22:55:55 +00:00
|
|
|
|
2024-04-23 20:58:25 +00:00
|
|
|
|
2024-03-31 22:55:55 +00:00
|
|
|
@dataclasses.dataclass(frozen=True)
class Scraper(abc.ABC):
    """Abstract base class describing a single dataset scraper.

    The class is a frozen dataclass with one field, ``session``, and a set
    of static "descriptor" methods reporting metadata about the dataset.
    Concrete subclasses must implement :meth:`scrape`; the static methods
    provide defaults that subclasses may override.
    """

    # HTTP session supplied at construction time.
    # NOTE(review): presumably used by ``scrape`` implementations to issue
    # requests -- nothing in this base class reads it.
    session: requests.Session

    @staticmethod
    def dataset_name() -> str:
        """Return the name of the dataset produced by this scraper.

        The base implementation has no body and therefore returns ``None``
        despite the ``str`` annotation.
        NOTE(review): subclasses are presumably expected to override this.
        """

    @staticmethod
    def deduplicate_mode() -> DeduplicateMode:
        """Return the deduplication strategy for this scraper's dataset.

        The base implementation has no body and therefore returns ``None``
        despite the ``DeduplicateMode`` annotation.
        NOTE(review): subclasses are presumably expected to override this.
        """

    @staticmethod
    def deduplicate_ignore_columns() -> list[str]:
        """Return column names to ignore during deduplication.

        Defaults to a fresh empty list on every call.
        """
        return []

    @staticmethod
    def dataset_format() -> str:
        """Return the identifier of the dataset's storage format.

        Defaults to ``'list-of-dicts'``.
        """
        return 'list-of-dicts'

    @staticmethod
    def requires_cfscrape() -> bool:
        """Return whether this scraper requires ``cfscrape``.

        Defaults to ``False``.
        NOTE(review): presumably refers to the Cloudflare-bypass session
        library of that name -- confirm against the code that builds
        ``session``.
        """
        return False

    @abc.abstractmethod
    def scrape(self) -> Iterator[Mapping[str, object]]:
        """Yield the scraped rows, each as a mapping with string keys.

        Abstract: every concrete subclass must provide an implementation.
        """
|