# personal-data/personal_data/data.py

2024-03-31 22:55:55 +00:00
import abc
2023-12-10 23:27:56 +00:00
import dataclasses
2024-05-15 22:47:42 +00:00
from collections.abc import Iterator, Mapping
2024-02-25 19:20:37 +00:00
from enum import Enum
2024-03-31 22:55:55 +00:00
import requests
2024-02-25 19:20:37 +00:00
class DeduplicateMode(Enum):
    """Deduplication strategies that a scraper can declare for its dataset.

    The integer values are part of the enum's identity and must stay stable.
    The per-member notes below are inferred from the member names only.
    """

    # Presumably: keep every row, perform no deduplication -- TODO confirm.
    NONE = 0
    # Presumably: rows sharing the same first column are duplicates -- TODO confirm.
    BY_FIRST_COLUMN = 1
    # Presumably: rows are duplicates only when every column matches -- TODO confirm.
    BY_ALL_COLUMNS = 2
    # Presumably: only the most recent data is retained -- TODO confirm.
    ONLY_LATEST = 3
2024-03-31 22:55:55 +00:00
2024-04-23 20:58:25 +00:00
2024-03-31 22:55:55 +00:00
@dataclasses.dataclass(frozen=True)
class Scraper(abc.ABC):
    """Abstract base class for dataset scrapers.

    Concrete scrapers must implement :meth:`scrape`. The static methods
    describe the produced dataset; several provide defaults and may be
    overridden by subclasses.

    Instances are frozen dataclasses, so ``session`` cannot be reassigned
    after construction.
    """

    # Session object supplied at construction time; presumably used by
    # scrape() implementations to perform HTTP requests -- confirm in subclasses.
    session: requests.Session

    @staticmethod
    def dataset_name() -> str:
        """Return the name of the dataset produced by this scraper.

        NOTE(review): this method is not marked abstract, and the base
        implementation returns ``None`` despite the ``str`` annotation.
        Subclasses are expected to override it.
        """
        pass

    @staticmethod
    def deduplicate_mode() -> DeduplicateMode:
        """Return the deduplication strategy for this scraper's dataset.

        NOTE(review): this method is not marked abstract, and the base
        implementation returns ``None`` despite the ``DeduplicateMode``
        annotation. Subclasses are expected to override it.
        """
        pass

    @staticmethod
    def deduplicate_ignore_columns() -> list[str]:
        """Return column names to ignore during deduplication.

        Defaults to an empty list; a fresh list is returned on every call.
        """
        return []

    @staticmethod
    def dataset_format() -> str:
        """Return the dataset format identifier.

        Defaults to ``'list-of-dicts'``.
        """
        return 'list-of-dicts'

    @staticmethod
    def requires_cfscrape() -> bool:
        """Return whether this scraper requires cfscrape.

        Defaults to ``False``. Presumably signals that the caller must
        supply a cfscrape-based session -- confirm against the caller.
        """
        return False

    @abc.abstractmethod
    def scrape(self) -> Iterator[Mapping[str, object]]:
        """Yield the scraped records, one string-keyed mapping per record.

        Must be implemented by every concrete subclass.
        """
        pass