import abc
import dataclasses
from collections.abc import Iterator, Mapping
from enum import Enum

import requests


class DeduplicateMode(Enum):
    NONE = 0
    BY_FIRST_COLUMN = 1
    BY_ALL_COLUMNS = 2
    ONLY_LATEST = 3


@dataclasses.dataclass(frozen=True)
class Scraper(abc.ABC):
    """Base scraper class."""

    session: requests.Session

    @staticmethod
    def dataset_name() -> str:
        """Indicates the filename of the produced dataset. Must be overwritten
        by the implementation."""
        raise NotImplementedError

    @staticmethod
    def deduplicate_mode() -> DeduplicateMode:
        """Indicates how the rows should be deduplicated. Must be overwritten
        by the implementation."""
        raise NotImplementedError

    @staticmethod
    def deduplicate_ignore_columns() -> list[str]:
        """Indicates columns which are not included in the deduplication check.

        SQL comparison: Columns not in this set are part of the primary key.
        """
        return []

    @staticmethod
    def dataset_format() -> str:
        return 'list-of-dicts'

    @staticmethod
    def requires_cfscrape() -> bool:
        """Whether the scraper requires advanced CloudFlare circumvention."""
        return False

    @abc.abstractmethod
    def scrape(self) -> Iterator[Mapping[str, object]]:
        """Implementation of the scraper."""
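
# Example (a minimal sketch, not part of the original module): how a concrete
# scraper might subclass Scraper. ExampleScraper and the
# https://example.com/items.json endpoint are made-up names used only for
# illustration; the overridden members follow the contract documented above.
class ExampleScraper(Scraper):
    @staticmethod
    def dataset_name() -> str:
        # Name of the dataset file this scraper produces (hypothetical).
        return 'example_items'

    @staticmethod
    def deduplicate_mode() -> DeduplicateMode:
        # Deduplicate rows by their first column.
        return DeduplicateMode.BY_FIRST_COLUMN

    def scrape(self) -> Iterator[Mapping[str, object]]:
        # Fetch a hypothetical JSON endpoint and yield one mapping per row.
        response = self.session.get('https://example.com/items.json')
        response.raise_for_status()
        for item in response.json():
            yield {'id': item['id'], 'value': item['value']}


# Usage sketch: ExampleScraper(session=requests.Session()).scrape() yields rows
# as mappings; the surrounding pipeline is presumably responsible for writing
# them out and deduplicating them according to deduplicate_mode().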