"""Scraper that lists repositories from a Gitea instance's search API."""

import dataclasses
import datetime
import logging
from collections.abc import Iterator, Mapping
from decimal import Decimal
from typing import Any

from personal_data.data import DeduplicateMode, Scraper

from .. import secrets

logger = logging.getLogger(__name__)


def safe_del(d: dict, *keys: str) -> None:
    """Remove each of ``keys`` from ``d`` in place, ignoring absent keys.

    Args:
        d: Dictionary to modify in place.
        *keys: Keys to remove; keys not present in ``d`` are skipped.
    """
    for key in keys:
        # pop() with a default never raises, so missing keys are harmless.
        d.pop(key, None)


def to_data_point(p: dict[str, Any]) -> Mapping[str, Any]:
    """Flatten one repository record into a data point.

    The nested ``owner`` object is replaced by its ``login`` value, and the
    ``permissions`` and ``internal_tracker`` entries are dropped if present.

    NOTE: ``p`` is modified in place and the very same object is returned.

    Args:
        p: A single repository record, as found in the ``data`` list of the
            search response.

    Returns:
        The same dictionary ``p`` after the modifications above.

    Raises:
        KeyError: If ``p`` has no ``owner`` entry, or the owner object has
            no ``login`` entry.
    """
    p['owner'] = p['owner']['login']
    safe_del(p, 'permissions', 'internal_tracker')
    return p


@dataclasses.dataclass(frozen=True)
class Gitea(Scraper):
    """Scraper yielding one data point per repository found by the search."""

    # Plain (unannotated) class attributes: they are not dataclass fields.
    dataset_name = 'gitea_repos'
    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS
    deduplicate_ignore_columns = []

    @staticmethod
    def requires_cfscrape() -> bool:
        """Return ``False``: this scraper declares no need for cfscrape."""
        return False

    def scrape(self) -> Iterator[Mapping[str, Any]]:
        """Query the repository search endpoint and yield its results.

        Sends a single GET request through ``self.session`` (provided by the
        base class; presumably a requests-style session -- TODO confirm) and
        yields each entry of the response's ``data`` list after passing it
        through ``to_data_point``.

        NOTE(review): only one request is made; if the endpoint paginates,
        results beyond the first page are not fetched -- confirm whether
        that is intended.

        Yields:
            One mapping per repository in the response.

        Raises:
            Whatever ``response.raise_for_status()`` raises on an HTTP error
            status.
            KeyError: If the decoded JSON body has no ``data`` entry.
        """
        response = self.session.get(
            'https://gitfub.space/api/v1/repos/search',
            params={
                # 'uid': 21,
                'private': True,
                'sort': 'updated',
                'order': 'desc',
                'access_token': secrets.gitea_access_token(),
            },
        )
        response.raise_for_status()

        data = response.json()
        logger.info('Got %d results from Gitea', len(data['data']))
        for p in data['data']:
            yield to_data_point(p)