1
0
personal-data/personal_data/main.py

144 lines
3.9 KiB
Python

import csv
import datetime
import decimal
import inspect
import io
from pathlib import Path
import logging
from collections.abc import Iterable, Mapping, Sequence
from decimal import Decimal
import requests
import requests_cache
from frozendict import frozendict
from . import notification, data
from .util import *
logger = logging.getLogger(__name__)
try:
import cfscrape
except ImportError:
cfscrape = None
logger.warning('cfscrape not installed: Certain fetchers might not work')
try:
import browsercookie
except ImportError:
logger.warning('browsercookie not installed: Certain fetchers might not work')
browsercookie = None
OUTPUT_PATH = Path('./output')
logging.basicConfig()
logger.setLevel('INFO')
STANDARD_HEADERS = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0',
# "Accept": "application/json, text/plain, */*",
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
}
if cfscrape:
class CachedCfScrape(requests_cache.CacheMixin, cfscrape.CloudflareScraper):
pass
def get_session(
cookiejar: Sequence,
*,
with_cfscrape: bool,
ignore_cache: bool,
) -> requests.Session:
assert isinstance(with_cfscrape, bool)
session_class = requests_cache.CachedSession
if ignore_cache:
logger.warn('HTTP cache disabled')
return requests.Session()
if cfscrape:
session_class = CachedCfScrape
session = session_class(OUTPUT_PATH / 'web_cache', cookies=cookiejar)
for cookie in cookiejar:
session.cookies.set_cookie(cookie)
return session
def available_scrapers() -> list[type[data.Scraper]]:
subclasses = []
class_queue = [data.Scraper]
while class_queue:
clazz = class_queue.pop()
if inspect.isabstract(clazz):
class_queue.extend(clazz.__subclasses__())
else:
subclasses.append(clazz)
return subclasses
def available_scraper_names() -> list[str]:
return [scraper_cls.__name__ for scraper_cls in available_scrapers()]
def main(
scraper_filter: frozenset[str],
*,
use_cookiejar: bool,
ignore_cache: bool,
notification_types: frozenset[notification.NotificationType],
) -> None:
if use_cookiejar:
cookiejar = browsercookie.firefox()
logger.info('Got cookiejar from firefox: %s cookies', len(cookiejar))
else:
cookiejar = []
logger.warning('No cookiejar is used')
if len(notification_types) == 0:
logger.info('No notifications enabled: Notifications will not be sent!')
for scraper_cls in available_scrapers():
session = get_session(
cookiejar,
with_cfscrape=scraper_cls.requires_cfscrape(),
ignore_cache=ignore_cache,
)
scraper = scraper_cls(session)
if scraper_cls.__name__ not in scraper_filter:
continue
logger.info(
'Running %s, appending to "%s"',
scraper_cls.__name__,
scraper.dataset_name,
)
result_rows = []
try:
for result in scraper.scrape():
result_rows.append(result)
del result
except requests.exceptions.HTTPError:
logger.exception('Failed in running %s', scraper_cls.__name__)
continue
status = extend_csv_file(
OUTPUT_PATH / f'{scraper.dataset_name}.csv',
result_rows,
deduplicate_mode=scraper.deduplicate_mode,
deduplicate_ignore_columns=scraper.deduplicate_ignore_columns(),
)
logger.info('Scraper done: %s', scraper.dataset_name)
if status['extended']:
notification.send_notifications(
session,
scraper_cls.__name__,
status['dicts'][-1],
notification_types,
)
del scraper, session