"""Discover the available scrapers, run them and store their results as CSV."""

import datetime
import inspect
import logging
from collections.abc import Sequence
from pathlib import Path

import requests
import requests_cache

from . import data, notification, util, fetchers

logger = logging.getLogger(__name__)

# Optional dependencies: the module stays importable without them, but the
# features relying on them are unavailable.
try:
    import cfscrape
except ImportError:
    cfscrape = None
    logger.warning('cfscrape not installed: Certain fetchers might not work')

try:
    import browsercookie
except ImportError:
    logger.warning('browsercookie not installed: Certain fetchers might not work')
    browsercookie = None

OUTPUT_PATH = Path('./output')

logging.basicConfig()
logger.setLevel('INFO')

STANDARD_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
}

if cfscrape:

    class CachedCfScrape(requests_cache.CacheMixin, cfscrape.CloudflareScraper):
        """A cfscrape ``CloudflareScraper`` with the requests_cache mixin applied."""


CACHE_EXPIRE_DEFAULT = datetime.timedelta(days=7)


def get_session(
    cookiejar: Sequence,
    *,
    with_cfscrape: bool,
    ignore_cache: bool,
) -> requests.Session:
    """Create the HTTP session a scraper should use.

    Args:
        cookiejar: Cookies to install on the session. May be empty.
        with_cfscrape: Request a cfscrape-based session. If cfscrape is not
            installed, a plain session is used instead and a warning is logged.
        ignore_cache: When true, return an uncached session; otherwise the
            session caches responses under ``OUTPUT_PATH / 'web_cache'`` and
            expires them after ``CACHE_EXPIRE_DEFAULT``.

    Returns:
        A session with every cookie from ``cookiejar`` installed, regardless
        of whether caching is enabled.
    """
    use_cfscrape = bool(with_cfscrape and cfscrape)
    if with_cfscrape and not use_cfscrape:
        logger.warning('cfscrape requested but not installed: Using a plain session')

    if ignore_cache:
        logger.warning('HTTP cache disabled')
        session = cfscrape.create_scraper() if use_cfscrape else requests.Session()
    else:
        session_class = CachedCfScrape if use_cfscrape else requests_cache.CachedSession
        session = session_class(
            OUTPUT_PATH / 'web_cache',
            cookies=cookiejar,
            expire_after=CACHE_EXPIRE_DEFAULT,
        )

    # Install the cookies on cached and uncached sessions alike; previously
    # the uncached path returned early and silently dropped them.
    for cookie in cookiejar:
        session.cookies.set_cookie(cookie)
    return session


def available_scrapers() -> list[type[data.Scraper]]:
    """Return the non-abstract scraper classes reachable from ``data.Scraper``.

    The fetcher modules are loaded first so their classes are registered as
    subclasses. Only abstract classes are descended into: subclasses of a
    concrete scraper are not searched.
    """
    fetchers.load_fetcher_modules()
    subclasses = []
    class_queue = [data.Scraper]
    while class_queue:
        clazz = class_queue.pop()
        if inspect.isabstract(clazz):
            class_queue.extend(clazz.__subclasses__())
        else:
            subclasses.append(clazz)
    return subclasses


def available_scraper_names() -> list[str]:
    """Return the class names of all available scrapers."""
    return [scraper_cls.__name__ for scraper_cls in available_scrapers()]


def main(
    scraper_filter: frozenset[str],
    *,
    use_cookiejar: bool,
    ignore_cache: bool,
    notification_types: frozenset[notification.NotificationType],
) -> None:
    """Run the selected scrapers and append their results to CSV files.

    Args:
        scraper_filter: Class names of the scrapers to run; all others are
            skipped.
        use_cookiejar: Load cookies from Firefox via browsercookie.
        ignore_cache: Disable the HTTP cache for all sessions.
        notification_types: Notification channels passed on to
            ``notification.send_notifications`` when a dataset was extended.

    Raises:
        RuntimeError: If ``use_cookiejar`` is set but browsercookie is not
            installed.
    """
    if use_cookiejar:
        if browsercookie is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise RuntimeError(
                'browsercookie not installed: Cannot load cookiejar from firefox',
            )
        cookiejar = browsercookie.firefox()
        logger.info('Got cookiejar from firefox: %s cookies', len(cookiejar))
    else:
        cookiejar = []
        logger.warning('No cookiejar is used')

    if not notification_types:
        logger.info('No notifications enabled: Notifications will not be sent!')

    for scraper_cls in available_scrapers():
        # Filter first so no session or scraper is built for skipped classes.
        if scraper_cls.__name__ not in scraper_filter:
            continue

        session = get_session(
            cookiejar,
            with_cfscrape=scraper_cls.requires_cfscrape(),
            ignore_cache=ignore_cache,
        )
        scraper = scraper_cls(session)
        logger.info(
            'Running %s, appending to "%s"',
            scraper_cls.__name__,
            scraper.dataset_name,
        )

        try:
            result_rows = list(scraper.scrape())
        except requests.exceptions.HTTPError:
            # One failing scraper must not prevent the remaining ones from running.
            logger.exception('Failed in running %s', scraper_cls.__name__)
            continue

        status = util.extend_csv_file(
            OUTPUT_PATH / f'{scraper.dataset_name}.csv',
            result_rows,
            deduplicate_mode=scraper.deduplicate_mode,
            deduplicate_ignore_columns=scraper.deduplicate_ignore_columns(),
        )
        logger.info('Scraper done: %s', scraper.dataset_name)

        if status['extended']:
            notification.send_notifications(
                session,
                scraper_cls.__name__,
                status['dicts'][-1],
                notification_types,
            )

        # Drop the references before the next scraper gets its own session.
        del scraper, session