personal-data/personal_data/main.py

import datetime
import inspect
import logging
from collections.abc import Sequence
from pathlib import Path

import requests
import requests_cache

from . import data, fetchers, notification, util

# Module-level logger; used by the optional-import fallbacks below and by the
# rest of this module.
logger = logging.getLogger(__name__)

# cfscrape is an optional dependency. When the import fails the name is bound
# to None so later code can test `cfscrape` for truthiness before using it.
try:
    import cfscrape
except ImportError:
    cfscrape = None
    logger.warning('cfscrape not installed: Certain fetchers might not work')

# browsercookie is an optional dependency as well; main() calls
# `browsercookie.firefox()` when a cookiejar is requested. On import failure
# the name is bound to None.
try:
    import browsercookie
except ImportError:
    logger.warning('browsercookie not installed: Certain fetchers might not work')
    browsercookie = None


# Directory (relative to the current working directory) that holds the HTTP
# cache ('web_cache') and the per-dataset CSV files written by main().
OUTPUT_PATH = Path('./output')

# Configure the root logger's message format at import time.
# NOTE: the optional-import warnings earlier in this module are emitted before
# this call runs.
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(module)s:%(lineno)d - %(message)s',
)
# Only this module's logger is raised to INFO here; other loggers are untouched.
logger.setLevel('INFO')


# Browser-like HTTP request headers (Firefox 122 on Linux user agent).
# NOTE(review): not referenced anywhere in the visible part of this module;
# presumably consumed by fetchers elsewhere - confirm before removing.
STANDARD_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
}


# CachedCfScrape can only be defined when the optional cfscrape import
# succeeded, because it subclasses cfscrape.CloudflareScraper. get_session()
# checks `cfscrape` before referencing this class.
if cfscrape:

    class CachedCfScrape(requests_cache.CacheMixin, cfscrape.CloudflareScraper):
        """CloudflareScraper session class with requests_cache.CacheMixin mixed in."""

        pass


# Passed as `expire_after` to the cached sessions built in get_session().
CACHE_EXPIRE_DEFAULT = datetime.timedelta(days=7)


def get_session(
    cookiejar: Sequence,
    *,
    with_cfscrape: bool,
    ignore_cache: bool,
) -> requests.Session:
    """Build the HTTP session handed to a scraper.

    Args:
        cookiejar: Iterable of cookie objects to install on the session.
        with_cfscrape: Request a cfscrape-based session. Only honoured when
            the optional ``cfscrape`` module could be imported; otherwise a
            plain ``requests``-based session is used instead.
        ignore_cache: When true, return a session without the HTTP cache
            (and log a warning); otherwise use a cached session stored under
            ``OUTPUT_PATH / 'web_cache'`` with ``CACHE_EXPIRE_DEFAULT``.

    Returns:
        A session with every cookie from ``cookiejar`` installed, regardless
        of whether caching is enabled.
    """
    use_cfscrape = bool(with_cfscrape and cfscrape)

    if ignore_cache:
        logger.warning('HTTP cache disabled')
        session = cfscrape.create_scraper() if use_cfscrape else requests.Session()
    else:
        session_class = (
            CachedCfScrape if use_cfscrape else requests_cache.CachedSession
        )
        session = session_class(
            OUTPUT_PATH / 'web_cache',
            cookies=cookiejar,
            expire_after=CACHE_EXPIRE_DEFAULT,
        )

    # Install cookies on every code path: previously the uncached branches
    # returned early and silently dropped the caller's cookies.
    for cookie in cookiejar:
        session.cookies.set_cookie(cookie)
    return session


def available_scrapers() -> list[type[data.Scraper]]:
    """Collect the non-abstract scraper classes reachable from ``data.Scraper``.

    Fetcher modules are loaded first via ``fetchers.load_fetcher_modules()``.
    The class tree is then walked with a stack: abstract classes are expanded
    into their direct subclasses, while the first non-abstract class found on
    each branch is collected (its own subclasses are not explored).
    """
    fetchers.load_fetcher_modules()

    concrete = []
    pending = [data.Scraper]
    while len(pending) > 0:
        candidate = pending.pop()
        if not inspect.isabstract(candidate):
            concrete.append(candidate)
            continue
        # Abstract class: look at its direct subclasses instead.
        pending += candidate.__subclasses__()
    return concrete


def available_scraper_names() -> list[str]:
    """Return the class name of every scraper found by ``available_scrapers()``."""
    names = []
    for found_cls in available_scrapers():
        names.append(found_cls.__name__)
    return names


def main(
    scraper_filter: frozenset[str],
    *,
    use_cookiejar: bool,
    ignore_cache: bool,
    notification_types: frozenset[notification.NotificationType],
) -> None:
    """Run the selected scrapers and append their results to CSV files.

    Args:
        scraper_filter: Class names of the scrapers to run; every other
            available scraper is skipped.
        use_cookiejar: Load cookies from Firefox via ``browsercookie`` and
            install them on each session. When false, no cookies are used.
        ignore_cache: Forwarded to ``get_session`` to disable the HTTP cache.
        notification_types: Forwarded to ``notification.send_notifications``
            whenever a dataset was extended.

    Raises:
        RuntimeError: If ``use_cookiejar`` is set but the optional
            ``browsercookie`` module is not installed.
    """
    if use_cookiejar:
        if browsercookie is None:
            # Fail with a clear message instead of an AttributeError on None.
            msg = 'browsercookie not installed: Cannot load cookiejar from firefox'
            raise RuntimeError(msg)
        cookiejar = browsercookie.firefox()
        logger.info('Got cookiejar from firefox: %s cookies', len(cookiejar))
    else:
        cookiejar = []
        logger.warning('No cookiejar is used')

    if not notification_types:
        logger.info('No notifications enabled: Notifications will not be sent!')

    for scraper_cls in available_scrapers():
        scraper_name = scraper_cls.__name__
        # Filter first so that no session or scraper is built for scrapers
        # that are not going to run.
        if scraper_name not in scraper_filter:
            continue

        # The context manager closes the session once this scraper is done,
        # including when the scrape fails and we `continue`.
        with get_session(
            cookiejar,
            with_cfscrape=scraper_cls.requires_cfscrape(),
            ignore_cache=ignore_cache,
        ) as session:
            scraper = scraper_cls(session)
            logger.info(
                'Running %s, appending to "%s"',
                scraper_name,
                scraper.dataset_name,
            )
            try:
                result_rows = list(scraper.scrape())
            except requests.exceptions.HTTPError:
                # One failing scraper must not abort the remaining ones.
                logger.exception('Failed in running %s', scraper_name)
                continue

            status = util.extend_csv_file(
                OUTPUT_PATH / f'{scraper.dataset_name}.csv',
                result_rows,
                deduplicate_mode=scraper.deduplicate_mode,
                deduplicate_ignore_columns=scraper.deduplicate_ignore_columns,
            )
            logger.info('Scraper done: %s', scraper.dataset_name)

            # Notify only when new rows were actually added, passing the last
            # row reported by extend_csv_file.
            if status['extended']:
                notification.send_notifications(
                    session,
                    scraper_name,
                    status['dicts'][-1],
                    notification_types,
                )
Code quality 2024-10-25 20:24:33 +00:00			`import datetime`
Ruff 2024-06-02 16:01:18 +00:00			`import inspect`
Mailgun and secrets 2024-04-16 22:38:57 +00:00			`import logging`
Ruff 2024-08-25 18:50:03 +00:00			`from collections.abc import Sequence`
			`from pathlib import Path`
Mailgun and secrets 2024-04-16 22:38:57 +00:00
			`import requests`
			`import requests_cache`

Ruff 2024-11-17 16:09:41 +00:00			`from . import data, fetchers, notification, util`
Working on calendar export for data 2024-08-25 18:08:41 +00:00
Removed playstation 2024-05-09 15:58:39 +00:00			`logger = logging.getLogger(__name__)`

Mailgun and secrets 2024-04-16 22:38:57 +00:00			`try:`
			`import cfscrape`
			`except ImportError:`
			`cfscrape = None`
Removed playstation 2024-05-09 15:58:39 +00:00			`logger.warning('cfscrape not installed: Certain fetchers might not work')`

			`try:`
			`import browsercookie`
			`except ImportError:`
			`logger.warning('browsercookie not installed: Certain fetchers might not work')`
			`browsercookie = None`
Mailgun and secrets 2024-04-16 22:38:57 +00:00

Use paths 2024-07-27 00:14:01 +00:00			`OUTPUT_PATH = Path('./output')`
Mailgun and secrets 2024-04-16 22:38:57 +00:00
Expanded logging format 2024-11-26 22:26:12 +00:00			`logging.basicConfig(`
			`format='%(asctime)s %(levelname)s %(module)s:%(lineno)d - %(message)s',`
			`)`
Mailgun and secrets 2024-04-16 22:38:57 +00:00			`logger.setLevel('INFO')`


			`STANDARD_HEADERS = {`
			`'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0',`
			`'Accept-Language': 'en-US,en;q=0.5',`
			`'Accept-Encoding': 'gzip, deflate, br',`
			`}`


Improve behaviour without cfscrape 2024-04-17 22:02:13 +00:00			`if cfscrape:`
Ruff check 2024-04-23 20:58:25 +00:00
Improve behaviour without cfscrape 2024-04-17 22:02:13 +00:00			`class CachedCfScrape(requests_cache.CacheMixin, cfscrape.CloudflareScraper):`
			`pass`
Mailgun and secrets 2024-04-16 22:38:57 +00:00
Ruff 2024-10-03 21:24:12 +00:00
Improved caching situation 2024-08-25 19:18:55 +00:00			`CACHE_EXPIRE_DEFAULT = datetime.timedelta(days=7)`
Mailgun and secrets 2024-04-16 22:38:57 +00:00
Ruff 2024-10-03 21:24:12 +00:00
Ruff 2024-06-06 21:50:29 +00:00			`def get_session(`
			`cookiejar: Sequence,`
			`*,`
			`with_cfscrape: bool,`
			`ignore_cache: bool,`
			`) -> requests.Session:`
Code quality 2024-10-25 20:24:33 +00:00			`if with_cfscrape and cfscrape:`
Improve behaviour without cfscrape 2024-04-17 22:02:13 +00:00			`session_class = CachedCfScrape`
PSN Profiles: Implemented pagination 2024-10-23 19:29:53 +00:00			`if ignore_cache:`
			`logger.warning('HTTP cache disabled')`
			`return cfscrape.create_scraper()`
			`else:`
			`session_class = requests_cache.CachedSession`
			`if ignore_cache:`
			`logger.warning('HTTP cache disabled')`
			`return requests.Session()`
Ruff 2024-10-03 21:24:12 +00:00			`session = session_class(`
Ruff 2024-10-10 22:54:01 +00:00			`OUTPUT_PATH / 'web_cache',`
			`cookies=cookiejar,`
			`expire_after=CACHE_EXPIRE_DEFAULT,`
Ruff 2024-10-03 21:24:12 +00:00			`)`
Mailgun and secrets 2024-04-16 22:38:57 +00:00			`for cookie in cookiejar:`
			`session.cookies.set_cookie(cookie)`
			`return session`

Ruff check 2024-04-23 20:58:25 +00:00
Working on calendar export for data 2024-08-25 18:08:41 +00:00			`def available_scrapers() -> list[type[data.Scraper]]:`
Enforce loading of fetchers 2024-11-17 10:30:37 +00:00			`fetchers.load_fetcher_modules()`
Kraken use fin_depo 2024-06-02 16:00:55 +00:00			`subclasses = []`
Working on calendar export for data 2024-08-25 18:08:41 +00:00			`class_queue = [data.Scraper]`
Kraken use fin_depo 2024-06-02 16:00:55 +00:00			`while class_queue:`
			`clazz = class_queue.pop()`
			`if inspect.isabstract(clazz):`
			`class_queue.extend(clazz.__subclasses__())`
			`else:`
			`subclasses.append(clazz)`
			`return subclasses`
Ruff check 2024-04-23 20:58:25 +00:00
Ruff 2024-05-09 14:59:56 +00:00
Improved help 2024-04-28 21:45:47 +00:00			`def available_scraper_names() -> list[str]:`
			`return [scraper_cls.__name__ for scraper_cls in available_scrapers()]`


Ruff 2024-05-09 14:59:56 +00:00			`def main(`
			`scraper_filter: frozenset[str],`
			`*,`
			`use_cookiejar: bool,`
Explicit ignore cache 2024-06-02 22:00:01 +00:00			`ignore_cache: bool,`
			`notification_types: frozenset[notification.NotificationType],`
Ruff 2024-05-09 14:59:56 +00:00			`) -> None:`
Cookiejar is configurable 2024-04-17 22:13:56 +00:00			`if use_cookiejar:`
			`cookiejar = browsercookie.firefox()`
			`logger.info('Got cookiejar from firefox: %s cookies', len(cookiejar))`
			`else:`
			`cookiejar = []`
			`logger.warning('No cookiejar is used')`
Mailgun and secrets 2024-04-16 22:38:57 +00:00
Generalizing notification system 2024-06-02 21:14:19 +00:00			`if len(notification_types) == 0:`
Ruff format 2024-06-02 21:16:11 +00:00			`logger.info('No notifications enabled: Notifications will not be sent!')`
Less spammy notification 2024-05-15 22:05:30 +00:00
Improved help 2024-04-28 21:45:47 +00:00			`for scraper_cls in available_scrapers():`
Ruff 2024-06-06 21:50:29 +00:00			`session = get_session(`
			`cookiejar,`
			`with_cfscrape=scraper_cls.requires_cfscrape(),`
			`ignore_cache=ignore_cache,`
			`)`
Mailgun and secrets 2024-04-16 22:38:57 +00:00			`scraper = scraper_cls(session)`
			`if scraper_cls.__name__ not in scraper_filter:`
			`continue`
Logging 2024-04-16 22:45:15 +00:00			`logger.info(`
Mailgun and secrets 2024-04-16 22:38:57 +00:00			`'Running %s, appending to "%s"',`
			`scraper_cls.__name__,`
			`scraper.dataset_name,`
			`)`
Improved help 2024-04-28 21:45:47 +00:00			`result_rows = []`
Mailgun and secrets 2024-04-16 22:38:57 +00:00			`try:`
			`for result in scraper.scrape():`
			`result_rows.append(result)`
			`del result`
			`except requests.exceptions.HTTPError:`
			`logger.exception('Failed in running %s', scraper_cls.__name__)`
			`continue`
Code quality 2024-10-25 20:24:33 +00:00			`status = util.extend_csv_file(`
Use paths 2024-07-27 00:14:01 +00:00			`OUTPUT_PATH / f'{scraper.dataset_name}.csv',`
Mailgun and secrets 2024-04-16 22:38:57 +00:00			`result_rows,`
			`deduplicate_mode=scraper.deduplicate_mode,`
Wild 2024-11-26 23:11:30 +00:00			`deduplicate_ignore_columns=scraper.deduplicate_ignore_columns,`
Mailgun and secrets 2024-04-16 22:38:57 +00:00			`)`
Logging 2024-04-16 22:45:15 +00:00			`logger.info('Scraper done: %s', scraper.dataset_name)`
Mailgun and secrets 2024-04-16 22:38:57 +00:00
Generalizing notification system 2024-06-02 21:14:19 +00:00			`if status['extended']:`
Ruff format 2024-06-02 21:16:11 +00:00			`notification.send_notifications(`
Ruff 2024-06-06 21:50:29 +00:00			`session,`
			`scraper_cls.__name__,`
			`status['dicts'][-1],`
			`notification_types,`
Ruff format 2024-06-02 21:16:11 +00:00			`)`
Mailgun and secrets 2024-04-16 22:38:57 +00:00
			`del scraper, session`