diff --git a/.gitignore b/.gitignore
index b4f00d0..c0fcabb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
 *.pyc
 __pycache__/
-/secrets.py
+/secrets/
 *.sqlite
 *.egg-info/
diff --git a/personal_data/__init__.py b/personal_data/__init__.py
index d76a9b5..e69de29 100644
--- a/personal_data/__init__.py
+++ b/personal_data/__init__.py
@@ -1,182 +0,0 @@
-import csv
-import datetime
-import io
-import logging
-
-import browsercookie
-import requests
-import requests_cache
-from frozendict import frozendict
-
-try:
-    import cfscrape
-except ImportError:
-    cfscrape = None
-
-logger = logging.getLogger(__name__)
-
-import personal_data.data
-import personal_data.fetchers.crunchyroll
-import personal_data.fetchers.ffxiv_lodestone
-import personal_data.fetchers.playstation
-import personal_data.fetchers.psnprofiles
-import personal_data.fetchers.partisia_blockchain
-from personal_data._version import __version__
-
-CSV_DIALECT = 'one_true_dialect'
-csv.register_dialect(CSV_DIALECT, lineterminator='\n', skipinitialspace=True)
-
-logging.basicConfig()
-logger.setLevel('INFO')
-
-
-def try_value(fn, s: str) -> object:
-    try:
-        return fn(s)
-    except ValueError:
-        return None
-
-
-def to_value(s: str) -> object:
-    s = s.strip()
-    if len(s) == 0:
-        return None
-    if v := try_value(int, s):
-        return v
-    if v := try_value(datetime.date.fromisoformat, s):
-        return v
-    if v := try_value(datetime.datetime.fromisoformat, s):
-        return v
-    if s.lower() == 'false':
-        return False
-    if s.lower() == 'true':
-        return True
-    if s.lower() == 'none':
-        return None
-    return s
-
-
-def extend_csv_file(
-    filename: str,
-    new_dicts: dict,
-    deduplicate_mode: personal_data.data.DeduplicateMode,
-    deduplicate_ignore_columns: list[str],
-):
-    dicts = []
-    try:
-        with open(filename) as csvfile:
-            reader = csv.DictReader(csvfile, dialect=CSV_DIALECT)
-            for row in reader:
-                for k in list(row.keys()):
-                    row[k] = to_value(row[k])
-                    if row[k] is None:
-                        del row[k]
-                dicts.append(frozendict(row))
-            del csvfile
-    except FileNotFoundError as e:
-        logger.info('Creating file: %s', filename)
-
-    original_num_dicts = len(dicts)
-    dicts += [frozendict(d) for d in new_dicts]
-    del new_dicts
-
-    fieldnames = []
-    for d in dicts:
-        for k in d.keys():
-            if k not in fieldnames:
-                fieldnames.append(k)
-            del k
-        del d
-
-    def equals_without_fields(a, b, fields = []):
-        a = dict(a)
-        b = dict(b)
-
-        for f in fields:
-            del a[f], b[f]
-
-        return frozendict(a) == frozendict(b)
-
-
-    if deduplicate_mode == personal_data.data.DeduplicateMode.ONLY_LATEST:
-        while len(dicts) >= 2 and equals_without_fields(dicts[-1], dicts[-2], deduplicate_ignore_columns):
-            del dicts[-1]
-    elif deduplicate_mode != personal_data.data.DeduplicateMode.NONE:
-        dicts = set(dicts)
-
-
-    dicts = sorted(dicts, key=lambda d: tuple(str(d.get(fn, '')) for fn in fieldnames))
-
-    csvfile_in_memory = io.StringIO()
-    writer = csv.DictWriter(
-        csvfile_in_memory,
-        fieldnames=fieldnames,
-        dialect=CSV_DIALECT,
-    )
-    writer.writeheader()
-    for d in dicts:
-        writer.writerow(d)
-    output_csv = csvfile_in_memory.getvalue()
-    del writer, csvfile_in_memory
-
-    with open(filename, 'w') as csvfile:
-        csvfile.write(output_csv)
-        del csvfile
-    logger.warning(
-        'Extended CSV "%s" from %d to %d lines',
-        filename,
-        original_num_dicts,
-        len(dicts),
-    )
-
-
-STANDARD_HEADERS = {
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0',
-    # "Accept": "application/json, text/plain, */*",
-    'Accept-Language': 'en-US,en;q=0.5',
-    'Accept-Encoding': 'gzip, deflate, br',
-}
-
-
-class CachedCfScrape(requests_cache.CacheMixin, cfscrape.CloudflareScraper):
-    pass
-
-
-def get_session(cookiejar, *, with_cfscrape: bool) -> requests.Session:
-    assert isinstance(with_cfscrape, bool)
-    session = CachedCfScrape('web_cache', cookies=cookiejar)
-    for cookie in cookiejar:
-        session.cookies.set_cookie(cookie)
-    return session
-
-
-def main(scraper_filter: frozenset[str]):
-    cookiejar = browsercookie.firefox()
-    logger.warning('Got cookiejar from firefox: %s cookies', len(cookiejar))
-
-    for scraper_cls in personal_data.data.Scraper.__subclasses__():
-        session = get_session(cookiejar, with_cfscrape=scraper_cls.requires_cfscrape())
-        scraper = scraper_cls(session)
-        if scraper_cls.__name__ not in scraper_filter:
-            continue
-        logger.warning(
-            'Running %s, appending to "%s"',
-            scraper_cls.__name__,
-            scraper.dataset_name,
-        )
-        result_rows = list()
-        try:
-            for result in scraper.scrape():
-                result_rows.append(result)
-                del result
-        except requests.exceptions.HTTPError:
-            logger.exception('Failed in running %s', scraper_cls.__name__)
-            continue
-        extend_csv_file(
-            'output/' + scraper.dataset_name,
-            result_rows,
-            deduplicate_mode=scraper.deduplicate_mode,
-            deduplicate_ignore_columns=scraper.deduplicate_ignore_columns,
-        )
-        logger.warning('Scraper done: %s', scraper.dataset_name)
-        del scraper, session
diff --git a/personal_data/__main__.py b/personal_data/__main__.py
index 850acf0..6a6f82c 100644
--- a/personal_data/__main__.py
+++ b/personal_data/__main__.py
@@ -1,4 +1,4 @@
-import personal_data
+import personal_data.main
 import argparse
 
 def parse_arguments():
@@ -9,7 +9,7 @@ def parse_arguments():
 def main():
     args = parse_arguments()
     scraper_filter = frozenset(args.fetchers)
-    personal_data.main(scraper_filter)
+    personal_data.main.main(scraper_filter)
 
 if __name__ == '__main__':
     main()
diff --git a/personal_data/mailgun.py b/personal_data/mailgun.py
new file mode 100644
index 0000000..cb65cd3
--- /dev/null
+++ b/personal_data/mailgun.py
@@ -0,0 +1,31 @@
+import requests
+
+import personal_data.secrets as secrets
+
+MAILGUN_API_ENDPOINT = 'https://api.mailgun.net/v3/{mailgun_domain}/messages'
+
+FROM_MAIL_NAME = 'Personal Scrapers'
+FROM_MAIL_USERNAME = 'scrapers'
+
+
+def send_email(session: requests.Session, subject: str, text: str):
+    assert isinstance(session, requests.Session)
+
+    assert subject != ''
+    assert text != ''
+
+    data = {
+        'from': f'{FROM_MAIL_NAME} <{FROM_MAIL_USERNAME}@{secrets.MAILGUN_DOMAIN}>',
+        'to': [secrets.MAILGUN_RECIPIENT],
+        'subject': subject,
+        'text': text,
+    }
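+    # Mailgun's v3 messages endpoint takes HTTP Basic auth with the
+    # fixed username 'api' and the account API key as the password;
+    # the message itself is sent as plain form data.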
+    url = MAILGUN_API_ENDPOINT.format(mailgun_domain=secrets.MAILGUN_DOMAIN)
+    response = session.post(
+        url,
+        auth=('api', secrets.MAILGUN_API_KEY),
+        data=data,
+    )
+    response.raise_for_status()
+    return response
+
diff --git a/personal_data/main.py b/personal_data/main.py
new file mode 100644
index 0000000..6dc265c
--- /dev/null
+++ b/personal_data/main.py
@@ -0,0 +1,202 @@
+import csv
+import datetime
+import io
+import logging
+
+import browsercookie
+import requests
+import requests_cache
+from frozendict import frozendict
+
+try:
+    import cfscrape
+except ImportError:
+    cfscrape = None
+
+logger = logging.getLogger(__name__)
+
+import personal_data.data
+import personal_data.fetchers.crunchyroll
+import personal_data.fetchers.ffxiv_lodestone
+import personal_data.fetchers.playstation
+import personal_data.fetchers.psnprofiles
+import personal_data.fetchers.partisia_blockchain
+from personal_data._version import __version__
+
+import personal_data.mailgun as mailgun
+import personal_data.secrets as secrets
+
+CSV_DIALECT = 'one_true_dialect'
+csv.register_dialect(CSV_DIALECT, lineterminator='\n', skipinitialspace=True)
+
+logging.basicConfig()
+logger.setLevel('INFO')
+
+
+def try_value(fn, s: str) -> object:
+    try:
+        return fn(s)
+    except ValueError:
+        return None
+
+
+def to_value(s: str) -> object:
+    s = s.strip()
+    if len(s) == 0:
+        return None
+    if (v := try_value(int, s)) is not None:  # explicit None check: int('0') is falsy
+        return v
+    if v := try_value(datetime.date.fromisoformat, s):
+        return v
+    if v := try_value(datetime.datetime.fromisoformat, s):
+        return v
+    if s.lower() == 'false':
+        return False
+    if s.lower() == 'true':
+        return True
+    if s.lower() == 'none':
+        return None
+    return s
+
+
+def extend_csv_file(
+    filename: str,
+    new_dicts: list[dict],
+    deduplicate_mode: personal_data.data.DeduplicateMode,
+    deduplicate_ignore_columns: list[str],
+) -> dict:
+    dicts = []
+    try:
+        with open(filename) as csvfile:
+            reader = csv.DictReader(csvfile, dialect=CSV_DIALECT)
+            for row in reader:
+                for k in list(row.keys()):
+                    row[k] = to_value(row[k])
+                    if row[k] is None:
+                        del row[k]
+                dicts.append(frozendict(row))
+            del csvfile
+    except FileNotFoundError:
+        logger.info('Creating file: %s', filename)
+
+    original_num_dicts = len(dicts)
+    dicts += [frozendict(d) for d in new_dicts]
+    del new_dicts
+
+    fieldnames = []
+    for d in dicts:
+        for k in d.keys():
+            if k not in fieldnames:
+                fieldnames.append(k)
+            del k
+        del d
+
+    def equals_without_fields(a, b, fields = []):
+        a = dict(a)
+        b = dict(b)
+
+        for f in fields:
+            del a[f], b[f]
+
+        return frozendict(a) == frozendict(b)
+
+
+    if deduplicate_mode == personal_data.data.DeduplicateMode.ONLY_LATEST:
+        while len(dicts) >= 2 and equals_without_fields(dicts[-1], dicts[-2], deduplicate_ignore_columns):
+            del dicts[-1]
+    elif deduplicate_mode != personal_data.data.DeduplicateMode.NONE:
+        dicts = set(dicts)
+
+
+    dicts = sorted(dicts, key=lambda d: tuple(str(d.get(fn, '')) for fn in fieldnames))
+
+    csvfile_in_memory = io.StringIO()
+    writer = csv.DictWriter(
+        csvfile_in_memory,
+        fieldnames=fieldnames,
+        dialect=CSV_DIALECT,
+    )
+    writer.writeheader()
+    for d in dicts:
+        writer.writerow(d)
+    output_csv = csvfile_in_memory.getvalue()
+    del writer, csvfile_in_memory
+
+    with open(filename, 'w') as csvfile:
+        csvfile.write(output_csv)
+        del csvfile
+    logger.warning(
+        'Extended CSV "%s" from %d to %d lines',
+        filename,
+        original_num_dicts,
+        len(dicts),
+    )
+
+    return {
+        'extended': original_num_dicts != len(dicts),
+        'input_lines': original_num_dicts,
+        'output_lines': len(dicts),
+        'dicts': dicts,
+    }
+
+
+STANDARD_HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0',
+    # "Accept": "application/json, text/plain, */*",
+    'Accept-Language': 'en-US,en;q=0.5',
+    'Accept-Encoding': 'gzip, deflate, br',
+}
+
+
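+# Mixing requests_cache's CacheMixin into cfscrape's CloudflareScraper
+# yields a single session type that both clears Cloudflare's anti-bot
+# challenge and caches responses in the 'web_cache' backend. Note that
+# evaluating this class statement requires cfscrape to be installed,
+# even though the import above treats it as optional.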
+class CachedCfScrape(requests_cache.CacheMixin, cfscrape.CloudflareScraper):
+    pass
+
+
+def get_session(cookiejar, *, with_cfscrape: bool) -> requests.Session:
+    assert isinstance(with_cfscrape, bool)
+    session = CachedCfScrape('web_cache', cookies=cookiejar)
+    for cookie in cookiejar:
+        session.cookies.set_cookie(cookie)
+    return session
+
+
+def send_notification(session: requests.Session, scraper_name: str, latest_dict: frozendict):
+    mailgun.send_email(session, f'Updated {scraper_name}', repr(latest_dict))
+
+
+def main(scraper_filter: frozenset[str]):
+    cookiejar = browsercookie.firefox()
+    logger.warning('Got cookiejar from firefox: %s cookies', len(cookiejar))
+
+    for scraper_cls in personal_data.data.Scraper.__subclasses__():
+        session = get_session(cookiejar, with_cfscrape=scraper_cls.requires_cfscrape())
+        scraper = scraper_cls(session)
+        if scraper_cls.__name__ not in scraper_filter:
+            continue
+        logger.warning(
+            'Running %s, appending to "%s"',
+            scraper_cls.__name__,
+            scraper.dataset_name,
+        )
+        result_rows = list()
+        try:
+            for result in scraper.scrape():
+                result_rows.append(result)
+                del result
+        except requests.exceptions.HTTPError:
+            logger.exception('Failed in running %s', scraper_cls.__name__)
+            continue
+        status = extend_csv_file(
+            'output/' + scraper.dataset_name,
+            result_rows,
+            deduplicate_mode=scraper.deduplicate_mode,
+            deduplicate_ignore_columns=scraper.deduplicate_ignore_columns,
+        )
+        logger.warning('Scraper done: %s', scraper.dataset_name)
+
+        if status['extended']:
+            send_notification(session, scraper_cls.__name__, status['dicts'][-1])
+
+        del scraper, session
diff --git a/personal_data/secrets.py b/personal_data/secrets.py
new file mode 100644
index 0000000..6761f2d
--- /dev/null
+++ b/personal_data/secrets.py
@@ -0,0 +1,34 @@
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+ENV_KEY_PREFIX = 'CF_IV_'
+
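+# Each secret is resolved by first consulting an environment variable
+# named CF_IV_<KEY>, whose value is the path of a file holding the
+# secret, and otherwise falling back to ./secrets/<key> (lower-case).
+# A secret that cannot be read resolves to None instead of raising,
+# so callers must tolerate missing configuration.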
+def load_secret(env_key: str) -> str | None:
+    filepath = os.environ.get(ENV_KEY_PREFIX + env_key)
+    if filepath is None:
+        filepath = f'./secrets/{env_key.lower()}'
+    try:
+        with open(filepath) as f:
+            value = f.read().strip()
+        logger.info('Loaded secret: %s', env_key)
+        return value
+    except Exception:
+        logger.exception('Could not load secret %s', env_key)
+        return None
+
+
+# Crunchyroll
+CRUNCHYROLL_DEVICE_ID = load_secret('CRUNCHYROLL_DEVICE_ID')
+CRUNCHYROLL_AUTH = load_secret('CRUNCHYROLL_AUTH')
+
+# FFXIV
+FFXIV_CHARACTER_ID = load_secret('FFXIV_CHARACTER_ID')
+
+# Email configuration
+MAILGUN_API_KEY = load_secret('MAILGUN_API_KEY')
+MAILGUN_DOMAIN = load_secret('MAILGUN_DOMAIN')
+MAILGUN_RECIPIENT = load_secret('MAILGUN_RECIPIENT')
+
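+# Note: secrets are read once, at module import time; a process
+# restart is needed to pick up rotated secret files.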