import csv
import io
import logging

import browsercookie
import cfscrape
import requests_cache
from frozendict import frozendict

logger = logging.getLogger(__name__)

import personal_data.fetchers.playstation
import personal_data.fetchers.crunchyroll
import personal_data.fetchers.psnprofiles


def determine_scrapers():
    """Collect the scraper definitions from the enabled fetcher modules."""
    scrapers = []
    #scrapers += personal_data.fetchers.playstation.SCRAPERS
    scrapers += personal_data.fetchers.crunchyroll.SCRAPERS
    scrapers += personal_data.fetchers.psnprofiles.SCRAPERS
    return scrapers


def extend_csv_file(filename, new_dicts, deduplicate=False):
    """Append new_dicts to the CSV at filename, optionally deduplicating rows.

    Existing rows are read back as frozendicts so they are hashable and can
    be deduplicated through a set. The union of all keys across old and new
    rows becomes the header.
    """
    dicts = []
    try:
        with open(filename, 'r') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                dicts.append(frozendict(row))
    except FileNotFoundError:
        logger.info('Creating file: %s', filename)

    original_num_dicts = len(dicts)
    dicts += [frozendict(d) for d in new_dicts]
    del new_dicts

    # The header is the union of all keys, in first-seen order.
    fieldnames = []
    for d in dicts:
        for k in d.keys():
            if k not in fieldnames:
                fieldnames.append(k)

    if deduplicate:
        # Sort by the first column; .get guards against rows missing it.
        dicts = sorted(set(dicts), key=lambda d: d.get(fieldnames[0], ''))

    # Serialize to memory first, so the file is only truncated once the new
    # contents have been produced successfully.
    csvfile_in_memory = io.StringIO()
    writer = csv.DictWriter(csvfile_in_memory, fieldnames=fieldnames)
    writer.writeheader()
    for d in dicts:
        writer.writerow(d)
    output_csv = csvfile_in_memory.getvalue()
    del writer, csvfile_in_memory

    with open(filename, 'w') as csvfile:
        csvfile.write(output_csv)

    logger.warning('Extended CSV "%s" from %d to %d lines',
                   filename, original_num_dicts, len(dicts))


STANDARD_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0',
    #"Accept": "application/json, text/plain, */*",
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
}


def main():
    # Reuse the logged-in Firefox session's cookies so scrapers can reach
    # pages behind authentication.
    cookiejar = browsercookie.firefox()
    logger.warning('Got cookiejar from firefox: %s cookies', len(cookiejar))
    #session = requests_cache.CachedSession('web_cache', cookies = cookiejar)
    session = cfscrape.create_scraper()
    for cookie in cookiejar:
        session.cookies.set_cookie(cookie)
    for scraper in determine_scrapers():
        logger.warning('Running scraper: %s', scraper.dataset_name)
        result_rows = list(scraper.scraper(session))
        extend_csv_file('output/' + scraper.dataset_name, result_rows,
                        deduplicate=scraper.deduplicate)
        logger.warning('Scraper done: %s', scraper.dataset_name)


if __name__ == '__main__':
    main()
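
# --- Illustrative sketch, not wired into the pipeline above. ---
# main() only assumes that each entry in a fetcher module's SCRAPERS list
# exposes .dataset_name, .scraper and .deduplicate. A minimal fetcher module
# could therefore look like the following; the dataclass, function, URL and
# field names below are hypothetical, invented for illustration rather than
# taken from the real personal_data.fetchers.* modules.
import dataclasses
from collections.abc import Callable, Iterator


@dataclasses.dataclass(frozen=True)
class ExampleScraper:
    dataset_name: str  # output CSV filename under output/
    scraper: Callable  # generator taking a requests-style session
    deduplicate: bool  # forwarded to extend_csv_file


def _scrape_example(session) -> Iterator[dict]:
    # Hypothetical endpoint; rows are yielded as plain dicts of strings,
    # matching what extend_csv_file expects.
    response = session.get('https://example.com/data')
    response.raise_for_status()
    yield {'fetched_at': response.headers.get('Date', ''),
           'length': str(len(response.text))}


EXAMPLE_SCRAPERS = [
    ExampleScraper(dataset_name='example.csv', scraper=_scrape_example,
                   deduplicate=True),
]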