diff --git a/README.md b/README.md
index 055cbc4..6dbcf96 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,10 @@ This is a collection of small fetchers for personal data spread around the inter
 ## Ideas for more fetchers
 
+- [ ] Final Fantasy XIV: Achievements & Mounts
 - [ ] Save data for Theatrhythm: Most played songs and liked songs.
 - [ ] YouTube (Music): Liked videos with title and URL.
 - [ ] PlayStation: Achievement dates, hours played and last played dates
+- [ ] Steam Wishlist and Achievements
+- [ ] fredagscafeen.dk
diff --git a/personal_data/__main__.py b/personal_data/__main__.py
index d5c2543..ec4dd95 100644
--- a/personal_data/__main__.py
+++ b/personal_data/__main__.py
@@ -19,13 +19,6 @@ import personal_data.data
 CSV_DIALECT = 'one_true_dialect'
 csv.register_dialect(CSV_DIALECT, lineterminator = '\n', skipinitialspace = True)
 
-def determine_scrapers():
-    scrapers = []
-    #scrapers += personal_data.fetchers.playstation.SCRAPERS
-    scrapers += personal_data.fetchers.crunchyroll.SCRAPERS
-    scrapers += personal_data.fetchers.psnprofiles.SCRAPERS
-    return scrapers
-
 def try_value(fn, s: str) -> any:
     try:
         return fn(s)
@@ -111,15 +104,18 @@ def main():
     for cookie in cookiejar:
         session.cookies.set_cookie(cookie)
 
-    for scraper in determine_scrapers():
+    for scraper_cls in personal_data.data.Scraper.__subclasses__():
+        scraper = scraper_cls(session)
+        del scraper_cls
         logger.warning('Running scraper: %s', scraper.dataset_name)
         result_rows = list()
-        for result in scraper.scraper(session):
+        for result in scraper.scrape():
             result_rows.append(result)
             del result
         extend_csv_file('output/'+scraper.dataset_name, result_rows, deduplicate_mode = scraper.deduplicate_mode)
         logger.warning('Scraper done: %s', scraper.dataset_name)
+        del scraper
 
 if __name__ == '__main__':
     main()
diff --git a/personal_data/data.py b/personal_data/data.py
index 6c193db..94da2b2 100644
--- a/personal_data/data.py
+++ b/personal_data/data.py
@@ -1,15 +1,32 @@
 import dataclasses
+import requests
 from enum import Enum
+import abc
 
 class DeduplicateMode(Enum):
     NONE = 0
     BY_FIRST_COLUMN = 1
     BY_ALL_COLUMNS = 2
 
-@dataclasses.dataclass
-class Scraper:
-    scraper: object # TODO: Callable
-    dataset_name: str
-    deduplicate_mode: DeduplicateMode
-    dataset_format: str = 'list-of-dicts'
+@dataclasses.dataclass(frozen = True)
+class Scraper(abc.ABC):
+    session: requests.Session
+
+    @property
+    @abc.abstractmethod
+    def dataset_name(self) -> str:
+        pass
+
+    @property
+    @abc.abstractmethod
+    def deduplicate_mode(self) -> DeduplicateMode:
+        pass
+
+    @property
+    def dataset_format(self) -> str:
+        return 'list-of-dicts'
+
+    @abc.abstractmethod
+    def scrape(self):
+        pass
diff --git a/personal_data/fetchers/crunchyroll.py b/personal_data/fetchers/crunchyroll.py
index 4118c12..1b359f4 100644
--- a/personal_data/fetchers/crunchyroll.py
+++ b/personal_data/fetchers/crunchyroll.py
@@ -1,6 +1,7 @@
 import secrets
 import functools
 import logging
+import dataclasses
 
 from personal_data.data import Scraper, DeduplicateMode
 
@@ -11,60 +12,61 @@ API_URL_TOKEN = API_ROOT + '/auth/v1/token'
 API_URL_ME = API_ROOT + '/accounts/v1/me'
 API_URL_WATCH_LIST = API_ROOT + '/content/v2/{account_uuid}/watch-history?page_size=100&locale=en-US'
 
-def scrape_watched_last(session):
-    headers = {
-        'Referer': 'https://www.crunchyroll.com/history',
-        'Authorization': secrets.CRUNCHYROLL_AUTH, # TODO: Determine automatically
-    }
+@dataclasses.dataclass(frozen = True)
+class CrunchyrollScraper(Scraper):
+    dataset_name = 'episodes_watched_crunchyroll'
+    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS
 
-    # Request to get account UUID
-    logger.info('Getting Access Token')
-    response = session.post(API_URL_TOKEN, headers = headers, cookies = session.cookies, data = {
-        "device_id": secrets.CRUNCHYROLL_DEVICE_ID, # TODO: Determine automatically
-        "device_type": "Firefox on Linux",
-        "grant_type": "etp_rt_cookie"
-    })
-
-    response.raise_for_status()
-    data_me = response.json()
-    headers['Authorization'] = '{} {}'.format(data_me['token_type'], data_me['access_token'])
-    account_uuid = data_me['account_id']
-
-    logger.info(' Account UUID: %s', account_uuid)
-
-    # Request to get watch history
-    logger.info('Getting Watchlist')
-    response = session.get(API_URL_WATCH_LIST.format(account_uuid = account_uuid), headers = headers)
-    response.raise_for_status()
-
-    # Parse data
-    episodes_data = response.json()['data']
-    logger.info(' Watchlist length: %d', len(episodes_data))
-
-    for episode_data in episodes_data:
-        yield {
-            # Sorting fields
-            'datetime_played': episode_data['date_played'],
-
-            # Important fields
-            'series.title': episode_data['panel']['episode_metadata']['series_title'],
-            'season.number': episode_data['panel']['episode_metadata']['season_number'],
-            'episode.number': episode_data['panel']['episode_metadata']['episode'],
-            'episode.name': episode_data['panel']['title'],
-
-            # Secondary fields
-            'episode.language': episode_data['panel']['episode_metadata']['audio_locale'],
-            'episode.duration_ms': episode_data['panel']['episode_metadata']['duration_ms'],
-            'episode.maturity_ratings': ' '.join(episode_data['panel']['episode_metadata']['maturity_ratings']),
-            'season.title': episode_data['panel']['episode_metadata']['season_title'],
-            'fully_watched': episode_data['fully_watched'],
-
-            # Identifiers
-            'episode.crunchyroll_id': episode_data['id'],
-            'series.crunchyroll_id': episode_data['parent_id'],
+    def scrape(self):
+        headers = {
+            'Referer': 'https://www.crunchyroll.com/history',
+            'Authorization': secrets.CRUNCHYROLL_AUTH, # TODO: Determine automatically
         }
 
-SCRAPERS = [
-    Scraper(scrape_watched_last, 'episodes_watched_crunchyroll',
-            deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS)
-]
+        # Request to get account UUID
+        logger.info('Getting Access Token')
+        response = self.session.post(API_URL_TOKEN, headers = headers, cookies = self.session.cookies, data = {
+            "device_id": secrets.CRUNCHYROLL_DEVICE_ID, # TODO: Determine automatically
+            "device_type": "Firefox on Linux",
+            "grant_type": "etp_rt_cookie"
+        })
+
+        response.raise_for_status()
+        data_me = response.json()
+        headers['Authorization'] = '{} {}'.format(data_me['token_type'], data_me['access_token'])
+        account_uuid = data_me['account_id']
+
+        logger.info(' Account UUID: %s', account_uuid)
+
+        # Request to get watch history
+        logger.info('Getting Watchlist')
+        response = self.session.get(API_URL_WATCH_LIST.format(account_uuid = account_uuid), headers = headers)
+        response.raise_for_status()
+
+        # Parse data
+        episodes_data = response.json()['data']
+        logger.info(' Watchlist length: %d', len(episodes_data))
+
+        for episode_data in episodes_data:
+            yield {
+                # Sorting fields
+                'datetime_played': episode_data['date_played'],
+
+                # Important fields
+                'series.title': episode_data['panel']['episode_metadata']['series_title'],
+                'season.number': episode_data['panel']['episode_metadata']['season_number'],
+                'episode.number': episode_data['panel']['episode_metadata']['episode'],
+                'episode.name': episode_data['panel']['title'],
+
+                # Secondary fields
+                'episode.language': episode_data['panel']['episode_metadata']['audio_locale'],
+                'episode.duration_ms': episode_data['panel']['episode_metadata']['duration_ms'],
+                'episode.maturity_ratings': ' '.join(episode_data['panel']['episode_metadata']['maturity_ratings']),
+                'season.title': episode_data['panel']['episode_metadata']['season_title'],
+                'fully_watched': episode_data['fully_watched'],
+
+                # Identifiers
+                'episode.crunchyroll_id': episode_data['id'],
+                'series.crunchyroll_id': episode_data['parent_id'],
+            }
diff --git a/personal_data/fetchers/playstation.py b/personal_data/fetchers/playstation.py
index 1aefaf6..f25e9b5 100644
--- a/personal_data/fetchers/playstation.py
+++ b/personal_data/fetchers/playstation.py
@@ -57,8 +57,10 @@ def scrape_played_last(session):
             'game.icon': game_data['image']['url'],
         }
 
+'''
 SCRAPERS = [
     Scraper(scrape_played_last, 'games_played_playstation',
             deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS)
 ]
+'''
diff --git a/personal_data/fetchers/psnprofiles.py b/personal_data/fetchers/psnprofiles.py
index dc60285..82ead2b 100644
--- a/personal_data/fetchers/psnprofiles.py
+++ b/personal_data/fetchers/psnprofiles.py
@@ -1,5 +1,6 @@
 import secrets
 import functools
+import dataclasses
 import re
 import logging
 import bs4
@@ -26,89 +27,88 @@ assert game_psnprofiles_id_from_url('/trophies/21045-theatrhythm-final-bar-line/
 assert game_psnprofiles_id_from_url('/trophies/21045-theatrhythm-final-bar-line/Jmaanes')
 assert game_psnprofiles_id_from_url('/trophy/21045-theatrhythm-final-bar-line/19-seasoned-hunter')
 
-def scrape_personal_page(session):
-    # Request to get watch history
-    logger.info('Getting Watchlist')
-    url = URL_PROFILE.format(psn_id = secrets.PLAYSTATION_PSN_ID)
-    response = session.get(url)
-    response.raise_for_status()
+@dataclasses.dataclass(frozen = True)
+class PsnProfilesScraper(Scraper):
+    dataset_name = 'games_played_playstation'
+    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS
 
-    NOW = datetime.datetime.strptime(response.headers['Date'], FORMAT_DATE_HEADER)
+    def scrape(self):
+        # Request the personal profile page
+        logger.info('Getting PSN profile')
+        url = URL_PROFILE.format(psn_id = secrets.PLAYSTATION_PSN_ID)
+        response = self.session.get(url)
+        response.raise_for_status()
 
-    # Parse data
-    soup = bs4.BeautifulSoup(response.content, 'lxml')
-    soup = personal_data.html_util.normalize_soup_slightly(soup, classes = False)
+        NOW = datetime.datetime.strptime(response.headers['Date'], FORMAT_DATE_HEADER)
 
-    # Recent trophies.
-    soup_recent_tropies = soup.select('ul#recent-trophies > li')
-    assert len(soup_recent_tropies) > 0, url
-    for row in soup_recent_tropies:
-        cells = row.select_one('.info .box td').find_all('div')
+        # Parse data
+        soup = bs4.BeautifulSoup(response.content, 'lxml')
+        soup = personal_data.html_util.normalize_soup_slightly(soup, classes = False)
 
-        trophy_name = cells[0].get_text().strip()
-        trophy_desc = cells[1].get_text().strip()
-        game_name = cells[2].a.extract().get_text().strip()
+        # Recent trophies.
+        soup_recent_trophies = soup.select('ul#recent-trophies > li')
+        assert len(soup_recent_trophies) > 0, url
+        for row in soup_recent_trophies:
+            cells = row.select_one('.info .box td').find_all('div')
 
-        psnprofiles_id = game_psnprofiles_id_from_url(cells[0].find('a')['href'])
-        trophy_icon = row.find(class_='icon').find('img')['src']
+            trophy_name = cells[0].get_text().strip()
+            trophy_desc = cells[1].get_text().strip()
+            game_name = cells[2].a.extract().get_text().strip()
 
-        gotten_at = cells[2].get_text().strip().removesuffix(' in').removesuffix(' ago')
-        gotten_at = personal_data.parse_util.parse_duration(gotten_at)
-        time_acquired = NOW - gotten_at
+            psnprofiles_id = game_psnprofiles_id_from_url(cells[0].find('a')['href'])
+            trophy_icon = row.find(class_='icon').find('img')['src']
 
-        yield {
-            'game.name' : game_name,
-            'me.last_played_time': time_acquired,
+            gotten_at = cells[2].get_text().strip().removesuffix(' in').removesuffix(' ago')
+            gotten_at = personal_data.parse_util.parse_duration(gotten_at)
+            time_acquired = NOW - gotten_at
 
-            # Trophy Data
-            'trophy.name': trophy_name,
-            'trophy.desc': trophy_desc,
-            'trophy.icon': trophy_icon,
-            'psnprofiles.game_id': psnprofiles_id,
-        }
+            yield {
+                'game.name': game_name,
+                'me.last_played_time': time_acquired,
 
-    del row, cells, time_acquired
-
-    # Games table
-    table_rows = soup.find(id = 'gamesTable').find_all('tr')
-    assert len(table_rows) > 0, url
-
-    for row in table_rows:
-        cells = row.find_all('td')
-
-        # Check for pagination
-        if re.match(r'show \d+ more games', cells[0].get_text().strip(), re.IGNORECASE):
-            break
-
-        game_name = cells[1].find(class_ = 'title').get_text()
-        psnprofiles_id = game_psnprofiles_id_from_url(cells[0].find('a')['href'])
-        game_icon = cells[0].find('img')['src']
-
-        game_name = row.select_one('.title').get_text()
-        game_platform = row.select_one('.platform').get_text()
-
-        small_infos = cells[1].find_all('div')
-        if len(small_infos) > 2:
-            time_played_div = small_infos[2]
-            time_played_div.sup.extract()
-            time_played = datetime.datetime.strptime(time_played_div.get_text().strip(), FORMAT_DAY_MONTH_YEAR).date()
-        else:
-            time_played = None
-
-        d = {
-            # Important fields
-            'game.name': game_name,
-
-            # Secondary fields
-            'game.platform': game_platform,
-            'game.icon': game_icon,
-            'psnprofiles.game_id': psnprofiles_id,
-        }
-        if time_played:
-            d['me.last_played_time'] = time_played
-        yield d
-
-SCRAPERS = [
-    Scraper(scrape_personal_page, 'games_played_playstation',
-            deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS)
-]
+                # Trophy Data
+                'trophy.name': trophy_name,
+                'trophy.desc': trophy_desc,
+                'trophy.icon': trophy_icon,
+                'psnprofiles.game_id': psnprofiles_id,
+            }
+
+        del row, cells, time_acquired
+
+        # Games table
+        table_rows = soup.find(id = 'gamesTable').find_all('tr')
+        assert len(table_rows) > 0, url
+
+        for row in table_rows:
+            cells = row.find_all('td')
+
+            # Check for pagination
+            if re.match(r'show \d+ more games', cells[0].get_text().strip(), re.IGNORECASE):
+                break
+
+            psnprofiles_id = game_psnprofiles_id_from_url(cells[0].find('a')['href'])
+            game_icon = cells[0].find('img')['src']
+
+            game_name = row.select_one('.title').get_text()
+            game_platform = row.select_one('.platform').get_text()
+
+            small_infos = cells[1].find_all('div')
+            if len(small_infos) > 2:
+                time_played_div = small_infos[2]
+                time_played_div.sup.extract()
+                time_played = datetime.datetime.strptime(time_played_div.get_text().strip(), FORMAT_DAY_MONTH_YEAR).date()
+            else:
+                time_played = None
+
+            d = {
+                # Important fields
+                'game.name': game_name,
+
+                # Secondary fields
+                'game.platform': game_platform,
+                'game.icon': game_icon,
+                'psnprofiles.game_id': psnprofiles_id,
+            }
+            if time_played:
+                d['me.last_played_time'] = time_played
+            yield d
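
Note: with this refactor, __main__ discovers fetchers through Scraper.__subclasses__() instead of the hand-maintained SCRAPERS lists, so a new fetcher only has to subclass Scraper and be imported somewhere (Python's __subclasses__() only sees classes whose defining modules have actually been imported). A rough sketch of what one of the README ideas could look like under the new interface follows; the class name, dataset name, URL and JSON shape are illustrative placeholders, not part of this change:

import dataclasses
import logging

from personal_data.data import DeduplicateMode, Scraper

logger = logging.getLogger(__name__)

@dataclasses.dataclass(frozen = True)
class SteamAchievementsScraper(Scraper):
    # Hypothetical dataset name; it doubles as the CSV filename under output/.
    dataset_name = 'achievements_steam'
    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    def scrape(self):
        # Placeholder endpoint: a real implementation would call the Steam Web API
        # with an API key and steamid instead of this made-up URL.
        logger.info('Getting Steam achievements')
        response = self.session.get('https://example.invalid/steam/achievements.json')
        response.raise_for_status()
        for achievement in response.json()['achievements']:  # assumed payload shape
            yield {
                'game.name': achievement['game_name'],
                'achievement.name': achievement['name'],
                'achievement.unlock_time': achievement['unlock_time'],
            }

Because Scraper is a frozen dataclass whose only field is the session, __main__ can instantiate every subclass uniformly as scraper_cls(session) and write the rows to output/achievements_steam with no further registration.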