From 87e9d4548c2d8e2cead8a2ec23b786e386f6a778 Mon Sep 17 00:00:00 2001
From: Jon Michael Aanes
Date: Sat, 6 Apr 2024 18:21:56 +0200
Subject: [PATCH] Improved psnprofiles

---
 personal_data/__init__.py             | 15 +++++--
 personal_data/fetchers/psnprofiles.py | 62 ++++++++++++++++++++++++++-
 2 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/personal_data/__init__.py b/personal_data/__init__.py
index 0a77adc..c12fde0 100644
--- a/personal_data/__init__.py
+++ b/personal_data/__init__.py
@@ -134,19 +134,26 @@ def main():
     cookiejar = browsercookie.firefox()
     logger.warning('Got cookiejar from firefox: %s cookies', len(cookiejar))
 
+    scraper_filter = {'PsnProfilesScraper'}
+
     for scraper_cls in personal_data.data.Scraper.__subclasses__():
         session = get_session(cookiejar, with_cfscrape=scraper_cls.requires_cfscrape())
         scraper = scraper_cls(session)
+        if scraper_cls.__name__ not in scraper_filter:
+            continue
         logger.warning(
             'Running %s, appending to "%s"',
             scraper_cls.__name__,
             scraper.dataset_name,
         )
-        del scraper_cls
         result_rows = list()
-        for result in scraper.scrape():
-            result_rows.append(result)
-            del result
+        try:
+            for result in scraper.scrape():
+                result_rows.append(result)
+                del result
+        except requests.exceptions.HTTPError:
+            logger.exception('Failed in running %s', scraper_cls.__name__)
+            continue
         extend_csv_file(
             'output/' + scraper.dataset_name,
             result_rows,
diff --git a/personal_data/fetchers/psnprofiles.py b/personal_data/fetchers/psnprofiles.py
index 2528a81..7484feb 100644
--- a/personal_data/fetchers/psnprofiles.py
+++ b/personal_data/fetchers/psnprofiles.py
@@ -1,5 +1,6 @@
 import dataclasses
 import datetime
+from collections.abc import Iterator
 import logging
 import re
 import secrets
@@ -13,6 +14,7 @@ from personal_data.data import DeduplicateMode, Scraper
 logger = logging.getLogger(__name__)
 
 URL_PROFILE = 'https://psnprofiles.com/{psn_id}'
+URL_USER_GAME_TROPHIES = 'https://psnprofiles.com/trophies/{game_id}/{psn_id}'
 
 FORMAT_DAY_MONTH_YEAR = '%d %B %Y'
 
@@ -32,6 +34,13 @@ assert game_psnprofiles_id_from_url(
     '/trophy/21045-theatrhythm-final-bar-line/19-seasoned-hunter',
 )
 
+def parse_time(text: str) -> datetime.datetime:
+    text = text.replace('\n', ' ')
+    text = text.strip()
+    return datetime.datetime.strptime(text, '%d %b %Y %I:%M:%S %p')
+
+assert parse_time('06 Apr 2024 06:11:42 PM')
+assert parse_time('26 Mar 2024 7:07:01 PM')
 @dataclasses.dataclass(frozen=True)
 class PsnProfilesScraper(Scraper):
@@ -43,8 +52,17 @@ class PsnProfilesScraper(Scraper):
         return True
 
     def scrape(self):
-        # Request to get watch history
-        logger.info('Getting Watchlist')
+        games_rows = list(self.scrape_games_overview())
+        games_ids = {row['psnprofiles.game_id']: row['game.name'] for row in games_rows}
+
+        yield from games_rows
+        for game_id, game_name in games_ids.items():
+            yield from self.scrape_game_trophies(game_id, game_name)
+            del game_id
+
+    def scrape_games_overview(self) -> Iterator[dict]:
+        # Request to get overview
+        logger.info('Getting Overview')
         url = URL_PROFILE.format(psn_id=secrets.PLAYSTATION_PSN_ID)
         response = self.session.get(url)
         response.raise_for_status()
@@ -130,3 +148,43 @@ class PsnProfilesScraper(Scraper):
             if time_played:
                 d['me.last_played_time'] = time_played
             yield d
+
+    def scrape_game_trophies(self, psnprofiles_id: str, game_name: str) -> Iterator[dict]:
+        logger.info('Getting Game Trophies %s', psnprofiles_id)
+
+        url = URL_USER_GAME_TROPHIES.format(psn_id=secrets.PLAYSTATION_PSN_ID,
+                                            game_id=psnprofiles_id)
+        response = self.session.get(url)
+        response.raise_for_status()
+
+        # Parse data
+        soup = bs4.BeautifulSoup(response.content, 'lxml')
+        soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False)
+
+        # Recent trophies.
+        soup_trophies = soup.select('#content table.zebra tr.completed')
+        assert len(soup_trophies) > 0, url
+        for row in soup_trophies:
+            cells = row.find_all('td')
+
+            trophy_name_a = cells[1].a
+            trophy_name = trophy_name_a.get_text().strip()
+            trophy_name_a.extract()
+            trophy_desc = cells[1].get_text().strip()
+
+            trophy_icon = cells[0].img['src']
+
+            cells[2].span.span.nobr.sup.extract()
+            gotten_at = parse_time(cells[2].get_text())
+
+            yield {
+                'game.name': game_name,
+                'me.last_played_time': gotten_at,
+                # Trophy Data
+                'trophy.name': trophy_name,
+                'trophy.desc': trophy_desc,
+                'trophy.icon': trophy_icon,
+                'psnprofiles.game_id': psnprofiles_id,
+            }
+
+        del row, cells
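
The inline assert parse_time(...) checks in the patch only verify that a truthy
datetime comes back. A standalone sketch (not part of the patch) that pins down
the expected values, assuming the same '%d %b %Y %I:%M:%S %p' format string:

    import datetime

    def parse_time(text: str) -> datetime.datetime:
        # Same behavior as the patched helper: collapse newlines, trim, parse.
        text = text.replace('\n', ' ').strip()
        return datetime.datetime.strptime(text, '%d %b %Y %I:%M:%S %p')

    # %I accepts both zero-padded and bare hours, so both sample inputs parse.
    assert parse_time('06 Apr 2024 06:11:42 PM') == datetime.datetime(2024, 4, 6, 18, 11, 42)
    assert parse_time('26 Mar 2024 7:07:01 PM') == datetime.datetime(2024, 3, 26, 19, 7, 1)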