From 3170d8e7a8bb41b6d3a01ca5725bdad19a24c700 Mon Sep 17 00:00:00 2001
From: Jon Michael Aanes
Date: Wed, 23 Oct 2024 21:29:53 +0200
Subject: [PATCH] PSN Profiles: Implemented pagination

---
 personal_data/fetchers/psnprofiles.py | 43 ++++++++++++++++++++++++---
 personal_data/main.py                 | 12 +++++---
 2 files changed, 47 insertions(+), 8 deletions(-)

diff --git a/personal_data/fetchers/psnprofiles.py b/personal_data/fetchers/psnprofiles.py
index 42d6fc4..eaad481 100644
--- a/personal_data/fetchers/psnprofiles.py
+++ b/personal_data/fetchers/psnprofiles.py
@@ -1,6 +1,7 @@
 import dataclasses
+import datetime
 import logging
 import re
 from collections.abc import Iterator
 
 import bs4
@@ -17,6 +18,7 @@ logger = logging.getLogger(__name__)
 URL_API_ROOT = 'https://psnprofiles.com/'
 URL_PROFILE = URL_API_ROOT + '{psn_id}'
 URL_USER_GAME_TROPHIES = URL_API_ROOT + 'trophies/{game_id}/{psn_id}'
+URL_GAMES_OVERVIEW = URL_API_ROOT + '{psn_id}'
 
 
 def game_psnprofiles_id_from_url(relative_url: str) -> int:
@@ -42,6 +44,8 @@ class PsnProfilesScraper(Scraper):
         games_rows = list(self._scrape_games_overview())
         games_ids = {row['psnprofiles.game_id']: row['game.name'] for row in games_rows}
 
+        logger.info('Found %d games from overview', len(games_rows))
+
         SCRAPE_FROM_OVERVIEW = False
         if SCRAPE_FROM_OVERVIEW:
             yield from games_rows
@@ -59,24 +63,50 @@ class PsnProfilesScraper(Scraper):
             self.session,
             URL_API_ROOT,
             per_minute=5,
+            expire_after=datetime.timedelta(hours=1),
+        )
+        requests_util.setup_limiter(
+            self.session,
+            URL_API_ROOT + 'trophies/',
+            expire_after=datetime.timedelta(days=14),
         )
 
     def _scrape_games_overview(self) -> Iterator[dict]:
+        # Fetch successive overview pages until a page contains no games.
+        for page_num in range(1, 1000):
+            logger.info('Getting Overview (page %d)', page_num)
+            url = URL_GAMES_OVERVIEW.format(psn_id=secrets.PLAYSTATION_PSN_ID)
+            response = self.session.get(url, params={'page': page_num})
+            if 'page' not in response.url:
+                msg = 'Configuration error? psnprofiles.com issued a redirect. This is possibly because your profile name wasn\'t exactly as expected. Please check it.'
+                raise RuntimeError(msg)
+            response.raise_for_status()
+            soup = bs4.BeautifulSoup(response.text, 'lxml')
+            soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False)
+            games_on_page = list(self._iterate_games_from_games_table(soup))
+            yield from games_on_page
+            if len(games_on_page) == 0:
+                return
+
+    def _scrape_games_overview_old(self) -> Iterator[dict]:
         # Request to get overview
         logger.info('Getting Overview')
         url = URL_PROFILE.format(psn_id=secrets.PLAYSTATION_PSN_ID)
         response = self.session.get(url)
         response.raise_for_status()
 
-        NOW = parse_util.parse_response_datetime(response)
+        now = parse_util.parse_response_datetime(response)
 
         # Parse data
         soup = bs4.BeautifulSoup(response.content, 'lxml')
         soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False)
 
-        # Recent trophies.
+        yield from self._iterate_games_from_recent_trophies(soup, now)
+        yield from self._iterate_games_from_games_table(soup)
+
+    def _iterate_games_from_recent_trophies(self, soup, now) -> Iterator[dict]:
         soup_recent_tropies = soup.select('ul#recent-trophies > li')
-        assert len(soup_recent_tropies) > 0, url
+        assert len(soup_recent_tropies) > 0
 
         for row in soup_recent_tropies:
             cells = row.select_one('.info .box td').find_all('div')
@@ -91,7 +121,7 @@ class PsnProfilesScraper(Scraper):
                 cells[2].get_text().strip().removesuffix(' in').removesuffix(' ago')
             )
             gotten_at = parse_util.parse_duration(gotten_at)
-            time_acquired = NOW - gotten_at
+            time_acquired = now - gotten_at
 
             yield {
                 'game.name': game_name,
@@ -105,10 +135,15 @@ class PsnProfilesScraper(Scraper):
 
             del row, cells, time_acquired
 
+    def _iterate_games_from_games_table(self, soup) -> Iterator[dict]:
         # Games table
         table_rows = soup.find(id='gamesTable').find_all('tr')
         assert len(table_rows) > 0, url
 
+        if title := table_rows[0].h2:
+            if title.get_text().strip() == 'No games found':
+                return
+
         for row in table_rows:
             cells = row.find_all('td')
 
diff --git a/personal_data/main.py b/personal_data/main.py
index 150860f..b7d85d6 100644
--- a/personal_data/main.py
+++ b/personal_data/main.py
@@ -54,12 +54,16 @@ def get_session(
     ignore_cache: bool,
 ) -> requests.Session:
     assert isinstance(with_cfscrape, bool)
-    session_class = requests_cache.CachedSession
-    if ignore_cache:
-        logger.warning('HTTP cache disabled')
-        return requests.Session()
     if cfscrape:
         session_class = CachedCfScrape
+        if ignore_cache:
+            logger.warning('HTTP cache disabled')
+            return cfscrape.create_scraper()
+    else:
+        session_class = requests_cache.CachedSession
+        if ignore_cache:
+            logger.warning('HTTP cache disabled')
+            return requests.Session()
     session = session_class(
         OUTPUT_PATH / 'web_cache',
         cookies=cookiejar,
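
Notes on the approach:

The new _scrape_games_overview pagination reduces to: request ?page=N for
N = 1, 2, ... and stop at the first page whose games table yields no rows.
A minimal standalone sketch of that shape, assuming a caller-supplied
parse_page callback in place of the scraper's BeautifulSoup table parsing
(paginate and parse_page are illustrative names, not identifiers from this
codebase):

    from collections.abc import Callable, Iterator

    import requests

    def paginate(
        session: requests.Session,
        url: str,
        parse_page: Callable[[str], list[dict]],
    ) -> Iterator[dict]:
        """Yield rows from ?page=1, ?page=2, ... until a page comes up empty."""
        for page_num in range(1, 1000):  # hard upper bound, as in the patch
            response = session.get(url, params={'page': page_num})
            response.raise_for_status()
            rows = parse_page(response.text)  # hypothetical stand-in parser
            yield from rows
            if not rows:  # an empty page marks the end of the listing
                return

The redirect guard ('page' not in response.url) fails fast when
psnprofiles.com answers a not-quite-right profile name with a redirect that
drops the page query parameter; without it, the loop would re-scrape the same
overview page up to 999 times. The two-tier limiter setup presumably reflects
how often each resource changes: overview pages expire after an hour since
they move with every trophy earned, while per-game trophy pages are
near-static and can be cached for 14 days.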