import dataclasses import datetime import logging import re from collections.abc import Iterator import bs4 import requests_util import personal_data.html_util from personal_data import secrets from personal_data.data import DeduplicateMode, Scraper from .. import parse_util logger = logging.getLogger(__name__) URL_API_ROOT = 'https://psnprofiles.com/' URL_PROFILE = URL_API_ROOT + '{psn_id}' URL_USER_GAME_TROPHIES = URL_API_ROOT + 'trophies/{game_id}/{psn_id}' URL_GAMES_OVERVIEW = URL_API_ROOT + '{psn_id}' def game_psnprofiles_id_from_url(relative_url: str) -> int: m = re.match(r'/(?:trophy|trophies)/(\d+)\-(?:[\w-]+)(/[\w-]*)?', relative_url) result = m.group(1) return int(result) MAX_NUMBER_GAMES_TO_PARSE = 1000 @dataclasses.dataclass(frozen=True) class PsnProfilesScraper(Scraper): dataset_name = 'games_played_playstation' deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS @staticmethod def requires_cfscrape() -> bool: return True def scrape(self): self._setup_cache() games_rows = list(self._scrape_games_overview()) games_ids = {row['psnprofiles.game_id']: row['game.name'] for row in games_rows} logger.info('Found %d games from overview', len(games_rows)) SCRAPE_FROM_OVERVIEW = False if SCRAPE_FROM_OVERVIEW: yield from games_rows idx = 0 for game_id, game_name in games_ids.items(): yield from self._scrape_game_trophies(game_id, game_name) del game_id idx += 1 if idx >= MAX_NUMBER_GAMES_TO_PARSE: break def _setup_cache(self): requests_util.setup_limiter( self.session, URL_API_ROOT, per_minute=5, expire_after=datetime.timedelta(hours=1), ) requests_util.setup_limiter( self.session, URL_API_ROOT + '/trophies/', expire_after=datetime.timedelta(days=14), ) def _scrape_games_overview(self) -> Iterator[dict]: for page_num in range(1, 1000): logger.info('Getting Overview (page %d)', page_num) url = URL_GAMES_OVERVIEW.format(psn_id=secrets.PLAYSTATION_PSN_ID) response = self.session.get(url, params={'page': page_num}) if 'page' not in response.url: msg = "Configuration error? psnprofiles.com made an redirection. This is possibly because your profile name wasn't exactly as expected. Please check it" raise RuntimeError(msg) response.raise_for_status() soup = bs4.BeautifulSoup(response.text, 'lxml') soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False) games_on_page = list(self._iterate_games_from_games_table(soup)) yield from games_on_page if len(games_on_page) == 0: return def _scrape_games_overview_old(self) -> Iterator[dict]: # Request to get overview logger.info('Getting Overview') url = URL_PROFILE.format(psn_id=secrets.PLAYSTATION_PSN_ID) response = self.session.get(url) response.raise_for_status() now = parse_util.parse_response_datetime(response) # Parse data soup = bs4.BeautifulSoup(response.content, 'lxml') soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False) yield from self._iterate_games_from_recent_tropies(soup, now) yield from self._iterate_games_from_games_table(soup) def _iterate_games_from_recent_tropies(self, soup, now) -> Iterator[dict]: soup_recent_tropies = soup.select('ul#recent-trophies > li') assert len(soup_recent_tropies) > 0 for row in soup_recent_tropies: cells = row.select_one('.info .box td').find_all('div') trophy_name = cells[0].get_text().strip() trophy_desc = cells[1].get_text().strip() game_name = cells[2].a.extract().get_text().strip() psnprofiles_id = game_psnprofiles_id_from_url(cells[0].find('a')['href']) trophy_icon = row.find(class_='icon').find('img')['src'] gotten_at = ( cells[2].get_text().strip().removesuffix(' in').removesuffix(' ago') ) gotten_at = parse_util.parse_duration(gotten_at) time_acquired = now - gotten_at yield { 'game.name': game_name, 'me.last_played_time': time_acquired.date(), # Trophy Data 'trophy.name': trophy_name, 'trophy.desc': trophy_desc, 'trophy.icon': trophy_icon, 'psnprofiles.game_id': psnprofiles_id, } del row, cells, time_acquired def _iterate_games_from_games_table(self, soup) -> Iterator[dict]: # Games table table_rows = soup.find(id='gamesTable').find_all('tr') assert len(table_rows) > 0, url if title := table_rows[0].h2: if title.get_text().strip() == 'No games found': return for row in table_rows: cells = row.find_all('td') # Check for pagination if re.match( r'show \d+ more games', cells[0].get_text().strip(), re.IGNORECASE, ): break game_name = cells[1].find(class_='title').get_text() psnprofiles_id = game_psnprofiles_id_from_url(cells[0].find('a')['href']) game_icon = cells[0].find('img')['src'] game_name = row.select_one('.title').get_text() game_platform = row.select_one('.platform').get_text() small_infos = cells[1].find_all('div') if len(small_infos) > 2: time_played_div = small_infos[2] time_played_div.sup.extract() time_played = parse_util.parse_date( time_played_div.get_text(), ) else: time_played = None d = { # Important fields 'game.name': game_name, # Secondary fields 'game.platform': game_platform, 'game.icon': game_icon, 'psnprofiles.game_id': psnprofiles_id, } if time_played: d['me.last_played_time'] = time_played yield d def _scrape_game_trophies( self, psnprofiles_id: int, game_name: str, ) -> Iterator[dict]: assert isinstance(psnprofiles_id, int), psnprofiles_id assert isinstance(game_name, str), game_name logger.info('Getting Game Trophies %s', psnprofiles_id) url = URL_USER_GAME_TROPHIES.format( psn_id=secrets.PLAYSTATION_PSN_ID, game_id=psnprofiles_id, ) response = self.session.get(url) response.raise_for_status() # Parse data soup = bs4.BeautifulSoup(response.content, 'lxml') soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False) # Remove redundant for redundant in soup.select('.wide-ad'): redundant.extract() for redundant in soup.select('div.col-xs-4'): redundant.extract() # Recent trophies. soup_tropies = soup.select( '#content.page > .row > div.col-xs div.box table.zebra tr.completed', ) for row in soup_tropies: cells = row.find_all('td') trophy_name_a = cells[1].a if trophy_name_a is None: continue trophy_name = trophy_name_a.get_text().strip() trophy_name_a.extract() trophy_desc = cells[1].get_text().strip() trophy_icon = cells[0].img['src'] cells[2].span.span.nobr.sup.extract() gotten_at = parse_util.parse_time(cells[2].get_text()) yield { 'game.name': game_name, 'me.last_played_time': gotten_at, # Trophy Data 'trophy.name': trophy_name, 'trophy.desc': trophy_desc, 'trophy.icon': trophy_icon, 'psnprofiles.game_id': psnprofiles_id, } del row, cells