From 842bb5d609af44d333d867536b894fa2da2aa5f2 Mon Sep 17 00:00:00 2001 From: Jon Michael Aanes Date: Fri, 25 Oct 2024 21:29:44 +0200 Subject: [PATCH] Code quality and release year parsing --- personal_data/fetchers/psnprofiles.py | 21 +++++++++++++++++++++ personal_data/fetchers/steam_community.py | 14 ++++++++------ personal_data/parse_util.py | 14 ++++++++------ 3 files changed, 37 insertions(+), 12 deletions(-) diff --git a/personal_data/fetchers/psnprofiles.py b/personal_data/fetchers/psnprofiles.py index 4cd3b12..4cd66b0 100644 --- a/personal_data/fetchers/psnprofiles.py +++ b/personal_data/fetchers/psnprofiles.py @@ -2,6 +2,7 @@ import dataclasses import logging import re from collections.abc import Iterator +import datetime import bs4 import requests_util @@ -31,6 +32,8 @@ MAX_NUMBER_GAMES_TO_PARSE = 10000 @dataclasses.dataclass(frozen=True) class PsnProfilesScraper(Scraper): + """Downloads all trophies for the given user.""" + dataset_name = 'games_played_playstation' deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS @@ -124,6 +127,16 @@ class PsnProfilesScraper(Scraper): d['me.last_played_time'] = time_played yield d + def _parse_game_release_date(self, soup: bs4.BeautifulSoup) -> datetime.date: + table_rows = soup.select('table.gameInfo tr') + for row in table_rows : + cells = row.select('td') + if cells[0].get_text() in {'Release', 'Releases'}: + text = cells[1].get_text() + dates = re.findall(r'\w+\s+\d+,\s+\d{4}', text) + return min(parse_util.parse_date(date) for date in dates) + assert False, 'Could not find release date' + def _scrape_game_trophies( self, psnprofiles_id: int, @@ -143,8 +156,14 @@ class PsnProfilesScraper(Scraper): # Parse data soup = bs4.BeautifulSoup(response.content, 'lxml') + + # Normalize before parsing trophies soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False) + # Parse release year + game_release_date = self._parse_game_release_date(soup) + assert game_release_date + # Remove redundant for redundant in soup.select('.wide-ad'): redundant.extract() @@ -174,11 +193,13 @@ class PsnProfilesScraper(Scraper): yield { 'game.name': game_name, + 'game.release_date': game_release_date, 'me.last_played_time': gotten_at, # Trophy Data 'trophy.name': trophy_name, 'trophy.desc': trophy_desc, 'trophy.icon': trophy_icon, + # Ids 'psnprofiles.game_id': psnprofiles_id, } diff --git a/personal_data/fetchers/steam_community.py b/personal_data/fetchers/steam_community.py index 5290987..a7ea8c3 100644 --- a/personal_data/fetchers/steam_community.py +++ b/personal_data/fetchers/steam_community.py @@ -22,15 +22,17 @@ FORMAT_DATE_HEADER = '%d/%m/%YYYY' @dataclasses.dataclass(frozen=True) class SteamAchievementScraper(Scraper): + """Downloads all trophies for the given user.""" + dataset_name = 'games_played_TODO' deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS def scrape(self) -> Iterator[dict[str, Any]]: username = secrets.STEAM_USERNAME - for appid in self.determine_appids_from_recent_activity(username): - yield from self.scrape_app(username, appid) + for appid in self._determine_appids_from_recent_activity(username): + yield from self._scrape_app_achievements(username, appid) - def determine_appids_from_recent_activity(self, username: str) -> Iterator[int]: + def _determine_appids_from_recent_activity(self, username: str) -> Iterator[int]: url = URL_USER_RECENT_ACTIVITY.format( username=username, ) @@ -47,7 +49,7 @@ class SteamAchievementScraper(Scraper): appid = int(href.split('/')[-1]) yield appid - def scrape_app(self, username: str, appid: int) -> Iterator[dict[str, Any]]: + def _scrape_app_achievements(self, username: str, appid: int) -> Iterator[dict[str, Any]]: url = URL_GAME_ACHIVEMENTS.format( username=username, appid=appid, @@ -55,8 +57,6 @@ class SteamAchievementScraper(Scraper): response = self.session.get(url) response.raise_for_status() - NOW = parse_util.parse_response_datetime(response) - # Parse data soup = bs4.BeautifulSoup(response.content, 'lxml') @@ -90,6 +90,8 @@ class SteamAchievementScraper(Scraper): 'trophy.name': trophy_name, 'trophy.desc': trophy_desc, 'trophy.icon': trophy_icon, + # Ids + 'steam.appid': appid, } del entry, time_acquired diff --git a/personal_data/parse_util.py b/personal_data/parse_util.py index 475da67..0e0bf76 100644 --- a/personal_data/parse_util.py +++ b/personal_data/parse_util.py @@ -19,8 +19,6 @@ DATETIME_UNITS = { FORMAT_DATE_HEADER = '%a, %d %b %Y %H:%M:%S GMT' -FORMAT_DAY_MONTH_YEAR = '%d %B %Y' - def parse_duration(text: str) -> datetime.timedelta: (num, unit) = text.split(' ') @@ -69,7 +67,11 @@ def parse_time(text: str) -> datetime.datetime: def parse_date(text: str) -> datetime.date: - return datetime.datetime.strptime( - text.strip(), - FORMAT_DAY_MONTH_YEAR, - ).date() + text = text.strip() + if dt := try_parse(text, '%d %B %Y'): + return dt.date() + if dt := try_parse(text, '%b %d, %Y'): + return dt.date() + if dt := try_parse(text, '%B %d, %Y'): + return dt.date() + assert False, text