Code quality and release year parsing
This commit is contained in:
parent
ce89103c32
commit
842bb5d609
|
@ -2,6 +2,7 @@ import dataclasses
|
|||
import logging
|
||||
import re
|
||||
from collections.abc import Iterator
|
||||
import datetime
|
||||
|
||||
import bs4
|
||||
import requests_util
|
||||
|
@ -31,6 +32,8 @@ MAX_NUMBER_GAMES_TO_PARSE = 10000
|
|||
|
||||
@dataclasses.dataclass(frozen=True)
|
||||
class PsnProfilesScraper(Scraper):
|
||||
"""Downloads all trophies for the given user."""
|
||||
|
||||
dataset_name = 'games_played_playstation'
|
||||
deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS
|
||||
|
||||
|
@ -124,6 +127,16 @@ class PsnProfilesScraper(Scraper):
|
|||
d['me.last_played_time'] = time_played
|
||||
yield d
|
||||
|
||||
def _parse_game_release_date(self, soup: bs4.BeautifulSoup) -> datetime.date:
    """Extract the earliest release date from a PSN Profiles game page.

    Scans the ``table.gameInfo`` rows for a 'Release'/'Releases' label and
    parses every 'Month DD, YYYY' date found in the adjacent cell.

    Returns:
        The earliest release date found (a game may list one date per region).

    Raises:
        ValueError: if no release row or no parsable date is present.
    """
    for row in soup.select('table.gameInfo tr'):
        cells = row.select('td')
        # Guard against header/malformed rows with fewer than two <td> cells;
        # the original indexed cells[0] unconditionally and could IndexError.
        if len(cells) < 2:
            continue
        if cells[0].get_text() in {'Release', 'Releases'}:
            text = cells[1].get_text()
            # Dates look like "May 10, 2016"; a cell may contain several.
            dates = re.findall(r'\w+\s+\d+,\s+\d{4}', text)
            if not dates:
                # Found the release row but no recognizable dates: fall
                # through to the explicit error below instead of letting
                # min() raise a bare "empty sequence" ValueError.
                break
            return min(parse_util.parse_date(date) for date in dates)
    # `assert False` is stripped under `python -O`; raise explicitly instead.
    raise ValueError('Could not find release date')
|
||||
|
||||
def _scrape_game_trophies(
|
||||
self,
|
||||
psnprofiles_id: int,
|
||||
|
@ -143,8 +156,14 @@ class PsnProfilesScraper(Scraper):
|
|||
|
||||
# Parse data
|
||||
soup = bs4.BeautifulSoup(response.content, 'lxml')
|
||||
|
||||
# Normalize before parsing trophies
|
||||
soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False)
|
||||
|
||||
# Parse release year
|
||||
game_release_date = self._parse_game_release_date(soup)
|
||||
assert game_release_date
|
||||
|
||||
# Remove redundant
|
||||
for redundant in soup.select('.wide-ad'):
|
||||
redundant.extract()
|
||||
|
@ -174,11 +193,13 @@ class PsnProfilesScraper(Scraper):
|
|||
|
||||
yield {
|
||||
'game.name': game_name,
|
||||
'game.release_date': game_release_date,
|
||||
'me.last_played_time': gotten_at,
|
||||
# Trophy Data
|
||||
'trophy.name': trophy_name,
|
||||
'trophy.desc': trophy_desc,
|
||||
'trophy.icon': trophy_icon,
|
||||
# Ids
|
||||
'psnprofiles.game_id': psnprofiles_id,
|
||||
}
|
||||
|
||||
|
|
|
@ -22,15 +22,17 @@ FORMAT_DATE_HEADER = '%d/%m/%YYYY'
|
|||
|
||||
@dataclasses.dataclass(frozen=True)
|
||||
class SteamAchievementScraper(Scraper):
|
||||
"""Downloads all trophies for the given user."""
|
||||
|
||||
dataset_name = 'games_played_TODO'
|
||||
deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS
|
||||
|
||||
def scrape(self) -> Iterator[dict[str, Any]]:
    """Yield one row per Steam achievement for the configured user.

    Discovers app ids from the user's recent-activity page, then scrapes
    the achievement list of each discovered app.
    """
    username = secrets.STEAM_USERNAME
    # Delegate to the renamed private helpers; the superseded public-named
    # calls (determine_appids_from_recent_activity / scrape_app) are gone —
    # keeping both would have scraped every app twice.
    for appid in self._determine_appids_from_recent_activity(username):
        yield from self._scrape_app_achievements(username, appid)
|
||||
|
||||
def determine_appids_from_recent_activity(self, username: str) -> Iterator[int]:
|
||||
def _determine_appids_from_recent_activity(self, username: str) -> Iterator[int]:
|
||||
url = URL_USER_RECENT_ACTIVITY.format(
|
||||
username=username,
|
||||
)
|
||||
|
@ -47,7 +49,7 @@ class SteamAchievementScraper(Scraper):
|
|||
appid = int(href.split('/')[-1])
|
||||
yield appid
|
||||
|
||||
def scrape_app(self, username: str, appid: int) -> Iterator[dict[str, Any]]:
|
||||
def _scrape_app_achievements(self, username: str, appid: int) -> Iterator[dict[str, Any]]:
|
||||
url = URL_GAME_ACHIVEMENTS.format(
|
||||
username=username,
|
||||
appid=appid,
|
||||
|
@ -55,8 +57,6 @@ class SteamAchievementScraper(Scraper):
|
|||
response = self.session.get(url)
|
||||
response.raise_for_status()
|
||||
|
||||
NOW = parse_util.parse_response_datetime(response)
|
||||
|
||||
# Parse data
|
||||
soup = bs4.BeautifulSoup(response.content, 'lxml')
|
||||
|
||||
|
@ -90,6 +90,8 @@ class SteamAchievementScraper(Scraper):
|
|||
'trophy.name': trophy_name,
|
||||
'trophy.desc': trophy_desc,
|
||||
'trophy.icon': trophy_icon,
|
||||
# Ids
|
||||
'steam.appid': appid,
|
||||
}
|
||||
|
||||
del entry, time_acquired
|
||||
|
|
|
@ -19,8 +19,6 @@ DATETIME_UNITS = {
|
|||
|
||||
FORMAT_DATE_HEADER = '%a, %d %b %Y %H:%M:%S GMT'
|
||||
|
||||
FORMAT_DAY_MONTH_YEAR = '%d %B %Y'
|
||||
|
||||
|
||||
def parse_duration(text: str) -> datetime.timedelta:
|
||||
(num, unit) = text.split(' ')
|
||||
|
@ -69,7 +67,11 @@ def parse_time(text: str) -> datetime.datetime:
|
|||
|
||||
|
||||
def parse_date(text: str) -> datetime.date:
    """Parse a date string in one of the formats the scraped sites use.

    Accepts '10 May 2016' (day-first), 'May 10, 2016' (abbreviated month),
    and 'May 10, 2016' (full month name).

    Raises:
        ValueError: if the text matches none of the known formats.
    """
    text = text.strip()
    # Try each known format in turn; try_parse returns None on a mismatch.
    # A loop over the formats replaces the repetitive if-chain and makes
    # adding a new site format a one-line change.
    for fmt in ('%d %B %Y', '%b %d, %Y', '%B %d, %Y'):
        if dt := try_parse(text, fmt):
            return dt.date()
    # `assert False` disappears under `python -O`; fail loudly instead.
    raise ValueError(f'Unrecognized date format: {text!r}')
|
||||
|
|
Loading…
Reference in New Issue
Block a user