Compare commits

4 Commits

SHA1        Message                                            Date
b67089f911  Standardized scraper name to not include scraper   2024-10-25 21:50:56 +02:00
            Checks: "Run Python tests (through Pytest)" failing after 40s;
            "Verify Python project can be installed, loaded and have version checked" successful in 30s
ce63ad2d53  Merged all dataset outputs                          2024-10-25 21:47:44 +02:00
9a9af4287a  Ruff                                                2024-10-25 21:30:12 +02:00
842bb5d609  Code quality and release year parsing               2024-10-25 21:29:52 +02:00
7 changed files with 57 additions and 25 deletions

View File

@@ -213,7 +213,7 @@ def import_watched_series_csv_from_file(vault: ObsidianVault) -> int:
 def import_played_games_csv_from_file(vault: ObsidianVault) -> int:
-    data_path = Path('output/games_played_playstation.csv')
+    data_path = Path('output/games_played.csv')
     return import_activity_sample_csv_from_file(
         vault,
         data_path,
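
Note: this importer now reads the merged games_played.csv that the PSN, Steam and Lodestone scrapers all write into, relying on DeduplicateMode.BY_ALL_COLUMNS to keep exact repeats out of the shared dataset. A minimal sketch of that "duplicate only if every column matches" idea (merge_rows is a hypothetical stand-in, not the repository's actual dedupe code):

# Hypothetical sketch of by-all-columns deduplication when several
# scrapers append to the same dataset. Not the repository's actual code.
from collections.abc import Iterable


def merge_rows(existing: Iterable[dict], new: Iterable[dict]) -> list[dict]:
    seen: set[tuple] = set()
    merged: list[dict] = []
    for row in list(existing) + list(new):
        # A row counts as a duplicate only if every column matches exactly.
        key = tuple(sorted(row.items()))
        if key not in seen:
            seen.add(key)
            merged.append(row)
    return merged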

View File

@@ -18,12 +18,13 @@ URL_PROFILE_MOUNTS = (
     'https://eu.finalfantasyxiv.com/lodestone/character/{character_id}/mount/'
 )

-FORMAT_DATE_HEADER = '%d/%m/%YYYY'
+FFXIV_ARR_NAME = 'Final Fantasy XIV: A Realm Reborn'
+FFXIV_ARR_RELEASE_DATE = datetime.date(2013, 8, 27)

 @dataclasses.dataclass(frozen=True)
-class LodestoneAchievementScraper(Scraper):
-    dataset_name = 'games_played_playstation'
+class LodestoneAchievement(Scraper):
+    dataset_name = 'games_played'
     deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

     def scrape(self):
@@ -67,7 +68,8 @@ class LodestoneAchievementScraper(Scraper):
             trophy_icon = trophy_icon.src
             yield {
-                'game.name': 'Final Fantasy XIV: A Realm Reborn',
+                'game.name': FFXIV_ARR_NAME,
+                'game.release_date': FFXIV_ARR_RELEASE_DATE,
                 'me.last_played_time': time_acquired,
                 # Trophy Data
                 'trophy.name': trophy_name,

View File

@@ -43,7 +43,7 @@ def iterate_watched_episodes_of_series(client, series_id: str):

 @dataclasses.dataclass(frozen=True)
-class JellyfinWatchHistoryScraper(Scraper):
+class JellyfinWatchHistory(Scraper):
     dataset_name = 'show_episodes_watched'
     deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

View File

@@ -1,4 +1,5 @@
 import dataclasses
+import datetime
 import logging
 import re
 from collections.abc import Iterator
@@ -30,8 +31,10 @@ MAX_NUMBER_GAMES_TO_PARSE = 10000

 @dataclasses.dataclass(frozen=True)
-class PsnProfilesScraper(Scraper):
-    dataset_name = 'games_played_playstation'
+class PsnProfiles(Scraper):
+    """Downloads all trophies for the given user."""
+
+    dataset_name = 'games_played'
     deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

     @staticmethod
@@ -124,6 +127,16 @@ class PsnProfilesScraper(Scraper):
         d['me.last_played_time'] = time_played
         yield d

+    def _parse_game_release_date(self, soup: bs4.BeautifulSoup) -> datetime.date:
+        table_rows = soup.select('table.gameInfo tr')
+        for row in table_rows:
+            cells = row.select('td')
+            if cells[0].get_text() in {'Release', 'Releases'}:
+                text = cells[1].get_text()
+                dates = re.findall(r'\w+\s+\d+,\s+\d{4}', text)
+                return min(parse_util.parse_date(date) for date in dates)
+        assert False, 'Could not find release date'
+
     def _scrape_game_trophies(
         self,
         psnprofiles_id: int,
@@ -143,8 +156,14 @@ class PsnProfilesScraper(Scraper):
         # Parse data
         soup = bs4.BeautifulSoup(response.content, 'lxml')

+        # Normalize before parsing trophies
         soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False)

+        # Parse release year
+        game_release_date = self._parse_game_release_date(soup)
+        assert game_release_date
+
         # Remove redundant
         for redundant in soup.select('.wide-ad'):
             redundant.extract()
@@ -174,11 +193,13 @@
             yield {
                 'game.name': game_name,
+                'game.release_date': game_release_date,
                 'me.last_played_time': gotten_at,
                 # Trophy Data
                 'trophy.name': trophy_name,
                 'trophy.desc': trophy_desc,
                 'trophy.icon': trophy_icon,
+                # Ids
                 'psnprofiles.game_id': psnprofiles_id,
             }
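
Note: the new _parse_game_release_date helper scans the game-info table for a 'Release'/'Releases' row and keeps the earliest date, since a game may list one release per region. A standalone illustration of the regex-and-min() step (the sample cell text is made up, and strptime stands in for parse_util.parse_date):

import datetime
import re

# Made-up sample of the text found in a PSNProfiles 'Releases' cell.
text = 'September 13, 2013 (NA) September 17, 2013 (EU)'

dates = re.findall(r'\w+\s+\d+,\s+\d{4}', text)
# strptime stands in for parse_util.parse_date here.
parsed = [datetime.datetime.strptime(d, '%B %d, %Y').date() for d in dates]
print(min(parsed))  # 2013-09-13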

View File

@@ -21,16 +21,20 @@ FORMAT_DATE_HEADER = '%d/%m/%YYYY'

 @dataclasses.dataclass(frozen=True)
-class SteamAchievementScraper(Scraper):
-    dataset_name = 'games_played_TODO'
+class SteamAchievement(Scraper):
+    """Downloads all trophies for the given user."""
+
+    dataset_name = 'games_played'
     deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

     def scrape(self) -> Iterator[dict[str, Any]]:
-        username = secrets.STEAM_USERNAME
-        for appid in self.determine_appids_from_recent_activity(username):
-            yield from self.scrape_app(username, appid)
+        username: str = secrets.STEAM_USERNAME
+        appids = list(self._determine_appids_from_recent_activity(username))
+        logger.info('Found %d Steam Apps', len(appids))
+        for appid in appids:
+            yield from self._scrape_app_achievements(username, appid)

-    def determine_appids_from_recent_activity(self, username: str) -> Iterator[int]:
+    def _determine_appids_from_recent_activity(self, username: str) -> Iterator[int]:
         url = URL_USER_RECENT_ACTIVITY.format(
             username=username,
         )
@@ -47,7 +51,9 @@ class SteamAchievementScraper(Scraper):
         appid = int(href.split('/')[-1])
         yield appid

-    def scrape_app(self, username: str, appid: int) -> Iterator[dict[str, Any]]:
+    def _scrape_app_achievements(
+        self, username: str, appid: int,
+    ) -> Iterator[dict[str, Any]]:
         url = URL_GAME_ACHIVEMENTS.format(
             username=username,
             appid=appid,
@@ -55,8 +61,6 @@ class SteamAchievementScraper(Scraper):
         response = self.session.get(url)
         response.raise_for_status()

-        NOW = parse_util.parse_response_datetime(response)
-
         # Parse data
         soup = bs4.BeautifulSoup(response.content, 'lxml')
@@ -73,7 +77,7 @@ class SteamAchievementScraper(Scraper):
         for entry in soup.select('.achieveRow'):
             trophy_name: str = entry.select_one('h3').get_text()
             trophy_desc: str = entry.select_one('h5').get_text()
-            trophy_icon: str = entry.select_one('img').src
+            trophy_icon: str = entry.select_one('img')['src']

             time_acquired_html: str = entry.select_one('.achieveUnlockTime')
             if time_acquired_html is None:
@@ -85,11 +89,14 @@ class SteamAchievementScraper(Scraper):
             yield {
                 'game.name': game_name,
+                #'game.release_date': None,
                 'me.last_played_time': time_acquired,
                 # Trophy Data
                 'trophy.name': trophy_name,
                 'trophy.desc': trophy_desc,
                 'trophy.icon': trophy_icon,
+                # Ids
+                'steam.appid': appid,
             }
             del entry, time_acquired
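
Note: the trophy_icon fix above is subtle. On a BeautifulSoup Tag, attribute access such as img.src is interpreted as a lookup of a child *tag* named src (which yields None here), while HTML attributes are read with item access. A small demonstration:

import bs4

soup = bs4.BeautifulSoup('<img src="icon.png">', 'lxml')
img = soup.select_one('img')

print(img.src)     # None: searches for a child <src> tag, which does not exist
print(img['src'])  # 'icon.png': reads the HTML attribute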

View File

@@ -41,7 +41,7 @@ def load_credentials() -> CredentialsType:

 @dataclasses.dataclass(frozen=True)
-class WithingsActivityScraper(Scraper):
+class WithingsActivity(Scraper):
     dataset_name = 'withings_activity'
     deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

View File

@@ -19,8 +19,6 @@ DATETIME_UNITS = {
 FORMAT_DATE_HEADER = '%a, %d %b %Y %H:%M:%S GMT'

-FORMAT_DAY_MONTH_YEAR = '%d %B %Y'
-
 def parse_duration(text: str) -> datetime.timedelta:
     (num, unit) = text.split(' ')
@@ -69,7 +67,11 @@ def parse_time(text: str) -> datetime.datetime:

 def parse_date(text: str) -> datetime.date:
-    return datetime.datetime.strptime(
-        text.strip(),
-        FORMAT_DAY_MONTH_YEAR,
-    ).date()
+    text = text.strip()
+    if dt := try_parse(text, '%d %B %Y'):
+        return dt.date()
+    if dt := try_parse(text, '%b %d, %Y'):
+        return dt.date()
+    if dt := try_parse(text, '%B %d, %Y'):
+        return dt.date()
+    assert False, text
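
Note: try_parse is called here but not defined in this diff; presumably it wraps strptime and returns None when the format does not match, letting parse_date fall through the candidate formats. A plausible sketch under that assumption:

import datetime


def try_parse(text: str, fmt: str) -> datetime.datetime | None:
    """Attempt to parse text with the given strptime format.

    Returns None instead of raising when the format does not match,
    so callers can fall through to the next candidate format.
    """
    try:
        return datetime.datetime.strptime(text, fmt)
    except ValueError:
        return None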