This commit is contained in:
parent 64ba67831c
commit 87e9d4548c
@@ -134,19 +134,26 @@ def main():
     cookiejar = browsercookie.firefox()
     logger.warning('Got cookiejar from firefox: %s cookies', len(cookiejar))
 
+    scraper_filter = {'PsnProfilesScraper'}
+
     for scraper_cls in personal_data.data.Scraper.__subclasses__():
         session = get_session(cookiejar, with_cfscrape=scraper_cls.requires_cfscrape())
         scraper = scraper_cls(session)
+        if scraper_cls.__name__ not in scraper_filter:
+            continue
         logger.warning(
             'Running %s, appending to "%s"',
             scraper_cls.__name__,
             scraper.dataset_name,
         )
-        del scraper_cls
         result_rows = list()
-        for result in scraper.scrape():
-            result_rows.append(result)
-            del result
+        try:
+            for result in scraper.scrape():
+                result_rows.append(result)
+                del result
+        except requests.exceptions.HTTPError:
+            logger.exception('Failed in running %s', scraper_cls.__name__)
+            continue
         extend_csv_file(
             'output/' + scraper.dataset_name,
             result_rows,
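The hunk above combines three ideas: scrapers are discovered through Scraper.__subclasses__(), selected by class name against scraper_filter, and each run is wrapped in try/except so that one failing scraper is logged and skipped instead of aborting the whole run. Below is a minimal, self-contained sketch of that pattern; the Scraper, DemoScraper and BrokenScraper classes are illustrative stand-ins, not code from this repository, and the real code catches only requests.exceptions.HTTPError.

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class Scraper:
    """Illustrative base class; subclassing is the registration mechanism."""

    def scrape(self):
        raise NotImplementedError


class DemoScraper(Scraper):
    def scrape(self):
        yield {'row': 1}


class BrokenScraper(Scraper):
    def scrape(self):
        raise RuntimeError('simulated failure')
        yield  # unreachable, but makes this method a generator


scraper_filter = {'DemoScraper', 'BrokenScraper'}

for scraper_cls in Scraper.__subclasses__():
    if scraper_cls.__name__ not in scraper_filter:
        continue
    try:
        result_rows = list(scraper_cls().scrape())
    except Exception:
        # One failing scraper is logged and skipped; the loop continues.
        logger.exception('Failed in running %s', scraper_cls.__name__)
        continue
    logger.info('%s produced %d rows', scraper_cls.__name__, len(result_rows))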
@@ -1,5 +1,6 @@
 import dataclasses
 import datetime
+from collections.abc import Iterator
 import logging
 import re
 import secrets
@@ -13,6 +14,7 @@ from personal_data.data import DeduplicateMode, Scraper
 logger = logging.getLogger(__name__)
 
 URL_PROFILE = 'https://psnprofiles.com/{psn_id}'
+URL_USER_GAME_TROPHIES = 'https://psnprofiles.com/trophies/{game_id}/{psn_id}'
 
 FORMAT_DAY_MONTH_YEAR = '%d %B %Y'
 
@@ -32,6 +34,13 @@ assert game_psnprofiles_id_from_url(
     '/trophy/21045-theatrhythm-final-bar-line/19-seasoned-hunter',
 )
 
+def parse_time(text: str) -> datetime.datetime:
+    text = text.replace('\n', ' ')
+    text = text.strip()
+    return datetime.datetime.strptime(text, '%d %b %Y %I:%M:%S %p')
+
+assert parse_time('06 Apr 2024 06:11:42 PM')
+assert parse_time('26 Mar 2024 7:07:01 PM')
 
 @dataclasses.dataclass(frozen=True)
 class PsnProfilesScraper(Scraper):
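A quick, hedged check of the format string introduced above (not part of the commit): '%I' accepts both zero-padded and single-digit hours, and '%p' shifts afternoon times onto the 24-hour clock, which is why both asserts in the hunk pass.

import datetime

def parse_time(text: str) -> datetime.datetime:
    # Same logic as the function added in the hunk above.
    text = text.replace('\n', ' ')
    text = text.strip()
    return datetime.datetime.strptime(text, '%d %b %Y %I:%M:%S %p')

assert parse_time('06 Apr 2024 06:11:42 PM') == datetime.datetime(2024, 4, 6, 18, 11, 42)
assert parse_time('26 Mar 2024 7:07:01 PM') == datetime.datetime(2024, 3, 26, 19, 7, 1)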
@@ -43,8 +52,17 @@ class PsnProfilesScraper(Scraper):
         return True
 
     def scrape(self):
-        # Request to get watch history
-        logger.info('Getting Watchlist')
+        games_rows = list(self.scrape_games_overview())
+        games_ids = {row['psnprofiles.game_id']: row['game.name'] for row in games_rows}
+
+        yield from games_rows
+        for game_id, game_name in games_ids.items():
+            yield from self.scrape_game_trophies(game_id, game_name)
+            del game_id
+
+    def scrape_games_overview(self) -> Iterator[dict]:
+        # Request to get overview
+        logger.info('Getting Overview')
         url = URL_PROFILE.format(psn_id=secrets.PLAYSTATION_PSN_ID)
         response = self.session.get(url)
         response.raise_for_status()
@@ -130,3 +148,46 @@ class PsnProfilesScraper(Scraper):
             if time_played:
                 d['me.last_played_time'] = time_played
             yield d
+
+    def scrape_game_trophies(self, psnprofiles_id: str, game_name: str) -> Iterator[dict]:
+        logger.info('Getting Game Trophies %s', psnprofiles_id)
+
+        url = URL_USER_GAME_TROPHIES.format(psn_id=secrets.PLAYSTATION_PSN_ID,
+                                            game_id=psnprofiles_id)
+        response = self.session.get(url)
+        response.raise_for_status()
+
+        # Parse data
+        soup = bs4.BeautifulSoup(response.content, 'lxml')
+        soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False)
+
+        # Recent trophies.
+        soup_tropies = soup.select('#content table.zebra tr.completed')
+        assert len(soup_tropies) > 0, url
+        for row in soup_tropies:
+            print(row)
+
+            cells = row.find_all('td')
+            print(cells)
+
+            trophy_name_a = cells[1].a
+            trophy_name = trophy_name_a.get_text().strip()
+            trophy_name_a.extract()
+            trophy_desc = cells[1].get_text().strip()
+
+            trophy_icon = cells[0].img['src']
+
+            cells[2].span.span.nobr.sup.extract()
+            gotten_at = parse_time(cells[2].get_text())
+
+            yield {
+                'game.name': game_name,
+                'me.last_played_time': gotten_at,
+                # Trophy Data
+                'trophy.name': trophy_name,
+                'trophy.desc': trophy_desc,
+                'trophy.icon': trophy_icon,
+                'psnprofiles.game_id': psnprofiles_id,
+            }
+
+            del row, cells
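The trophy parsing in the hunk above leans on a small BeautifulSoup idiom: read the <a> text for the trophy name, then .extract() the tag so the remaining cell text is just the description. A self-contained illustration follows; the markup and strings are invented for the example and are not taken from psnprofiles.com.

import bs4

# Invented markup mimicking a single trophy cell.
html = '<td><a href="#">Seasoned Hunter</a> Defeat 100 monsters.</td>'
cell = bs4.BeautifulSoup(html, 'html.parser').td

trophy_name_a = cell.a
trophy_name = trophy_name_a.get_text().strip()   # 'Seasoned Hunter'
trophy_name_a.extract()                          # drop the link from the tree
trophy_desc = cell.get_text().strip()            # 'Defeat 100 monsters.'

print(trophy_name, '-', trophy_desc)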