PSN Profiles: Implemented pagination

parent 5255206cf4
commit 3170d8e7a8
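
Fetch the games overview page by page using the ?page= query parameter, stopping at the first page that yields no games. Also give the HTTP cache per-URL expiry (one hour for profile pages, fourteen days for trophy pages) and restructure get_session so the cfscrape and plain cached-session paths handle ignore_cache symmetrically.
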
@@ -1,6 +1,7 @@
 import dataclasses
 import logging
 import re
+import datetime
 from collections.abc import Iterator
 
 import bs4
@@ -17,6 +18,7 @@ logger = logging.getLogger(__name__)
 URL_API_ROOT = 'https://psnprofiles.com/'
 URL_PROFILE = URL_API_ROOT + '{psn_id}'
 URL_USER_GAME_TROPHIES = URL_API_ROOT + 'trophies/{game_id}/{psn_id}'
+URL_GAMES_OVERVIEW = URL_API_ROOT + '{psn_id}'
 
 
 def game_psnprofiles_id_from_url(relative_url: str) -> int:
@@ -42,6 +44,8 @@ class PsnProfilesScraper(Scraper):
         games_rows = list(self._scrape_games_overview())
         games_ids = {row['psnprofiles.game_id']: row['game.name'] for row in games_rows}
 
+        logger.info('Found %d games from overview', len(games_rows))
+
         SCRAPE_FROM_OVERVIEW = False
         if SCRAPE_FROM_OVERVIEW:
             yield from games_rows
@@ -59,24 +63,50 @@ class PsnProfilesScraper(Scraper):
             self.session,
             URL_API_ROOT,
             per_minute=5,
+            expire_after=datetime.timedelta(hours=1),
+        )
+        requests_util.setup_limiter(
+            self.session,
+            URL_API_ROOT+'/trophies/',
+            expire_after=datetime.timedelta(days=14),
         )
 
     def _scrape_games_overview(self) -> Iterator[dict]:
+
+        for page_num in range(1, 1000):
+            logger.info('Getting Overview (page %d)', page_num)
+            url = URL_GAMES_OVERVIEW.format(psn_id=secrets.PLAYSTATION_PSN_ID)
+            response = self.session.get(url, params={'page': page_num})
+            if 'page' not in response.url:
+                msg = 'Configuration error? psnprofiles.com issued a redirect. This is possibly because your profile name wasn\'t exactly as expected. Please check it.'
+                raise RuntimeError(msg)
+            response.raise_for_status()
+            soup = bs4.BeautifulSoup(response.text, 'lxml')
+            soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False)
+            games_on_page = list(self._iterate_games_from_games_table(soup))
+            yield from games_on_page
+            if len(games_on_page) == 0:
+                return
+
+    def _scrape_games_overview_old(self) -> Iterator[dict]:
         # Request to get overview
         logger.info('Getting Overview')
         url = URL_PROFILE.format(psn_id=secrets.PLAYSTATION_PSN_ID)
         response = self.session.get(url)
         response.raise_for_status()
 
-        NOW = parse_util.parse_response_datetime(response)
+        now = parse_util.parse_response_datetime(response)
 
         # Parse data
         soup = bs4.BeautifulSoup(response.content, 'lxml')
         soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False)
 
-        # Recent trophies.
+        yield from self._iterate_games_from_recent_tropies(soup, now)
+        yield from self._iterate_games_from_games_table(soup)
+
+    def _iterate_games_from_recent_tropies(self, soup, now) -> Iterator[dict]:
         soup_recent_tropies = soup.select('ul#recent-trophies > li')
-        assert len(soup_recent_tropies) > 0, url
+        assert len(soup_recent_tropies) > 0
         for row in soup_recent_tropies:
             cells = row.select_one('.info .box td').find_all('div')
 
@@ -91,7 +121,7 @@ class PsnProfilesScraper(Scraper):
                 cells[2].get_text().strip().removesuffix(' in').removesuffix(' ago')
             )
             gotten_at = parse_util.parse_duration(gotten_at)
-            time_acquired = NOW - gotten_at
+            time_acquired = now - gotten_at
 
             yield {
                 'game.name': game_name,
@@ -105,10 +135,15 @@ class PsnProfilesScraper(Scraper):
 
             del row, cells, time_acquired
 
+    def _iterate_games_from_games_table(self, soup) -> Iterator[dict]:
         # Games table
         table_rows = soup.find(id='gamesTable').find_all('tr')
         assert len(table_rows) > 0, url
 
+        if title := table_rows[0].h2:
+            if title.get_text().strip() == 'No games found':
+                return
+
         for row in table_rows:
             cells = row.find_all('td')
 
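
The heart of the change is the pagination loop in _scrape_games_overview. Stripped of the scraper plumbing, the pattern reduces to the sketch below; parse_rows is a hypothetical stand-in for the bs4 table walking done by _iterate_games_from_games_table, and only the URL constant and the control flow are taken from the diff.

    import bs4
    import requests

    URL_GAMES_OVERVIEW = 'https://psnprofiles.com/{psn_id}'

    def parse_rows(html: str) -> list[dict]:
        # Simplified stand-in for _iterate_games_from_games_table: one dict
        # per row of the #gamesTable element, empty list when there is none.
        soup = bs4.BeautifulSoup(html, 'lxml')
        table = soup.find(id='gamesTable')
        return [{'row': row} for row in table.find_all('tr')] if table else []

    def scrape_games_overview(session: requests.Session, psn_id: str):
        """Yield game rows page by page until a page comes back empty."""
        for page_num in range(1, 1000):  # hard upper bound, as in the commit
            response = session.get(
                URL_GAMES_OVERVIEW.format(psn_id=psn_id),
                params={'page': page_num},
            )
            # psnprofiles.com redirects (dropping the query string) when the
            # profile name is wrong; fail loudly rather than loop on page 1.
            if 'page' not in response.url:
                raise RuntimeError('Unexpected redirect; check the PSN id')
            response.raise_for_status()
            rows = parse_rows(response.text)
            yield from rows
            if not rows:
                return  # the first empty page ends the iteration

The stop condition costs one extra request past the real end: the 'No games found' heading handled in _iterate_games_from_games_table is what makes that final page come back empty. Below, get_session (in a second file; this view omits filenames) is restructured so the cfscrape path mirrors the plain requests_cache path.
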
@@ -54,12 +54,16 @@ def get_session(
     ignore_cache: bool,
 ) -> requests.Session:
     assert isinstance(with_cfscrape, bool)
-    session_class = requests_cache.CachedSession
-    if ignore_cache:
-        logger.warning('HTTP cache disabled')
-        return requests.Session()
-    if cfscrape:
-        session_class = CachedCfScrape
+    if cfscrape:
+        session_class = CachedCfScrape
+        if ignore_cache:
+            logger.warning('HTTP cache disabled')
+            return cfscrape.create_scraper()
+    else:
+        session_class = requests_cache.CachedSession
+        if ignore_cache:
+            logger.warning('HTTP cache disabled')
+            return requests.Session()
     session = session_class(
         OUTPUT_PATH / 'web_cache',
         cookies=cookiejar,
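
A note on the cache expiry: setup_limiter is a project-internal helper this commit doesn't show, but assuming it sits on top of requests_cache (the session classes above come from that library), the per-URL expiry configured here can be expressed roughly as follows; the glob patterns are illustrative, not taken from the project.

    import datetime

    import requests_cache

    # First matching pattern wins, so the more specific trophy-page rule
    # comes before the catch-all for the rest of the site.
    session = requests_cache.CachedSession(
        'web_cache',
        urls_expire_after={
            'psnprofiles.com/trophies/*': datetime.timedelta(days=14),
            'psnprofiles.com/*': datetime.timedelta(hours=1),
        },
    )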