2024-03-03 15:59:03 +00:00
import dataclasses
2024-02-25 00:38:44 +00:00
import logging
2024-03-31 22:55:55 +00:00
import re
2024-10-23 19:29:53 +00:00
import datetime
2024-04-16 21:00:44 +00:00
from collections . abc import Iterator
2024-03-31 22:55:55 +00:00
2024-02-25 00:38:44 +00:00
import bs4
2024-10-03 21:24:12 +00:00
import requests_util
2024-02-25 00:38:44 +00:00
import personal_data . html_util
2024-04-28 22:01:11 +00:00
from personal_data import secrets
2024-03-31 22:55:55 +00:00
from personal_data . data import DeduplicateMode , Scraper
2024-02-25 00:38:44 +00:00
2024-05-18 19:35:58 +00:00
from . . import parse_util
2024-02-25 00:38:44 +00:00
logger = logging . getLogger ( __name__ )
2024-08-25 19:18:55 +00:00
# Base URL of psnprofiles.com; all other endpoints are derived from it.
URL_API_ROOT = 'https://psnprofiles.com/'
# Profile front page for a given PSN user id.
URL_PROFILE = URL_API_ROOT + '{psn_id}'
# Trophy list for a single game, scoped to a given PSN user id.
URL_USER_GAME_TROPHIES = URL_API_ROOT + 'trophies/{game_id}/{psn_id}'
# Paginated games overview (same path as the profile page; paging is done
# via the `page` query parameter).
URL_GAMES_OVERVIEW = URL_API_ROOT + '{psn_id}'
2024-05-09 14:59:56 +00:00
2024-02-25 00:38:44 +00:00
def game_psnprofiles_id_from_url(relative_url: str) -> int:
    """Extract the numeric psnprofiles game id from a relative URL.

    Accepts paths like ``/trophies/12345-game-name`` or
    ``/trophy/12345-game-name/...`` and returns ``12345``.

    Raises:
        ValueError: if the URL does not match the expected pattern
            (previously this surfaced as an opaque ``AttributeError``
            from calling ``.group`` on ``None``).
    """
    m = re.match(r'/(?:trophy|trophies)/(\d+)-(?:[\w-]+)(/[\w-]*)?', relative_url)
    if m is None:
        msg = f'Unexpected psnprofiles URL format: {relative_url!r}'
        raise ValueError(msg)
    return int(m.group(1))
2024-03-31 22:55:55 +00:00
2024-08-25 21:12:33 +00:00
# Safety cap on how many games' trophy pages are fetched per scrape run.
MAX_NUMBER_GAMES_TO_PARSE = 1000
2024-04-06 16:59:18 +00:00
2024-10-03 21:24:12 +00:00
2024-03-31 22:55:55 +00:00
@dataclasses.dataclass(frozen=True)
class PsnProfilesScraper(Scraper):
    """Scrapes games played and trophies earned from psnprofiles.com.

    Produces one row per game from the paginated games overview, then one
    row per completed trophy for each of those games.
    """

    dataset_name = 'games_played_playstation'
    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    @staticmethod
    def requires_cfscrape() -> bool:
        # psnprofiles.com sits behind Cloudflare protection, so a
        # cfscrape-capable session is required.
        return True

    def scrape(self):
        """Yield game rows and trophy rows for the configured PSN profile."""
        self._setup_cache()
        games_rows = list(self._scrape_games_overview())
        games_ids = {row['psnprofiles.game_id']: row['game.name'] for row in games_rows}

        logger.info('Found %d games from overview', len(games_rows))

        # Overview rows are currently not emitted directly; only per-game
        # trophy rows are yielded. Flip this to also emit the overview rows.
        SCRAPE_FROM_OVERVIEW = False
        if SCRAPE_FROM_OVERVIEW:
            yield from games_rows

        # Cap the number of per-game trophy fetches (each is a slow,
        # rate-limited HTTP request).
        for idx, (game_id, game_name) in enumerate(games_ids.items()):
            if idx >= MAX_NUMBER_GAMES_TO_PARSE:
                break
            yield from self._scrape_game_trophies(game_id, game_name)

    def _setup_cache(self):
        """Configure rate limiting and response caching on the session."""
        # Overview pages change often: limit request rate and cache briefly.
        requests_util.setup_limiter(
            self.session,
            URL_API_ROOT,
            per_minute=5,
            expire_after=datetime.timedelta(hours=1),
        )
        # Trophy pages are mostly static: cache for two weeks.
        # NOTE(review): URL_API_ROOT already ends with '/', so this prefix
        # contains a double slash ('...com//trophies/') — confirm this is the
        # limiter key actually intended.
        requests_util.setup_limiter(
            self.session,
            URL_API_ROOT + '/trophies/',
            expire_after=datetime.timedelta(days=14),
        )

    def _scrape_games_overview(self) -> Iterator[dict]:
        """Yield one row per game, walking overview pages until one is empty."""
        for page_num in range(1, 1000):
            logger.info('Getting Overview (page %d)', page_num)
            url = URL_GAMES_OVERVIEW.format(psn_id=secrets.PLAYSTATION_PSN_ID)
            response = self.session.get(url, params={'page': page_num})
            # A redirect drops the `page` parameter; treat it as a
            # misconfigured profile name rather than silently looping.
            if 'page' not in response.url:
                msg = 'Configuration error? psnprofiles.com made an redirection. This is possibly because your profile name wasn\'t exactly as expected. Please check it'
                raise RuntimeError(msg)
            response.raise_for_status()
            soup = bs4.BeautifulSoup(response.text, 'lxml')
            soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False)
            games_on_page = list(self._iterate_games_from_games_table(soup))
            yield from games_on_page
            if len(games_on_page) == 0:
                # First empty page marks the end of the overview.
                return

    def _scrape_games_overview_old(self) -> Iterator[dict]:
        """Old overview scraper: parses only the profile front page
        (recent trophies plus the truncated games table). Kept for reference.
        """
        # Request to get overview
        logger.info('Getting Overview')
        url = URL_PROFILE.format(psn_id=secrets.PLAYSTATION_PSN_ID)
        response = self.session.get(url)
        response.raise_for_status()

        # Trophy entries only carry relative "… ago" durations, so absolute
        # times are computed against the response's own timestamp.
        now = parse_util.parse_response_datetime(response)

        # Parse data
        soup = bs4.BeautifulSoup(response.content, 'lxml')
        soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False)

        yield from self._iterate_games_from_recent_tropies(soup, now)
        yield from self._iterate_games_from_games_table(soup)

    def _iterate_games_from_recent_tropies(self, soup, now) -> Iterator[dict]:
        """Yield one row per entry in the profile's recent-trophies list.

        Args:
            soup: parsed profile front page.
            now: datetime of the HTTP response, used to convert relative
                durations ("3 days ago") into absolute acquisition times.
        """
        soup_recent_tropies = soup.select('ul#recent-trophies > li')
        assert len(soup_recent_tropies) > 0

        for row in soup_recent_tropies:
            cells = row.select_one('.info .box td').find_all('div')

            trophy_name = cells[0].get_text().strip()
            trophy_desc = cells[1].get_text().strip()
            # Extract the game-name link so the leftover text in cells[2]
            # is only the relative timestamp.
            game_name = cells[2].a.extract().get_text().strip()

            psnprofiles_id = game_psnprofiles_id_from_url(cells[0].find('a')['href'])
            trophy_icon = row.find(class_='icon').find('img')['src']

            # Remaining text looks like "3 days ago in"; strip suffixes and
            # parse the duration.
            gotten_at = (
                cells[2].get_text().strip().removesuffix(' in').removesuffix(' ago')
            )
            gotten_at = parse_util.parse_duration(gotten_at)
            time_acquired = now - gotten_at

            yield {
                'game.name': game_name,
                'me.last_played_time': time_acquired.date(),
                # Trophy Data
                'trophy.name': trophy_name,
                'trophy.desc': trophy_desc,
                'trophy.icon': trophy_icon,
                'psnprofiles.game_id': psnprofiles_id,
            }

            del row, cells, time_acquired

    def _iterate_games_from_games_table(self, soup) -> Iterator[dict]:
        """Yield one row per game in the page's games table (#gamesTable)."""
        # Games table
        table_rows = soup.find(id='gamesTable').find_all('tr')
        # BUG FIX: this assert previously referenced an undefined name `url`,
        # raising NameError instead of AssertionError when the table is empty.
        assert len(table_rows) > 0, 'no rows in games table'

        # An empty result set renders a single "No games found" header row.
        if title := table_rows[0].h2:
            if title.get_text().strip() == 'No games found':
                return

        for row in table_rows:
            cells = row.find_all('td')

            # Check for pagination
            if re.match(
                r'show \d+ more games',
                cells[0].get_text().strip(),
                re.IGNORECASE,
            ):
                break

            psnprofiles_id = game_psnprofiles_id_from_url(cells[0].find('a')['href'])
            game_icon = cells[0].find('img')['src']
            # BUG FIX: game_name was assigned twice (first from
            # cells[1].find(class_='title'), then immediately overwritten by
            # the row-level selector); only the effective assignment is kept.
            game_name = row.select_one('.title').get_text()
            game_platform = row.select_one('.platform').get_text()

            # The third <div> in the info cell, when present, holds the
            # last-played date (with a <sup> ordinal suffix to discard).
            small_infos = cells[1].find_all('div')
            if len(small_infos) > 2:
                time_played_div = small_infos[2]
                time_played_div.sup.extract()
                time_played = parse_util.parse_date(
                    time_played_div.get_text(),
                )
            else:
                time_played = None

            d = {
                # Important fields
                'game.name': game_name,
                # Secondary fields
                'game.platform': game_platform,
                'game.icon': game_icon,
                'psnprofiles.game_id': psnprofiles_id,
            }
            if time_played:
                d['me.last_played_time'] = time_played
            yield d

    def _scrape_game_trophies(
        self,
        psnprofiles_id: int,
        game_name: str,
    ) -> Iterator[dict]:
        """Yield one row per completed trophy for a single game.

        Args:
            psnprofiles_id: numeric game id on psnprofiles.com.
            game_name: display name carried through into each yielded row.
        """
        assert isinstance(psnprofiles_id, int), psnprofiles_id
        assert isinstance(game_name, str), game_name

        logger.info('Getting Game Trophies %s', psnprofiles_id)

        url = URL_USER_GAME_TROPHIES.format(
            psn_id=secrets.PLAYSTATION_PSN_ID,
            game_id=psnprofiles_id,
        )
        response = self.session.get(url)
        response.raise_for_status()

        # Parse data
        soup = bs4.BeautifulSoup(response.content, 'lxml')
        soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False)

        # Remove redundant page furniture (ads, sidebar columns).
        for redundant in soup.select('.wide-ad'):
            redundant.extract()
        for redundant in soup.select('div.col-xs-4'):
            redundant.extract()

        # Completed trophies only.
        soup_tropies = soup.select(
            '#content.page > .row > div.col-xs div.box table.zebra tr.completed',
        )
        for row in soup_tropies:
            cells = row.find_all('td')

            trophy_name_a = cells[1].a
            if trophy_name_a is None:
                # Row without a trophy link (e.g. header); skip it.
                continue

            trophy_name = trophy_name_a.get_text().strip()
            # Extract the link so only the description remains in the cell.
            trophy_name_a.extract()
            trophy_desc = cells[1].get_text().strip()
            trophy_icon = cells[0].img['src']

            # Drop the <sup> ordinal suffix before parsing the timestamp.
            cells[2].span.span.nobr.sup.extract()
            gotten_at = parse_util.parse_time(cells[2].get_text())

            yield {
                'game.name': game_name,
                'me.last_played_time': gotten_at,
                # Trophy Data
                'trophy.name': trophy_name,
                'trophy.desc': trophy_desc,
                'trophy.icon': trophy_icon,
                'psnprofiles.game_id': psnprofiles_id,
            }
            del row, cells