
The Lodestone

Jon Michael Aanes 2024-03-03 17:25:34 +01:00
parent a03ba73dcd
commit 8b9a878af9
5 changed files with 89 additions and 17 deletions

View File

@@ -11,6 +11,7 @@ import cfscrape
 
 logger = logging.getLogger(__name__)
 
+import personal_data.fetchers.ffxiv_lodestone
 import personal_data.fetchers.playstation
 import personal_data.fetchers.crunchyroll
 import personal_data.fetchers.psnprofiles
@@ -95,19 +96,25 @@ STANDARD_HEADERS = {
     'Accept-Encoding': 'gzip, deflate, br',
 }
 
+def get_session(with_cfscrape: bool, cookiejar) -> requests.Session:
+    assert isinstance(with_cfscrape, bool)
+    if with_cfscrape:
+        session = cfscrape.create_scraper()
+    else:
+        session = requests_cache.CachedSession('web_cache', cookies = cookiejar)
+    for cookie in cookiejar:
+        session.cookies.set_cookie(cookie)
+    return session
+
 def main():
     cookiejar = browsercookie.firefox()
     logger.warning('Got cookiejar from firefox: %s cookies', len(cookiejar))
-    #session = requests_cache.CachedSession('web_cache', cookies = cookiejar)
-    session = cfscrape.create_scraper()
-    for cookie in cookiejar:
-        session.cookies.set_cookie(cookie)
-
     for scraper_cls in personal_data.data.Scraper.__subclasses__():
+        session = get_session(scraper_cls.requires_cfscrape(), cookiejar)
         scraper = scraper_cls(session)
+        logger.warning('Running %s, appending to "%s"', scraper_cls.__name__, scraper.dataset_name)
         del scraper_cls
-        logger.warning('Running scraper: %s', scraper.dataset_name)
         result_rows = list()
         for result in scraper.scrape():
             result_rows.append(result)
@@ -115,7 +122,7 @@ def main():
         extend_csv_file('output/'+scraper.dataset_name, result_rows,
                 deduplicate_mode = scraper.deduplicate_mode)
         logger.warning('Scraper done: %s', scraper.dataset_name)
-        del scraper
+        del scraper, session
 
 if __name__ == '__main__':
     main()
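For context, a sketch of how the new per-scraper session dispatch behaves once both fetchers are imported (illustration only, not part of the commit; it assumes this repository's modules are importable):

import personal_data.data
import personal_data.fetchers.ffxiv_lodestone
import personal_data.fetchers.psnprofiles

# PsnProfilesScraper overrides requires_cfscrape() to True, so it gets a
# Cloudflare-circumventing cfscrape session; every other subclass inherits
# the False default and gets the requests_cache-backed session.
for scraper_cls in personal_data.data.Scraper.__subclasses__():
    kind = 'cfscrape' if scraper_cls.requires_cfscrape() else 'requests_cache'
    print(f'{scraper_cls.__name__}: {kind}')
# PsnProfilesScraper: cfscrape
# LodestoneAchievementScraper: requests_cache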

View File

@@ -13,22 +13,21 @@ class Scraper(abc.ABC):
     session: requests.Session
 
     @staticmethod
-    @property
-    @abc.abstractmethod
-    def dataset_name(self) -> str:
+    def dataset_name() -> str:
         pass
 
     @staticmethod
-    @property
-    @abc.abstractmethod
-    def deduplicate_mode(self) -> DeduplicateMode:
+    def deduplicate_mode() -> DeduplicateMode:
         pass
 
     @staticmethod
-    @property
-    def dataset_format(self) -> str:
+    def dataset_format() -> str:
         return 'list-of-dicts'
 
+    @staticmethod
+    def requires_cfscrape() -> bool:
+        return False
+
     @abc.abstractmethod
     def scrape(self):
         pass
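A minimal subclass sketch (hypothetical fetcher, assuming the conventions visible elsewhere in this commit): concrete scrapers shadow dataset_name and deduplicate_mode with class attributes, override requires_cfscrape() only when Cloudflare circumvention is needed, and implement scrape() as a generator of row dicts.

import dataclasses
from personal_data.data import Scraper, DeduplicateMode

@dataclasses.dataclass(frozen = True)
class ExampleScraper(Scraper):  # hypothetical, for illustration only
    dataset_name = 'example_dataset'
    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

    @staticmethod
    def requires_cfscrape() -> bool:
        return True  # ask get_session() for a cfscrape session

    def scrape(self):
        yield {'example.key': 'example value'}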

View File

@@ -0,0 +1,58 @@
+import secrets
+import functools
+import dataclasses
+import re
+import logging
+import bs4
+import datetime
+
+from personal_data.data import Scraper, DeduplicateMode
+import personal_data.html_util
+import personal_data.parse_util
+
+logger = logging.getLogger(__name__)
+
+URL_PROFILE_ACHIEVEMENTS = 'https://eu.finalfantasyxiv.com/lodestone/character/{character_id}/achievement/?page={page_idx}'
+URL_PROFILE_MINIONS = 'https://eu.finalfantasyxiv.com/lodestone/character/{character_id}/minion/'
+URL_PROFILE_MOUNTS = 'https://eu.finalfantasyxiv.com/lodestone/character/{character_id}/mount/'
+
+FORMAT_DATE_HEADER = '%d/%m/%Y'
+
+@dataclasses.dataclass(frozen = True)
+class LodestoneAchievementScraper(Scraper):
+    dataset_name = 'games_played_playstation'
+    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS
+
+    def scrape(self):
+        for page_idx in range(1, 13+1): # TODO: Automatically determine
+            url = URL_PROFILE_ACHIEVEMENTS.format(character_id = secrets.FFXIV_CHARACTER_ID, page_idx = page_idx)
+            response = self.session.get(url)
+            response.raise_for_status()
+
+            NOW = personal_data.parse_util.response_datetime(response)
+
+            # Parse data
+            soup = bs4.BeautifulSoup(response.content, 'lxml')
+            soup = personal_data.html_util.normalize_soup_slightly(soup, classes = False, scripts = False)
+            #print(soup)
+
+            for entry in soup.select('.ldst__achievement ul li.entry'):
+                time_acquired = str(entry.script.text).strip()
+                time_acquired = re.search(r'ldst_strftime\((\d+)\s*,', time_acquired).group(1)
+                time_acquired = int(time_acquired)
+                time_acquired = datetime.datetime.fromtimestamp(time_acquired)
+                trophy_desc = entry.select_one('.entry__activity__txt').get_text().strip()
+                trophy_name = re.match(r'^.*achievement "([^"]+)" earned!$', trophy_desc).group(1)
+                trophy_icon = entry.select_one('.entry__achievement__frame img')
+                trophy_icon = trophy_icon['src']
+
+                yield {
+                    'game.name' : 'Final Fantasy XIV',
+                    'me.last_played_time': time_acquired,
+
+                    # Trophy Data
+                    'trophy.name': trophy_name,
+                    'trophy.desc': trophy_desc,
+                    'trophy.icon': trophy_icon,
+                }
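Two details worth noting here: the secrets import resolves to the repository's local secrets module (which is what provides FFXIV_CHARACTER_ID), not the standard library, and each achievement entry embeds its timestamp inside an inline ldst_strftime(...) script call that the regex unpacks. A standalone sketch of that extraction (the sample script text is an assumed shape of the Lodestone markup, not taken from the commit):

import re
import datetime

sample_script = "document.write(ldst_strftime(1709483134, 'YMD'));"  # assumed markup shape
epoch = int(re.search(r'ldst_strftime\((\d+)\s*,', sample_script).group(1))
print(datetime.datetime.fromtimestamp(epoch))
# 1709483134 is 2024-03-03 16:25:34 UTC; fromtimestamp() returns naive local time.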

View File

@@ -14,7 +14,6 @@ logger = logging.getLogger(__name__)
 
 URL_PROFILE = 'https://psnprofiles.com/{psn_id}'
 
-FORMAT_DATE_HEADER = '%a, %d %b %Y %H:%M:%S GMT'
 FORMAT_DAY_MONTH_YEAR = '%d %B %Y'
 
 def game_psnprofiles_id_from_url(relative_url: str) -> int:
@@ -32,6 +31,10 @@ class PsnProfilesScraper(Scraper):
     dataset_name = 'games_played_playstation'
     deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS
 
+    @staticmethod
+    def requires_cfscrape() -> bool:
+        return True
+
     def scrape(self):
         # Request to get watch history
         logger.info('Getting Watchlist')
@@ -39,7 +42,7 @@ class PsnProfilesScraper(Scraper):
         response = self.session.get(url)
         response.raise_for_status()
 
-        NOW = datetime.datetime.strptime(response.headers['Date'], FORMAT_DATE_HEADER)
+        NOW = personal_data.parse_util.response_datetime(response)
 
         # Parse data
         soup = bs4.BeautifulSoup(response.content, 'lxml')

View File

@@ -18,8 +18,13 @@ DATETIME_UNITS = {
     'years': datetime.timedelta(days = 365),
 }
 
+FORMAT_DATE_HEADER = '%a, %d %b %Y %H:%M:%S GMT'
+
 def parse_duration(text: str) -> datetime.timedelta:
     (num, unit) = text.split(' ')
     num = int(num)
     unit = DATETIME_UNITS[unit]
     return unit * num
+
+def response_datetime(response) -> datetime.datetime:
+    return datetime.datetime.strptime(response.headers['Date'], FORMAT_DATE_HEADER)
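A quick check of the header format the new helper parses (stdlib only; no real response object is needed, since only the strptime behaviour is being shown):

import datetime

FORMAT_DATE_HEADER = '%a, %d %b %Y %H:%M:%S GMT'
print(datetime.datetime.strptime('Sun, 03 Mar 2024 16:25:34 GMT', FORMAT_DATE_HEADER))
# 2024-03-03 16:25:34, a naive datetime; 'GMT' is matched as a literal, not parsed as a timezone.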