The Lodestone

parent a03ba73dcd
commit 8b9a878af9
@@ -11,6 +11,7 @@ import cfscrape
 logger = logging.getLogger(__name__)
 
+import personal_data.fetchers.ffxiv_lodestone
 import personal_data.fetchers.playstation
 import personal_data.fetchers.crunchyroll
 import personal_data.fetchers.psnprofiles
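Note: the new import is all it takes to activate the fetcher. main() discovers scrapers through Scraper.__subclasses__(), so defining a subclass registers it as a side effect of the import. A self-contained sketch of that mechanism (class names here are illustrative, not from the repo):

import abc

class Scraper(abc.ABC):
    @abc.abstractmethod
    def scrape(self): ...

class DemoScraper(Scraper):  # registered simply by being defined
    def scrape(self):
        yield {'demo': 'row'}

print([cls.__name__ for cls in Scraper.__subclasses__()])  # ['DemoScraper']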
@@ -95,19 +96,25 @@ STANDARD_HEADERS = {
     'Accept-Encoding': 'gzip, deflate, br',
 }
 
+def get_session(with_cfscrape: bool, cookiejar) -> requests.Session:
+    assert isinstance(with_cfscrape, bool)
+    if with_cfscrape:
+        session = cfscrape.create_scraper()
+    else:
+        session = requests_cache.CachedSession('web_cache', cookies = cookiejar)
+    for cookie in cookiejar:
+        session.cookies.set_cookie(cookie)
+    return session
+
 def main():
     cookiejar = browsercookie.firefox()
     logger.warning('Got cookiejar from firefox: %s cookies', len(cookiejar))
 
-    #session = requests_cache.CachedSession('web_cache', cookies = cookiejar)
-    session = cfscrape.create_scraper()
-    for cookie in cookiejar:
-        session.cookies.set_cookie(cookie)
-
     for scraper_cls in personal_data.data.Scraper.__subclasses__():
+        session = get_session(scraper_cls.requires_cfscrape(), cookiejar)
         scraper = scraper_cls(session)
+        logger.warning('Running %s, appending to "%s"', scraper_cls.__name__, scraper.dataset_name)
         del scraper_cls
-        logger.warning('Running scraper: %s', scraper.dataset_name)
         result_rows = list()
         for result in scraper.scrape():
             result_rows.append(result)
@@ -115,7 +122,7 @@ def main():
         extend_csv_file('output/'+scraper.dataset_name, result_rows,
                 deduplicate_mode = scraper.deduplicate_mode)
         logger.warning('Scraper done: %s', scraper.dataset_name)
-        del scraper
+        del scraper, session
 
 if __name__ == '__main__':
     main()
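A quick sanity sketch of the get_session() dispatch above, assuming cfscrape and requests_cache are installed. Both branches hand back a requests.Session subclass (cfscrape's scraper and requests_cache's CachedSession both extend it), so the calling code can treat them interchangeably:

import requests

jar = requests.cookies.RequestsCookieJar()  # stand-in for the Firefox cookiejar
plain = get_session(False, jar)  # requests_cache.CachedSession backed by 'web_cache'
cloud = get_session(True, jar)   # cfscrape scraper for Cloudflare-fronted sites
assert isinstance(plain, requests.Session) and isinstance(cloud, requests.Session)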
personal_data/data.py

@@ -13,22 +13,21 @@ class Scraper(abc.ABC):
     session: requests.Session
 
     @staticmethod
-    @property
-    def dataset_name() -> str:
+    @abc.abstractmethod
+    def dataset_name(self) -> str:
         pass
 
     @staticmethod
-    @property
-    def deduplicate_mode() -> DeduplicateMode:
+    @abc.abstractmethod
+    def deduplicate_mode(self) -> DeduplicateMode:
         pass
 
     @staticmethod
-    @property
-    def dataset_format() -> str:
+    def dataset_format(self) -> str:
         return 'list-of-dicts'
 
+    @staticmethod
+    def requires_cfscrape() -> bool:
+        return False
+
     @abc.abstractmethod
     def scrape(self):
         pass
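The move from @property to @abc.abstractmethod matters for the subclasses: an abstract method may be overridden by a plain class attribute, because ABCs only check at class-creation time that the name is no longer abstract. A minimal illustration (Base/Impl are hypothetical names):

import abc

class Base(abc.ABC):
    @abc.abstractmethod
    def dataset_name(self) -> str: ...

class Impl(Base):
    dataset_name = 'games_played_playstation'  # plain attribute clears the abstract slot

print(Impl().dataset_name)  # instantiates fine and prints the string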
personal_data/fetchers/ffxiv_lodestone.py (new file, 58 lines)

@@ -0,0 +1,58 @@
+import secrets
+import functools
+import dataclasses
+import re
+import logging
+import bs4
+import datetime
+
+from personal_data.data import Scraper, DeduplicateMode
+import personal_data.html_util
+import personal_data.parse_util
+
+logger = logging.getLogger(__name__)
+
+URL_PROFILE_ACHIEVEMENTS = 'https://eu.finalfantasyxiv.com/lodestone/character/{character_id}/achievement/?page={page_idx}'
+URL_PROFILE_MINIONS = 'https://eu.finalfantasyxiv.com/lodestone/character/{character_id}/minion/'
+URL_PROFILE_MOUNTS = 'https://eu.finalfantasyxiv.com/lodestone/character/{character_id}/mount/'
+
+FORMAT_DATE_HEADER = '%d/%m/%YYYY'
+
+@dataclasses.dataclass(frozen = True)
+class LodestoneAchievementScraper(Scraper):
+    dataset_name = 'games_played_playstation'
+    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS
+
+    def scrape(self):
+        for page_idx in range(1, 13+1): # TODO: Automatically determine
+            url = URL_PROFILE_ACHIEVEMENTS.format(character_id = secrets.FFXIV_CHARACTER_ID, page_idx = page_idx)
+            response = self.session.get(url)
+            response.raise_for_status()
+
+            NOW = personal_data.parse_util.response_datetime(response)
+
+            # Parse data
+            soup = bs4.BeautifulSoup(response.content, 'lxml')
+            soup = personal_data.html_util.normalize_soup_slightly(soup, classes = False, scripts = False)
+
+            #print(soup)
+
+            for entry in soup.select('.ldst__achievement ul li.entry'):
+                time_acquired = str(entry.script.text).strip()
+                time_acquired = re.search(r'ldst_strftime\((\d+)\s*,', time_acquired).group(1)
+                time_acquired = int(time_acquired)
+                time_acquired = datetime.datetime.fromtimestamp(time_acquired)
+                trophy_desc = entry.select_one('.entry__activity__txt').get_text().strip()
+                trophy_name = re.match(r'^.*achievement "([^"]+)" earned!$', trophy_desc).group(1)
+                trophy_icon = entry.select_one('.entry__achievement__frame img')
+                trophy_icon = trophy_icon.src
+
+                yield {
+                    'game.name' : 'Final Fantasy XIV',
+                    'me.last_played_time': time_acquired,
+
+                    # Trophy Data
+                    'trophy.name': trophy_name,
+                    'trophy.desc': trophy_desc,
+                    'trophy.icon': trophy_icon,
+                }
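A few spots in the new file deserve a second look: FORMAT_DATE_HEADER = '%d/%m/%YYYY' expands to the year followed by a literal 'YYY' (strftime only knows %Y, and the constant is unused here anyway); dataset_name looks copy-pasted from the PlayStation fetcher; and trophy_icon.src is a child-tag lookup in BeautifulSoup, which returns None for an <img> tag, whereas trophy_icon['src'] reads the attribute. The timestamp extraction itself works as in this sketch, where the script snippet is an assumed shape inferred from the regex rather than a captured Lodestone page:

import re
import datetime

snippet = "document.write(ldst_strftime(1700000000, 'YMD'));"  # hypothetical markup
epoch = int(re.search(r'ldst_strftime\((\d+)\s*,', snippet).group(1))
print(datetime.datetime.fromtimestamp(epoch))  # local-time datetime of the unlock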
personal_data/fetchers/psnprofiles.py

@@ -14,7 +14,6 @@ logger = logging.getLogger(__name__)
 
 URL_PROFILE = 'https://psnprofiles.com/{psn_id}'
 
-FORMAT_DATE_HEADER = '%a, %d %b %Y %H:%M:%S GMT'
 FORMAT_DAY_MONTH_YEAR = '%d %B %Y'
 
 def game_psnprofiles_id_from_url(relative_url: str) -> int:

@@ -32,6 +31,10 @@ class PsnProfilesScraper(Scraper):
     dataset_name = 'games_played_playstation'
     deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS
 
+    @staticmethod
+    def requires_cfscrape() -> bool:
+        return True
+
     def scrape(self):
         # Request to get watch history
         logger.info('Getting Watchlist')

@@ -39,7 +42,7 @@ class PsnProfilesScraper(Scraper):
         response = self.session.get(url)
         response.raise_for_status()
 
-        NOW = datetime.datetime.strptime(response.headers['Date'], FORMAT_DATE_HEADER)
+        NOW = personal_data.parse_util.response_datetime(response)
 
         # Parse data
         soup = bs4.BeautifulSoup(response.content, 'lxml')
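The requires_cfscrape() override pairs with the new base-class hook: returning True presumably marks psnprofiles.com as Cloudflare-fronted, so main() now builds this scraper's session via cfscrape.create_scraper() instead of the cached session, and the FORMAT_DATE_HEADER constant it no longer needs moves to parse_util below.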
personal_data/parse_util.py

@@ -18,8 +18,13 @@ DATETIME_UNITS = {
     'years': datetime.timedelta(days = 365),
 }
 
+FORMAT_DATE_HEADER = '%a, %d %b %Y %H:%M:%S GMT'
+
 def parse_duration(text: str) -> datetime.timedelta:
     (num, unit) = text.split(' ')
     num = int(num)
     unit = DATETIME_UNITS[unit]
     return unit * num
+
+def response_datetime(response) -> datetime.datetime:
+    return datetime.datetime.strptime(response.headers['Date'], FORMAT_DATE_HEADER)
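The format string matches the fixed-form IMF date used in HTTP Date headers. A quick check against a typical value; note that the stdlib's email.utils.parsedate_to_datetime parses the same format and returns a timezone-aware result, which would sidestep the naive datetime this helper produces:

import datetime
from email.utils import parsedate_to_datetime

header = 'Wed, 21 Oct 2015 07:28:00 GMT'
print(datetime.datetime.strptime(header, '%a, %d %b %Y %H:%M:%S GMT'))  # naive
print(parsedate_to_datetime(header))  # timezone-aware (UTC)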