The Lodestone
commit 8b9a878af9 (parent a03ba73dcd)
@@ -11,6 +11,7 @@ import cfscrape
 
 logger = logging.getLogger(__name__)
 
+import personal_data.fetchers.ffxiv_lodestone
 import personal_data.fetchers.playstation
 import personal_data.fetchers.crunchyroll
 import personal_data.fetchers.psnprofiles
@@ -95,19 +96,25 @@ STANDARD_HEADERS = {
     'Accept-Encoding': 'gzip, deflate, br',
 }
 
+def get_session(with_cfscrape: bool, cookiejar) -> requests.Session:
+    assert isinstance(with_cfscrape, bool)
+    if with_cfscrape:
+        session = cfscrape.create_scraper()
+    else:
+        session = requests_cache.CachedSession('web_cache', cookies = cookiejar)
+    for cookie in cookiejar:
+        session.cookies.set_cookie(cookie)
+    return session
+
 def main():
     cookiejar = browsercookie.firefox()
     logger.warning('Got cookiejar from firefox: %s cookies', len(cookiejar))
 
-    #session = requests_cache.CachedSession('web_cache', cookies = cookiejar)
-    session = cfscrape.create_scraper()
-    for cookie in cookiejar:
-        session.cookies.set_cookie(cookie)
-
     for scraper_cls in personal_data.data.Scraper.__subclasses__():
+        session = get_session(scraper_cls.requires_cfscrape(), cookiejar)
         scraper = scraper_cls(session)
+        logger.warning('Running %s, appending to "%s"', scraper_cls.__name__, scraper.dataset_name)
         del scraper_cls
-        logger.warning('Running scraper: %s', scraper.dataset_name)
         result_rows = list()
         for result in scraper.scrape():
             result_rows.append(result)
@@ -115,7 +122,7 @@ def main():
         extend_csv_file('output/'+scraper.dataset_name, result_rows,
                 deduplicate_mode = scraper.deduplicate_mode)
         logger.warning('Scraper done: %s', scraper.dataset_name)
-        del scraper
+        del scraper, session
 
 if __name__ == '__main__':
     main()
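The effect of this change: instead of one cfscrape session shared by every scraper, main() now asks each scraper class whether it needs Cloudflare handling and builds a matching session per scraper. A sketch of the dispatch (illustrative only, not part of the commit; relies on the imports already present in this module):

    cookiejar = browsercookie.firefox()
    cf_session = get_session(True, cookiejar)       # cfscrape.create_scraper(), with cookies copied in
    cached_session = get_session(False, cookiejar)  # requests_cache.CachedSession('web_cache')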
personal_data/data.py

@@ -13,22 +13,21 @@ class Scraper(abc.ABC):
     session: requests.Session
 
+    @staticmethod
     @property
     @abc.abstractmethod
-    def dataset_name(self) -> str:
+    def dataset_name() -> str:
         pass
 
+    @staticmethod
     @property
     @abc.abstractmethod
-    def deduplicate_mode(self) -> DeduplicateMode:
+    def deduplicate_mode() -> DeduplicateMode:
         pass
 
+    @staticmethod
     @property
-    def dataset_format(self) -> str:
+    def dataset_format() -> str:
         return 'list-of-dicts'
 
+    @staticmethod
+    def requires_cfscrape() -> bool:
+        return False
+
     @abc.abstractmethod
     def scrape(self):
         pass
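For orientation, the contract this base class now sets for fetchers can be sketched as follows (illustrative only, not part of the commit; the class name and dataset name are made up):

    import dataclasses
    from personal_data.data import Scraper, DeduplicateMode

    @dataclasses.dataclass(frozen = True)
    class ExampleScraper(Scraper):
        dataset_name = 'example_dataset'
        deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS

        # requires_cfscrape() is inherited and returns False, so main()
        # would hand this scraper the cached session rather than cfscrape.

        def scrape(self):
            yield {'example.key': 'example value'}  # one dict per CSV row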
personal_data/fetchers/ffxiv_lodestone.py (new file, 58 lines)
@@ -0,0 +1,58 @@
+import secrets  # local secrets.py (not the stdlib module); provides FFXIV_CHARACTER_ID
+import functools
+import dataclasses
+import re
+import logging
+import bs4
+import datetime
+
+from personal_data.data import Scraper, DeduplicateMode
+import personal_data.html_util
+import personal_data.parse_util
+
+logger = logging.getLogger(__name__)
+
+URL_PROFILE_ACHIEVEMENTS = 'https://eu.finalfantasyxiv.com/lodestone/character/{character_id}/achievement/?page={page_idx}'
+URL_PROFILE_MINIONS = 'https://eu.finalfantasyxiv.com/lodestone/character/{character_id}/minion/'
+URL_PROFILE_MOUNTS = 'https://eu.finalfantasyxiv.com/lodestone/character/{character_id}/mount/'
+
+FORMAT_DATE_HEADER = '%d/%m/%Y'
+
+@dataclasses.dataclass(frozen = True)
+class LodestoneAchievementScraper(Scraper):
+    dataset_name = 'games_played_playstation'  # appends to the same dataset/schema as the PlayStation fetchers
+    deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS
+
+    def scrape(self):
+        for page_idx in range(1, 13+1): # TODO: Automatically determine number of pages
+            url = URL_PROFILE_ACHIEVEMENTS.format(character_id = secrets.FFXIV_CHARACTER_ID, page_idx = page_idx)
+            response = self.session.get(url)
+            response.raise_for_status()
+
+            NOW = personal_data.parse_util.response_datetime(response)
+
+            # Parse data
+            soup = bs4.BeautifulSoup(response.content, 'lxml')
+            soup = personal_data.html_util.normalize_soup_slightly(soup, classes = False, scripts = False)
+
+            for entry in soup.select('.ldst__achievement ul li.entry'):
+                # The acquisition time is embedded in an inline ldst_strftime(<unix timestamp>, ...) script call.
+                time_acquired = str(entry.script.text).strip()
+                time_acquired = re.search(r'ldst_strftime\((\d+)\s*,', time_acquired).group(1)
+                time_acquired = int(time_acquired)
+                time_acquired = datetime.datetime.fromtimestamp(time_acquired)
+                trophy_desc = entry.select_one('.entry__activity__txt').get_text().strip()
+                trophy_name = re.match(r'^.*achievement "([^"]+)" earned!$', trophy_desc).group(1)
+                trophy_icon = entry.select_one('.entry__achievement__frame img')['src']
+
+                yield {
+                    'game.name': 'Final Fantasy XIV',
+                    'me.last_played_time': time_acquired,
+
+                    # Trophy Data
+                    'trophy.name': trophy_name,
+                    'trophy.desc': trophy_desc,
+                    'trophy.icon': trophy_icon,
+                }
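To make the timestamp handling above concrete: each achievement entry renders its date through an inline ldst_strftime(...) call, so the scraper regexes out the first argument and interprets it as a Unix timestamp. A self-contained sketch (the sample script body is invented, but has the shape the regex expects):

    import re
    import datetime

    sample = 'ldst_strftime(1617894000, "YMD")'  # hypothetical inline script content
    timestamp = int(re.search(r'ldst_strftime\((\d+)\s*,', sample).group(1))
    print(datetime.datetime.fromtimestamp(timestamp))  # local-time datetime for that Unix timestamp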
personal_data/fetchers/psnprofiles.py

@@ -14,7 +14,6 @@ logger = logging.getLogger(__name__)
 
 URL_PROFILE = 'https://psnprofiles.com/{psn_id}'
 
-FORMAT_DATE_HEADER = '%a, %d %b %Y %H:%M:%S GMT'
 FORMAT_DAY_MONTH_YEAR = '%d %B %Y'
 
 def game_psnprofiles_id_from_url(relative_url: str) -> int:

@@ -32,6 +31,10 @@ class PsnProfilesScraper(Scraper):
     dataset_name = 'games_played_playstation'
     deduplicate_mode = DeduplicateMode.BY_ALL_COLUMNS
 
+    @staticmethod
+    def requires_cfscrape() -> bool:
+        return True
+
     def scrape(self):
         # Request to get watch history
         logger.info('Getting Watchlist')

@@ -39,7 +42,7 @@ class PsnProfilesScraper(Scraper):
         response = self.session.get(url)
         response.raise_for_status()
 
-        NOW = datetime.datetime.strptime(response.headers['Date'], FORMAT_DATE_HEADER)
+        NOW = personal_data.parse_util.response_datetime(response)
 
         # Parse data
         soup = bs4.BeautifulSoup(response.content, 'lxml')
personal_data/parse_util.py

@@ -18,8 +18,13 @@ DATETIME_UNITS = {
     'years': datetime.timedelta(days = 365),
 }
 
+FORMAT_DATE_HEADER = '%a, %d %b %Y %H:%M:%S GMT'
+
 def parse_duration(text: str) -> datetime.timedelta:
     (num, unit) = text.split(' ')
     num = int(num)
     unit = DATETIME_UNITS[unit]
     return unit * num
+
+def response_datetime(response) -> datetime.datetime:
+    return datetime.datetime.strptime(response.headers['Date'], FORMAT_DATE_HEADER)
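Usage sketch for the two helpers (illustrative only, not part of the commit; assumes the module is importable as personal_data.parse_util and that a network connection is available):

    import requests
    import personal_data.parse_util as parse_util

    response = requests.get('https://example.org')
    print(parse_util.response_datetime(response))  # parsed from the HTTP Date response header
    print(parse_util.parse_duration('2 years'))    # datetime.timedelta(days = 730)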