From 6bdd778028fabd8fae304f4f78a697c40b0ac923 Mon Sep 17 00:00:00 2001 From: Jon Michael Aanes Date: Sun, 25 Aug 2024 21:18:55 +0200 Subject: [PATCH] Improved caching situation --- personal_data/fetchers/psnprofiles.py | 23 ++++++++++++++++------- personal_data/fetchers/tavex.py | 15 ++++++++++++--- personal_data/main.py | 3 ++- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/personal_data/fetchers/psnprofiles.py b/personal_data/fetchers/psnprofiles.py index f4fdc73..f191ae5 100644 --- a/personal_data/fetchers/psnprofiles.py +++ b/personal_data/fetchers/psnprofiles.py @@ -10,11 +10,13 @@ from personal_data import secrets from personal_data.data import DeduplicateMode, Scraper from .. import parse_util +import requests_util logger = logging.getLogger(__name__) -URL_PROFILE = 'https://psnprofiles.com/{psn_id}' -URL_USER_GAME_TROPHIES = 'https://psnprofiles.com/trophies/{game_id}/{psn_id}' +URL_API_ROOT = 'https://psnprofiles.com/' +URL_PROFILE = URL_API_ROOT + '{psn_id}' +URL_USER_GAME_TROPHIES = URL_API_ROOT + 'trophies/{game_id}/{psn_id}' def game_psnprofiles_id_from_url(relative_url: str) -> int: @@ -25,7 +27,6 @@ def game_psnprofiles_id_from_url(relative_url: str) -> int: MAX_GAME_ITERATIONS = 10 - @dataclasses.dataclass(frozen=True) class PsnProfilesScraper(Scraper): dataset_name = 'games_played_playstation' @@ -36,19 +37,27 @@ class PsnProfilesScraper(Scraper): return True def scrape(self): - games_rows = list(self.scrape_games_overview()) + self._setup_cache() + games_rows = list(self._scrape_games_overview()) games_ids = {row['psnprofiles.game_id']: row['game.name'] for row in games_rows} yield from games_rows idx = 0 for game_id, game_name in games_ids.items(): - yield from self.scrape_game_trophies(game_id, game_name) + yield from self._scrape_game_trophies(game_id, game_name) del game_id idx += 1 if idx >= MAX_GAME_ITERATIONS: break - def scrape_games_overview(self) -> Iterator[dict]: + def _setup_cache(self): + requests_util.setup_limiter( 
+ self.session, + URL_API_ROOT, + per_minute = 5, + ) + + def _scrape_games_overview(self) -> Iterator[dict]: # Request to get overview logger.info('Getting Overview') url = URL_PROFILE.format(psn_id=secrets.PLAYSTATION_PSN_ID) @@ -136,7 +145,7 @@ class PsnProfilesScraper(Scraper): d['me.last_played_time'] = time_played yield d - def scrape_game_trophies( + def _scrape_game_trophies( self, psnprofiles_id: int, game_name: str, diff --git a/personal_data/fetchers/tavex.py b/personal_data/fetchers/tavex.py index 2713169..3b5ea50 100644 --- a/personal_data/fetchers/tavex.py +++ b/personal_data/fetchers/tavex.py @@ -12,10 +12,11 @@ import bs4 import personal_data.html_util import personal_data.parse_util from personal_data.data import DeduplicateMode, Scraper +import requests_util +URL_API_ROOT = 'https://tavex.dk/' def parse_dkk_price(dkk: str) -> Decimal: - print(dkk) if dkk.strip() == '-': return None return Decimal(dkk.removesuffix(' DKK').replace(',', '.')) @@ -33,7 +34,15 @@ class TavexScraperBase(Scraper): def page_url() -> str: pass + def _setup_cache(self): + requests_util.setup_limiter( + self.session, + URL_API_ROOT, + per_minute = 5, + ) + def scrape(self): + self._setup_cache() response = self.session.get(self.page_url()) response.raise_for_status() @@ -77,7 +86,7 @@ class TavexScraperGold(TavexScraperBase): @staticmethod def page_url() -> str: - return 'https://tavex.dk/guld/1oz-canadisk-maple-leaf-guldmont/' + return f'{URL_API_ROOT}guld/1oz-canadisk-maple-leaf-guldmont/' @dataclasses.dataclass(frozen=True) @@ -87,4 +96,4 @@ class TavexScraperSilver(TavexScraperBase): @staticmethod def page_url() -> str: - return 'https://tavex.dk/solv/1-oz-american-eagle-solvmont-tidligere-argange/' + return f'{URL_API_ROOT}solv/1-oz-american-eagle-solvmont-tidligere-argange/' diff --git a/personal_data/main.py b/personal_data/main.py index 28365ac..3cd2110 100644 --- a/personal_data/main.py +++ b/personal_data/main.py @@ -43,6 +43,7 @@ if cfscrape: class 
CachedCfScrape(requests_cache.CacheMixin, cfscrape.CloudflareScraper): pass +CACHE_EXPIRE_DEFAULT = datetime.timedelta(days=7) def get_session( cookiejar: Sequence, @@ -57,7 +58,7 @@ def get_session( return requests.Session() if cfscrape: session_class = CachedCfScrape - session = session_class(OUTPUT_PATH / 'web_cache', cookies=cookiejar, expire_after=datetime.timedelta(days=1)) + session = session_class(OUTPUT_PATH / 'web_cache', cookies=cookiejar, expire_after=CACHE_EXPIRE_DEFAULT) for cookie in cookiejar: session.cookies.set_cookie(cookie) return session