diff --git a/personal_data/__init__.py b/personal_data/__init__.py
index c12fde0..e6da075 100644
--- a/personal_data/__init__.py
+++ b/personal_data/__init__.py
@@ -25,6 +25,8 @@ from personal_data._version import __version__
 CSV_DIALECT = 'one_true_dialect'
 csv.register_dialect(CSV_DIALECT, lineterminator='\n', skipinitialspace=True)
 
+logging.basicConfig()
+logger.setLevel('INFO')
 
 def try_value(fn, s: str) -> any:
     try:
@@ -118,13 +120,12 @@ STANDARD_HEADERS = {
     'Accept-Encoding': 'gzip, deflate, br',
 }
 
+class CachedCfScrape(requests_cache.CacheMixin, cfscrape.CloudflareScraper):
+    pass
 
 def get_session(cookiejar, *, with_cfscrape: bool) -> requests.Session:
     assert isinstance(with_cfscrape, bool)
-    if with_cfscrape:
-        session = cfscrape.create_scraper()
-    else:
-        session = requests_cache.CachedSession('web_cache', cookies=cookiejar)
+    session = CachedCfScrape('web_cache', cookies=cookiejar)
     for cookie in cookiejar:
         session.cookies.set_cookie(cookie)
     return session
diff --git a/personal_data/fetchers/psnprofiles.py b/personal_data/fetchers/psnprofiles.py
index 7484feb..4421d0b 100644
--- a/personal_data/fetchers/psnprofiles.py
+++ b/personal_data/fetchers/psnprofiles.py
@@ -161,16 +161,20 @@ class PsnProfilesScraper(Scraper):
         soup = bs4.BeautifulSoup(response.content, 'lxml')
         soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False)
 
-        # Recent trophies.
-        soup_tropies = soup.select('#content table.zebra tr.completed')
-        assert len(soup_tropies) > 0, url
-        for row in soup_tropies:
-            print(row)
+        # Remove redundant
+        for redundant in soup.select('.wide-ad'):
+            redundant.extract()
+        for redundant in soup.select('div.col-xs-4'):
+            redundant.extract()
 
+        # Recent trophies.
+        soup_tropies = soup.select('#content.page > .row > div.col-xs div.box table.zebra tr.completed')
+        for row in soup_tropies:
             cells = row.find_all('td')
-            print(cells)
 
             trophy_name_a = cells[1].a
+            if trophy_name_a is None:
+                continue
             trophy_name = trophy_name_a.get_text().strip()
             trophy_name_a.extract()
             trophy_desc = cells[1].get_text().strip()