1
0

Improved parsing

This commit is contained in:
Jon Michael Aanes 2024-04-06 18:56:12 +02:00
parent 87e9d4548c
commit 1dfc67d741
Signed by: Jmaa
SSH Key Fingerprint: SHA256:Ab0GfHGCblESJx7JRE4fj4bFy/KRpeLhi41y4pF3sNA
2 changed files with 15 additions and 10 deletions

View File

@ -25,6 +25,8 @@ from personal_data._version import __version__
CSV_DIALECT = 'one_true_dialect'
csv.register_dialect(CSV_DIALECT, lineterminator='\n', skipinitialspace=True)
logging.basicConfig()
logger.setLevel('INFO')
def try_value(fn, s: str) -> any:
    try:
@ -118,13 +120,12 @@ STANDARD_HEADERS = {
'Accept-Encoding': 'gzip, deflate, br',
}
class CachedCfScrape(requests_cache.CacheMixin, cfscrape.CloudflareScraper):
pass
def get_session(cookiejar, *, with_cfscrape: bool) -> requests.Session:
    assert isinstance(with_cfscrape, bool)
if with_cfscrape: session = CachedCfScrape('web_cache', cookies=cookiejar)
session = cfscrape.create_scraper()
else:
session = requests_cache.CachedSession('web_cache', cookies=cookiejar)
    for cookie in cookiejar:
        session.cookies.set_cookie(cookie)
    return session

View File

@ -161,16 +161,20 @@ class PsnProfilesScraper(Scraper):
soup = bs4.BeautifulSoup(response.content, 'lxml')
soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False)
# Recent trophies. # Remove redundant
soup_tropies = soup.select('#content table.zebra tr.completed') for redundant in soup.select('.wide-ad'):
assert len(soup_tropies) > 0, url redundant.extract()
for row in soup_tropies: for redundant in soup.select('div.col-xs-4'):
print(row) redundant.extract()
# Recent trophies.
soup_tropies = soup.select('#content.page > .row > div.col-xs div.box table.zebra tr.completed')
for row in soup_tropies:
cells = row.find_all('td')
print(cells)
trophy_name_a = cells[1].a
if trophy_name_a is None:
continue
trophy_name = trophy_name_a.get_text().strip()
trophy_name_a.extract()
trophy_desc = cells[1].get_text().strip()