Improved parsing
This commit is contained in:
parent
87e9d4548c
commit
1dfc67d741
|
@ -25,6 +25,8 @@ from personal_data._version import __version__
|
|||
CSV_DIALECT = 'one_true_dialect'
|
||||
csv.register_dialect(CSV_DIALECT, lineterminator='\n', skipinitialspace=True)
|
||||
|
||||
logging.basicConfig()
|
||||
logger.setLevel('INFO')
|
||||
|
||||
def try_value(fn, s: str) -> any:
|
||||
try:
|
||||
|
@ -118,13 +120,12 @@ STANDARD_HEADERS = {
|
|||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
}
|
||||
|
||||
class CachedCfScrape(requests_cache.CacheMixin, cfscrape.CloudflareScraper):
|
||||
pass
|
||||
|
||||
def get_session(cookiejar, *, with_cfscrape: bool) -> requests.Session:
|
||||
assert isinstance(with_cfscrape, bool)
|
||||
if with_cfscrape:
|
||||
session = cfscrape.create_scraper()
|
||||
else:
|
||||
session = requests_cache.CachedSession('web_cache', cookies=cookiejar)
|
||||
session = CachedCfScrape('web_cache', cookies=cookiejar)
|
||||
for cookie in cookiejar:
|
||||
session.cookies.set_cookie(cookie)
|
||||
return session
|
||||
|
|
|
@ -161,16 +161,20 @@ class PsnProfilesScraper(Scraper):
|
|||
soup = bs4.BeautifulSoup(response.content, 'lxml')
|
||||
soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False)
|
||||
|
||||
# Recent trophies.
|
||||
soup_tropies = soup.select('#content table.zebra tr.completed')
|
||||
assert len(soup_tropies) > 0, url
|
||||
for row in soup_tropies:
|
||||
print(row)
|
||||
# Remove redundant
|
||||
for redundant in soup.select('.wide-ad'):
|
||||
redundant.extract()
|
||||
for redundant in soup.select('div.col-xs-4'):
|
||||
redundant.extract()
|
||||
|
||||
# Recent trophies.
|
||||
soup_tropies = soup.select('#content.page > .row > div.col-xs div.box table.zebra tr.completed')
|
||||
for row in soup_tropies:
|
||||
cells = row.find_all('td')
|
||||
print(cells)
|
||||
|
||||
trophy_name_a = cells[1].a
|
||||
if trophy_name_a is None:
|
||||
continue
|
||||
trophy_name = trophy_name_a.get_text().strip()
|
||||
trophy_name_a.extract()
|
||||
trophy_desc = cells[1].get_text().strip()
|
||||
|
|
Loading…
Reference in New Issue
Block a user