Improved parsing
This commit is contained in:
parent
87e9d4548c
commit
1dfc67d741
|
@ -25,6 +25,8 @@ from personal_data._version import __version__
|
||||||
CSV_DIALECT = 'one_true_dialect'
|
CSV_DIALECT = 'one_true_dialect'
|
||||||
csv.register_dialect(CSV_DIALECT, lineterminator='\n', skipinitialspace=True)
|
csv.register_dialect(CSV_DIALECT, lineterminator='\n', skipinitialspace=True)
|
||||||
|
|
||||||
|
logging.basicConfig()
|
||||||
|
logger.setLevel('INFO')
|
||||||
|
|
||||||
def try_value(fn, s: str) -> any:
|
def try_value(fn, s: str) -> any:
|
||||||
try:
|
try:
|
||||||
|
@ -118,13 +120,12 @@ STANDARD_HEADERS = {
|
||||||
'Accept-Encoding': 'gzip, deflate, br',
|
'Accept-Encoding': 'gzip, deflate, br',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class CachedCfScrape(requests_cache.CacheMixin, cfscrape.CloudflareScraper):
|
||||||
|
pass
|
||||||
|
|
||||||
def get_session(cookiejar, *, with_cfscrape: bool) -> requests.Session:
|
def get_session(cookiejar, *, with_cfscrape: bool) -> requests.Session:
|
||||||
assert isinstance(with_cfscrape, bool)
|
assert isinstance(with_cfscrape, bool)
|
||||||
if with_cfscrape:
|
session = CachedCfScrape('web_cache', cookies=cookiejar)
|
||||||
session = cfscrape.create_scraper()
|
|
||||||
else:
|
|
||||||
session = requests_cache.CachedSession('web_cache', cookies=cookiejar)
|
|
||||||
for cookie in cookiejar:
|
for cookie in cookiejar:
|
||||||
session.cookies.set_cookie(cookie)
|
session.cookies.set_cookie(cookie)
|
||||||
return session
|
return session
|
||||||
|
|
|
@ -161,16 +161,20 @@ class PsnProfilesScraper(Scraper):
|
||||||
soup = bs4.BeautifulSoup(response.content, 'lxml')
|
soup = bs4.BeautifulSoup(response.content, 'lxml')
|
||||||
soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False)
|
soup = personal_data.html_util.normalize_soup_slightly(soup, classes=False)
|
||||||
|
|
||||||
# Recent trophies.
|
# Remove redundant
|
||||||
soup_tropies = soup.select('#content table.zebra tr.completed')
|
for redundant in soup.select('.wide-ad'):
|
||||||
assert len(soup_tropies) > 0, url
|
redundant.extract()
|
||||||
for row in soup_tropies:
|
for redundant in soup.select('div.col-xs-4'):
|
||||||
print(row)
|
redundant.extract()
|
||||||
|
|
||||||
|
# Recent trophies.
|
||||||
|
soup_tropies = soup.select('#content.page > .row > div.col-xs div.box table.zebra tr.completed')
|
||||||
|
for row in soup_tropies:
|
||||||
cells = row.find_all('td')
|
cells = row.find_all('td')
|
||||||
print(cells)
|
|
||||||
|
|
||||||
trophy_name_a = cells[1].a
|
trophy_name_a = cells[1].a
|
||||||
|
if trophy_name_a is None:
|
||||||
|
continue
|
||||||
trophy_name = trophy_name_a.get_text().strip()
|
trophy_name = trophy_name_a.get_text().strip()
|
||||||
trophy_name_a.extract()
|
trophy_name_a.extract()
|
||||||
trophy_desc = cells[1].get_text().strip()
|
trophy_desc = cells[1].get_text().strip()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user