Use new improved cloudflare avoider
parent d945fb81fb
commit 119b380f8a
@@ -5,7 +5,7 @@ Sub-module for importing time-based data into Obsidian.
 
 import dataclasses
 import datetime
-from collections.abc import Iterable, Iterator
+from collections.abc import Iterator
 from logging import getLogger
 from pathlib import Path
 from typing import Any
@@ -104,9 +104,9 @@ def import_workout_csv(vault: ObsidianVault, rows: Rows) -> int:
     return num_updated
 
 
-def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:
-    MINIMUM_STEPS = 300
+MINIMUM_BELIEVABLE_STEP_COUNT = 300
 
+def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:
     num_updated = 0
 
     rows_per_date = {}
@@ -121,7 +121,7 @@ def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:
     }
 
     for date, steps in steps_per_date.items():
-        if steps < MINIMUM_STEPS:
+        if steps < MINIMUM_BELIEVABLE_STEP_COUNT:
            continue
         was_updated = vault.add_statistic(date, 'Steps', steps)
         if was_updated:
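Note: the loop above skips implausibly low readings. A hypothetical shape of the data it iterates over (dates and counts are illustrative only, not taken from this repository):

import datetime

# Illustrative only: steps_per_date maps a date to its summed step count.
steps_per_date = {
    datetime.date(2024, 5, 1): 120,   # below MINIMUM_BELIEVABLE_STEP_COUNT, skipped
    datetime.date(2024, 5, 2): 8430,  # written via vault.add_statistic(date, 'Steps', steps)
}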
@@ -1,6 +1,9 @@
 import argparse
 import logging
 
+logging.basicConfig()
+logging.getLogger('personal_data').setLevel('INFO')
+
 import personal_data.main
 from personal_data.notification import NotificationType
 
@@ -37,9 +40,6 @@ def parse_arguments():
 
 
 def main():
-    logging.basicConfig()
-    logging.getLogger('personal_data').setLevel('INFO')
-
     args = parse_arguments()
     scraper_filter = frozenset(args.fetchers)
 
@@ -12,10 +12,10 @@ from . import data, fetchers, notification, util
 logger = logging.getLogger(__name__)
 
 try:
-    import cfscrape
+    import cloudscraper
 except ImportError:
-    cfscrape = None
-    logger.exception('cfscrape not installed: Certain fetchers might not work')
+    cloudscraper = None
+    logger.exception('cloudscraper not installed: Certain fetchers might not work')
 
 try:
     import browser_cookie3
@@ -26,11 +26,6 @@ except ImportError:
 
 OUTPUT_PATH = Path('./output')
 
-logging.basicConfig(
-    format='%(asctime)s %(levelname)s %(module)s:%(lineno)d - %(message)s',
-)
-logger.setLevel('INFO')
-
 
 STANDARD_HEADERS = {
     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0',
@@ -39,9 +34,9 @@ STANDARD_HEADERS = {
 }
 
 
-if cfscrape:
+if cloudscraper:
 
-    class CachedCfScrape(requests_cache.CacheMixin, cfscrape.CloudflareScraper):
+    class CachedCfScrape(requests_cache.CacheMixin, cloudscraper.CloudScraper):
         pass
 
 
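Note: requests_cache.CacheMixin is meant to be mixed into any requests.Session subclass, so the new CachedCfScrape combines response caching with cloudscraper's Cloudflare handling. A minimal sketch of instantiating such a class; the cache name and expiry are assumptions, not taken from this diff:

import cloudscraper
import requests_cache

class CachedCfScrape(requests_cache.CacheMixin, cloudscraper.CloudScraper):
    pass

# Hypothetical setup: cache file name and expiry chosen purely for illustration.
session = CachedCfScrape('output/web_cache', expire_after=3600)
response = session.get('https://example.org/')  # cached and Cloudflare-aware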
@@ -54,13 +49,21 @@ def get_session(
     with_cfscrape: bool,
     ignore_cache: bool,
 ) -> requests.Session:
-    if with_cfscrape and cfscrape:
-        session_class = CachedCfScrape
-        if ignore_cache:
-            logger.warning('HTTP cache disabled')
-            return cfscrape.create_scraper()
+    session_class = requests_cache.CachedSession
+    if with_cfscrape:
+        if cloudscraper:
+            session_class = CachedCfScrape
+
+            if ignore_cache:
+                logger.warning('HTTP cache disabled')
+                return cloudscraper.create_scraper(
+                    interpreter='js2py',
+                    delay=5,
+                    debug=False,
+                )
+        else:
+            logger.error('Expected cloudscraper, but not defined!')
     else:
-        session_class = requests_cache.CachedSession
         if ignore_cache:
             logger.warning('HTTP cache disabled')
             return requests.Session()
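Note: the rework defaults session_class to requests_cache.CachedSession and only upgrades it to CachedCfScrape when cloudscraper imported successfully; with ignore_cache set, it now returns an uncached cloudscraper session configured with the js2py interpreter. A hedged usage sketch, based only on the signature and call site visible in this diff, with illustrative argument values:

# Illustrative call: the cookiejar may be empty, e.g. when get_cookiejar() finds nothing usable.
session = get_session(
    cookiejar=[],
    with_cfscrape=True,   # fetcher declares it needs Cloudflare evasion
    ignore_cache=False,   # keep the cached CachedCfScrape session class
)
response = session.get('https://example.org/')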
@@ -100,6 +103,7 @@ def get_cookiejar(use_cookiejar: bool):
         browser_cookie3.firefox(
             '/home/jmaa/.cachy/mbui5xg7.default-release/cookies.sqlite',
         )
+        logger.warning('Cookiejar has %s cookies', len(cookiejar))
         if len(cookiejar) > 10:
             return cookiejar
     logger.warning('No cookiejar is used')
@@ -114,23 +118,23 @@ def main(
     notification_types: frozenset[notification.NotificationType],
 ) -> None:
     cookiejar = get_cookiejar(use_cookiejar)
-    logger.warning('Cookiejar has %s cookies', len(cookiejar))
 
     if len(notification_types) == 0:
         logger.info('No notifications enabled: Notifications will not be sent!')
 
     for scraper_cls in available_scrapers():
+        if scraper_cls.__name__ not in scraper_filter:
+            continue
         session = get_session(
-            cookiejar,
+            cookiejar=cookiejar,
             with_cfscrape=scraper_cls.requires_cfscrape(),
             ignore_cache=ignore_cache,
         )
         scraper = scraper_cls(session)
-        if scraper_cls.__name__ not in scraper_filter:
-            continue
         logger.info(
-            'Running %s, appending to "%s"',
+            'Running %s (%s), appending to "%s"',
             scraper_cls.__name__,
+            type(session).__name__,
             scraper.dataset_name,
         )
         result_rows = []
@@ -138,8 +142,9 @@ def main(
             for result in scraper.scrape():
                 result_rows.append(result)
                 del result
-        except requests.exceptions.HTTPError:
+        except requests.exceptions.HTTPError as e:
             logger.exception('Failed in running %s', scraper_cls.__name__)
+            logger.error('User-Agent: %s', e.request.headers['user-agent'])
             continue
         status = util.extend_csv_file(
             OUTPUT_PATH / f'{scraper.dataset_name}.csv',
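Note: binding the exception (as e) exposes the originating request, so the User-Agent header that was actually sent can be logged when a fetcher fails. The same pattern in isolation; the URL is illustrative:

import requests

try:
    response = requests.get('https://example.org/protected')
    response.raise_for_status()
except requests.exceptions.HTTPError as e:
    # e.request is the PreparedRequest whose response failed.
    print('User-Agent sent:', e.request.headers.get('User-Agent'))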