
Use new improved cloudflare avoider
Some checks failed
Run Python tests (through Pytest) / Test (push) Failing after 28s
Verify Python project can be installed, loaded and have version checked / Test (push) Failing after 26s

This commit is contained in:
Jon Michael Aanes 2025-06-16 23:24:09 +02:00
parent d945fb81fb
commit 119b380f8a
3 changed files with 34 additions and 29 deletions

View File

@@ -5,7 +5,7 @@ Sub-module for importing time-based data into Obsidian.
 import dataclasses
 import datetime
-from collections.abc import Iterable, Iterator
+from collections.abc import Iterator
 from logging import getLogger
 from pathlib import Path
 from typing import Any
@@ -104,9 +104,9 @@ def import_workout_csv(vault: ObsidianVault, rows: Rows) -> int:
     return num_updated


-def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:
-    MINIMUM_STEPS = 300
+MINIMUM_BELIEVABLE_STEP_COUNT = 300
+
+
+def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:
     num_updated = 0
     rows_per_date = {}
@@ -121,7 +121,7 @@ def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:
         }

     for date, steps in steps_per_date.items():
-        if steps < MINIMUM_STEPS:
+        if steps < MINIMUM_BELIEVABLE_STEP_COUNT:
             continue
         was_updated = vault.add_statistic(date, 'Steps', steps)
         if was_updated:
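
The pattern behind this change, as a self-contained sketch: step counts are summed per calendar date, and any day whose total falls below MINIMUM_BELIEVABLE_STEP_COUNT is skipped as sensor noise. The row shape and the vault object here are assumptions for illustration, not the repository's actual types.

import datetime

MINIMUM_BELIEVABLE_STEP_COUNT = 300

def steps_per_date(rows: list[dict]) -> dict[datetime.date, int]:
    # Assumed row shape: {'Start': datetime.datetime, 'Steps': int}.
    totals: dict[datetime.date, int] = {}
    for row in rows:
        date = row['Start'].date()
        totals[date] = totals.get(date, 0) + row['Steps']
    return totals

def import_step_counts(vault, rows: list[dict]) -> int:
    num_updated = 0
    for date, steps in steps_per_date(rows).items():
        if steps < MINIMUM_BELIEVABLE_STEP_COUNT:
            continue  # implausibly low totals are treated as missing data
        # add_statistic mirrors ObsidianVault.add_statistic from the diff; a truthy return marks an update.
        if vault.add_statistic(date, 'Steps', steps):
            num_updated += 1
    return num_updated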

View File

@@ -1,6 +1,9 @@
 import argparse
 import logging

+logging.basicConfig()
+logging.getLogger('personal_data').setLevel('INFO')
+
 import personal_data.main
 from personal_data.notification import NotificationType
@@ -37,9 +40,6 @@ def parse_arguments():
 def main():
-    logging.basicConfig()
-    logging.getLogger('personal_data').setLevel('INFO')
-
     args = parse_arguments()
     scraper_filter = frozenset(args.fetchers)

View File

@@ -12,10 +12,10 @@ from . import data, fetchers, notification, util
 logger = logging.getLogger(__name__)

 try:
-    import cfscrape
+    import cloudscraper
 except ImportError:
-    cfscrape = None
-    logger.exception('cfscrape not installed: Certain fetchers might not work')
+    cloudscraper = None
+    logger.exception('cloudscraper not installed: Certain fetchers might not work')

 try:
     import browser_cookie3
@@ -26,11 +26,6 @@ except ImportError:
 OUTPUT_PATH = Path('./output')

-logging.basicConfig(
-    format='%(asctime)s %(levelname)s %(module)s:%(lineno)d - %(message)s',
-)
-logger.setLevel('INFO')
-
 STANDARD_HEADERS = {
     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0',
@@ -39,9 +34,9 @@ STANDARD_HEADERS = {
 }

-if cfscrape:
+if cloudscraper:

-    class CachedCfScrape(requests_cache.CacheMixin, cfscrape.CloudflareScraper):
+    class CachedCfScrape(requests_cache.CacheMixin, cloudscraper.CloudScraper):
         pass
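
A standalone sketch of the class defined above: requests_cache's CacheMixin is stacked onto cloudscraper's CloudScraper (itself a requests.Session subclass), so a single session both solves Cloudflare challenges and caches responses. The cache name and expiry are illustrative values, not taken from this repository.

import cloudscraper
import requests_cache

class CachedCloudflareSession(requests_cache.CacheMixin, cloudscraper.CloudScraper):
    # Caching wraps the request machinery; CloudScraper handles the Cloudflare challenge.
    pass

session = CachedCloudflareSession('demo_cache', expire_after=3600)  # illustrative cache settings
response = session.get('https://example.org')  # illustrative URL
print(response.status_code, getattr(response, 'from_cache', False))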
@@ -54,13 +49,21 @@ def get_session(
     with_cfscrape: bool,
     ignore_cache: bool,
 ) -> requests.Session:
-    if with_cfscrape and cfscrape:
-        session_class = CachedCfScrape
-        if ignore_cache:
-            logger.warning('HTTP cache disabled')
-            return cfscrape.create_scraper()
+    session_class = requests_cache.CachedSession
+    if with_cfscrape:
+        if cloudscraper:
+            session_class = CachedCfScrape
+            if ignore_cache:
+                logger.warning('HTTP cache disabled')
+                return cloudscraper.create_scraper(
+                    interpreter='js2py',
+                    delay=5,
+                    debug=False,
+                )
+        else:
+            logger.error('Expected cloudscraper, but not defined!')
     else:
-        session_class = requests_cache.CachedSession
         if ignore_cache:
             logger.warning('HTTP cache disabled')
             return requests.Session()
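
For context on the create_scraper call introduced above, a minimal usage sketch with an illustrative URL: interpreter selects the JavaScript engine cloudscraper uses to solve the challenge, delay overrides how long it waits before submitting the answer, and debug toggles verbose challenge output.

import cloudscraper

scraper = cloudscraper.create_scraper(
    interpreter='js2py',  # pure-Python JS engine, no external Node.js dependency
    delay=5,              # seconds to wait before answering the challenge
    debug=False,
)
response = scraper.get('https://example.org')  # illustrative URL
print(response.status_code)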
@@ -100,6 +103,7 @@ def get_cookiejar(use_cookiejar: bool):
             browser_cookie3.firefox(
                 '/home/jmaa/.cachy/mbui5xg7.default-release/cookies.sqlite',
             )
+        logger.warning('Cookiejar has %s cookies', len(cookiejar))
         if len(cookiejar) > 10:
             return cookiejar
     logger.warning('No cookiejar is used')
@@ -114,23 +118,23 @@ def main(
     notification_types: frozenset[notification.NotificationType],
 ) -> None:
     cookiejar = get_cookiejar(use_cookiejar)
-    logger.warning('Cookiejar has %s cookies', len(cookiejar))
     if len(notification_types) == 0:
         logger.info('No notifications enabled: Notifications will not be sent!')

     for scraper_cls in available_scrapers():
+        if scraper_cls.__name__ not in scraper_filter:
+            continue
         session = get_session(
-            cookiejar,
+            cookiejar=cookiejar,
             with_cfscrape=scraper_cls.requires_cfscrape(),
             ignore_cache=ignore_cache,
         )
         scraper = scraper_cls(session)
-        if scraper_cls.__name__ not in scraper_filter:
-            continue
         logger.info(
-            'Running %s, appending to "%s"',
+            'Running %s (%s), appending to "%s"',
             scraper_cls.__name__,
+            type(session).__name__,
             scraper.dataset_name,
         )
         result_rows = []
@@ -138,8 +142,9 @@ def main(
             for result in scraper.scrape():
                 result_rows.append(result)
                 del result
-        except requests.exceptions.HTTPError:
+        except requests.exceptions.HTTPError as e:
             logger.exception('Failed in running %s', scraper_cls.__name__)
+            logger.error('User-Agent: %s', e.request.headers['user-agent'])
             continue
         status = util.extend_csv_file(
             OUTPUT_PATH / f'{scraper.dataset_name}.csv',