From 119b380f8abac15fe1e92e7c30da8f738e739c2c Mon Sep 17 00:00:00 2001
From: Jon Michael Aanes
Date: Mon, 16 Jun 2025 23:24:09 +0200
Subject: [PATCH] Use new improved Cloudflare avoider

---
 obsidian_import/__init__.py |  8 +++---
 personal_data/__main__.py   |  6 ++---
 personal_data/main.py       | 49 ++++++++++++++++++++-----------------
 3 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/obsidian_import/__init__.py b/obsidian_import/__init__.py
index 99628d3..b417b75 100644
--- a/obsidian_import/__init__.py
+++ b/obsidian_import/__init__.py
@@ -5,7 +5,7 @@ Sub-module for importing time-based data into Obsidian.
 
 import dataclasses
 import datetime
-from collections.abc import Iterable, Iterator
+from collections.abc import Iterator
 from logging import getLogger
 from pathlib import Path
 from typing import Any
@@ -104,9 +104,9 @@ def import_workout_csv(vault: ObsidianVault, rows: Rows) -> int:
     return num_updated
 
 
-def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:
-    MINIMUM_STEPS = 300
+MINIMUM_BELIEVABLE_STEP_COUNT = 300
 
+def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:
     num_updated = 0
 
     rows_per_date = {}
@@ -121,7 +121,7 @@ def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:
     }
 
     for date, steps in steps_per_date.items():
-        if steps < MINIMUM_STEPS:
+        if steps < MINIMUM_BELIEVABLE_STEP_COUNT:
             continue
         was_updated = vault.add_statistic(date, 'Steps', steps)
         if was_updated:
diff --git a/personal_data/__main__.py b/personal_data/__main__.py
index 208da55..564712c 100644
--- a/personal_data/__main__.py
+++ b/personal_data/__main__.py
@@ -1,6 +1,9 @@
 import argparse
 import logging
 
+logging.basicConfig()
+logging.getLogger('personal_data').setLevel('INFO')
+
 import personal_data.main
 from personal_data.notification import NotificationType
 
@@ -37,9 +40,6 @@ def parse_arguments():
 
 
 def main():
-    logging.basicConfig()
-    logging.getLogger('personal_data').setLevel('INFO')
-
     args = parse_arguments()
     scraper_filter = frozenset(args.fetchers)
 
diff --git a/personal_data/main.py b/personal_data/main.py
index 6006323..f3f330e 100644
--- a/personal_data/main.py
+++ b/personal_data/main.py
@@ -12,10 +12,10 @@
 from . import data, fetchers, notification, util
 logger = logging.getLogger(__name__)
 try:
-    import cfscrape
+    import cloudscraper
 except ImportError:
-    cfscrape = None
-    logger.exception('cfscrape not installed: Certain fetchers might not work')
+    cloudscraper = None
+    logger.exception('cloudscraper not installed: Certain fetchers might not work')
 
 try:
     import browser_cookie3
@@ -26,11 +26,6 @@ except ImportError:
 
 OUTPUT_PATH = Path('./output')
 
-logging.basicConfig(
-    format='%(asctime)s %(levelname)s %(module)s:%(lineno)d - %(message)s',
-)
-logger.setLevel('INFO')
-
 STANDARD_HEADERS = {
     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0',
 }
@@ -39,9 +34,9 @@ STANDARD_HEADERS = {
 
 
-if cfscrape:
+if cloudscraper:
 
-    class CachedCfScrape(requests_cache.CacheMixin, cfscrape.CloudflareScraper):
+    class CachedCfScrape(requests_cache.CacheMixin, cloudscraper.CloudScraper):
         pass
 
 
 
@@ -54,13 +49,21 @@ def get_session(
     cookiejar,
     *,
     with_cfscrape: bool,
     ignore_cache: bool,
 ) -> requests.Session:
-    if with_cfscrape and cfscrape:
-        session_class = CachedCfScrape
-        if ignore_cache:
-            logger.warning('HTTP cache disabled')
-            return cfscrape.create_scraper()
+    session_class = requests_cache.CachedSession
+    if with_cfscrape:
+        if cloudscraper:
+            session_class = CachedCfScrape
+
+            if ignore_cache:
+                logger.warning('HTTP cache disabled')
+                return cloudscraper.create_scraper(
+                    interpreter='js2py',
+                    delay=5,
+                    debug=False,
+                )
+        else:
+            logger.error('Expected cloudscraper, but not defined!')
     else:
-        session_class = requests_cache.CachedSession
         if ignore_cache:
             logger.warning('HTTP cache disabled')
             return requests.Session()
@@ -100,6 +103,7 @@
         browser_cookie3.firefox(
             '/home/jmaa/.cachy/mbui5xg7.default-release/cookies.sqlite',
         )
+        logger.warning('Cookiejar has %s cookies', len(cookiejar))
         if len(cookiejar) > 10:
             return cookiejar
     logger.warning('No cookiejar is used')
@@ -114,23 +118,23 @@ def main(
     notification_types: frozenset[notification.NotificationType],
 ) -> None:
     cookiejar = get_cookiejar(use_cookiejar)
-    logger.warning('Cookiejar has %s cookies', len(cookiejar))
 
     if len(notification_types) == 0:
         logger.info('No notifications enabled: Notifications will not be sent!')
 
     for scraper_cls in available_scrapers():
+        if scraper_cls.__name__ not in scraper_filter:
+            continue
         session = get_session(
-            cookiejar,
+            cookiejar=cookiejar,
             with_cfscrape=scraper_cls.requires_cfscrape(),
             ignore_cache=ignore_cache,
         )
         scraper = scraper_cls(session)
-        if scraper_cls.__name__ not in scraper_filter:
-            continue
         logger.info(
-            'Running %s, appending to "%s"',
+            'Running %s (%s), appending to "%s"',
             scraper_cls.__name__,
+            type(session).__name__,
             scraper.dataset_name,
         )
         result_rows = []
@@ -137,9 +141,10 @@ def main(
         try:
             for result in scraper.scrape():
                 result_rows.append(result)
                 del result
-        except requests.exceptions.HTTPError:
+        except requests.exceptions.HTTPError as e:
             logger.exception('Failed in running %s', scraper_cls.__name__)
+            logger.error('User-Agent: %s', e.request.headers['user-agent'])
             continue
         status = util.extend_csv_file(
             OUTPUT_PATH / f'{scraper.dataset_name}.csv',
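The heart of the patch is the CachedCfScrape class: requests-cache's CacheMixin composed with cloudscraper's CloudScraper yields a single requests.Session subclass that both solves Cloudflare's JavaScript challenge and transparently caches responses. Below is a minimal, self-contained sketch of that same composition, assuming the cloudscraper and requests-cache packages are installed; the class name and cache name are illustrative, not taken from the repository:

    import cloudscraper
    import requests_cache

    # CacheMixin is listed first so that, in method resolution order, the
    # caching layer wraps CloudScraper's challenge-solving request handling.
    class CachedCloudflareSession(requests_cache.CacheMixin, cloudscraper.CloudScraper):
        """A session that clears Cloudflare checks and caches what it fetches."""

    session = CachedCloudflareSession('demo_cache')  # illustrative cache name
    first = session.get('https://example.com')
    again = session.get('https://example.com')
    print(first.from_cache, again.from_cache)  # expected: False True

On the ignore_cache path the patch instead returns a bare cloudscraper.create_scraper(...) with no caching layer; interpreter='js2py' selects the JavaScript engine used to solve the challenge, and delay=5 overrides how long the scraper waits before submitting the challenge answer.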