Use new improved cloudflare avoider

parent d945fb81fb
commit 119b380f8a
@@ -5,7 +5,7 @@ Sub-module for importing time-based data into Obsidian.
 import dataclasses
 import datetime
-from collections.abc import Iterable, Iterator
+from collections.abc import Iterator
 from logging import getLogger
 from pathlib import Path
 from typing import Any
@@ -104,9 +104,9 @@ def import_workout_csv(vault: ObsidianVault, rows: Rows) -> int:
     return num_updated
 
 
-def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:
-    MINIMUM_STEPS = 300
+MINIMUM_BELIEVABLE_STEP_COUNT = 300
+
+
+def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:
     num_updated = 0
 
     rows_per_date = {}
@@ -121,7 +121,7 @@ def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:
     }
 
     for date, steps in steps_per_date.items():
-        if steps < MINIMUM_STEPS:
+        if steps < MINIMUM_BELIEVABLE_STEP_COUNT:
             continue
         was_updated = vault.add_statistic(date, 'Steps', steps)
         if was_updated:
@@ -1,6 +1,9 @@
 import argparse
 import logging
 
+logging.basicConfig()
+logging.getLogger('personal_data').setLevel('INFO')
+
 import personal_data.main
 from personal_data.notification import NotificationType
@@ -37,9 +40,6 @@ def parse_arguments():
 
 
 def main():
-    logging.basicConfig()
-    logging.getLogger('personal_data').setLevel('INFO')
-
     args = parse_arguments()
     scraper_filter = frozenset(args.fetchers)
 
@@ -12,10 +12,10 @@ from . import data, fetchers, notification, util
 logger = logging.getLogger(__name__)
 
 try:
-    import cfscrape
+    import cloudscraper
 except ImportError:
-    cfscrape = None
-    logger.exception('cfscrape not installed: Certain fetchers might not work')
+    cloudscraper = None
+    logger.exception('cloudscraper not installed: Certain fetchers might not work')
 
 try:
     import browser_cookie3
@@ -26,11 +26,6 @@ except ImportError:
 
 OUTPUT_PATH = Path('./output')
 
-logging.basicConfig(
-    format='%(asctime)s %(levelname)s %(module)s:%(lineno)d - %(message)s',
-)
-logger.setLevel('INFO')
-
 
 STANDARD_HEADERS = {
     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0',
@@ -39,9 +34,9 @@ STANDARD_HEADERS = {
 }
 
 
-if cfscrape:
+if cloudscraper:
 
-    class CachedCfScrape(requests_cache.CacheMixin, cfscrape.CloudflareScraper):
+    class CachedCfScrape(requests_cache.CacheMixin, cloudscraper.CloudScraper):
         pass
 
 
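Note on the class above: requests-cache is built to compose this way. Its CacheMixin can be stacked onto any requests.Session subclass, and cloudscraper's CloudScraper is one, so the combined class both solves Cloudflare challenges and caches the responses. A minimal standalone sketch of the same pattern; the cache name, expiry, and URL below are illustrative, not part of this commit:

import cloudscraper
import requests_cache


# Same composition as CachedCfScrape above: the caching mixin comes
# first in the MRO, wrapping the Cloudflare-solving session class.
class CachedCloudflareSession(requests_cache.CacheMixin, cloudscraper.CloudScraper):
    pass


# Hypothetical cache location; expire_after is in seconds.
session = CachedCloudflareSession('output/http_cache', expire_after=3600)
response = session.get('https://example.org/')  # placeholder URL
print(response.from_cache)  # True on repeat requests within the hour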
@@ -54,13 +49,21 @@ def get_session(
     with_cfscrape: bool,
     ignore_cache: bool,
 ) -> requests.Session:
-    if with_cfscrape and cfscrape:
-        session_class = CachedCfScrape
-        if ignore_cache:
-            logger.warning('HTTP cache disabled')
-            return cfscrape.create_scraper()
+    session_class = requests_cache.CachedSession
+    if with_cfscrape:
+        if cloudscraper:
+            session_class = CachedCfScrape
+            if ignore_cache:
+                logger.warning('HTTP cache disabled')
+                return cloudscraper.create_scraper(
+                    interpreter='js2py',
+                    delay=5,
+                    debug=False,
+                )
+        else:
+            logger.error('Expected cloudscraper, but not defined!')
+    else:
+        session_class = requests_cache.CachedSession
     if ignore_cache:
         logger.warning('HTTP cache disabled')
         return requests.Session()
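For context on the uncached branch above: create_scraper() is cloudscraper's session factory. interpreter='js2py' selects a pure-Python JavaScript engine, so no Node.js installation is needed to solve the challenge, and delay=5 gives the interstitial challenge page time to pass. A rough sketch under those assumptions, with a placeholder URL:

import cloudscraper

# Uncached fallback path from get_session(), shown in isolation.
scraper = cloudscraper.create_scraper(
    interpreter='js2py',  # pure-Python JS engine; no Node.js required
    delay=5,              # seconds to wait out the challenge page
)
print(scraper.get('https://example.org/').status_code)  # placeholder URL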
@@ -100,6 +103,7 @@ def get_cookiejar(use_cookiejar: bool):
         browser_cookie3.firefox(
             '/home/jmaa/.cachy/mbui5xg7.default-release/cookies.sqlite',
         )
+        logger.warning('Cookiejar has %s cookies', len(cookiejar))
         if len(cookiejar) > 10:
             return cookiejar
     logger.warning('No cookiejar is used')
@@ -114,23 +118,23 @@ def main(
     notification_types: frozenset[notification.NotificationType],
 ) -> None:
     cookiejar = get_cookiejar(use_cookiejar)
-    logger.warning('Cookiejar has %s cookies', len(cookiejar))
 
     if len(notification_types) == 0:
         logger.info('No notifications enabled: Notifications will not be sent!')
 
     for scraper_cls in available_scrapers():
+        if scraper_cls.__name__ not in scraper_filter:
+            continue
         session = get_session(
-            cookiejar,
+            cookiejar=cookiejar,
             with_cfscrape=scraper_cls.requires_cfscrape(),
             ignore_cache=ignore_cache,
         )
         scraper = scraper_cls(session)
-        if scraper_cls.__name__ not in scraper_filter:
-            continue
         logger.info(
-            'Running %s, appending to "%s"',
+            'Running %s (%s), appending to "%s"',
             scraper_cls.__name__,
+            type(session).__name__,
             scraper.dataset_name,
         )
         result_rows = []
@@ -138,8 +142,9 @@ def main(
             for result in scraper.scrape():
                 result_rows.append(result)
                 del result
-        except requests.exceptions.HTTPError:
+        except requests.exceptions.HTTPError as e:
             logger.exception('Failed in running %s', scraper_cls.__name__)
+            logger.error('User-Agent: %s', e.request.headers['user-agent'])
             continue
         status = util.extend_csv_file(
             OUTPUT_PATH / f'{scraper.dataset_name}.csv',
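The `as e` capture above works because requests attaches the originating PreparedRequest to every RequestException, so the User-Agent that the server rejected can be recovered from the exception itself. A minimal sketch of the same pattern, with a placeholder URL:

import requests

try:
    response = requests.get('https://example.org/protected')  # placeholder URL
    response.raise_for_status()  # raises HTTPError on 4xx/5xx responses
except requests.exceptions.HTTPError as e:
    # e.request is the PreparedRequest that triggered the error; its
    # headers dict is case-insensitive, matching the lookup in the diff.
    print('User-Agent:', e.request.headers.get('user-agent'))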