From 119b380f8abac15fe1e92e7c30da8f738e739c2c Mon Sep 17 00:00:00 2001
From: Jon Michael Aanes
Date: Mon, 16 Jun 2025 23:24:09 +0200
Subject: [PATCH] Use new improved Cloudflare avoider

---
 obsidian_import/__init__.py |  8 +++---
 personal_data/__main__.py   |  6 ++---
 personal_data/main.py       | 49 ++++++++++++++++++++-----------------
 3 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/obsidian_import/__init__.py b/obsidian_import/__init__.py
index 99628d3..b417b75 100644
--- a/obsidian_import/__init__.py
+++ b/obsidian_import/__init__.py
@@ -5,7 +5,7 @@ Sub-module for importing time-based data into Obsidian.
 
 import dataclasses
 import datetime
-from collections.abc import Iterable, Iterator
+from collections.abc import Iterator
 from logging import getLogger
 from pathlib import Path
 from typing import Any
@@ -104,9 +104,9 @@ def import_workout_csv(vault: ObsidianVault, rows: Rows) -> int:
     return num_updated
 
 
-def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:
-    MINIMUM_STEPS = 300
+MINIMUM_BELIEVABLE_STEP_COUNT = 300
 
+def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:
     num_updated = 0
 
     rows_per_date = {}
@@ -121,7 +121,7 @@ def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:
     }
 
     for date, steps in steps_per_date.items():
-        if steps < MINIMUM_STEPS:
+        if steps < MINIMUM_BELIEVABLE_STEP_COUNT:
             continue
         was_updated = vault.add_statistic(date, 'Steps', steps)
         if was_updated:
diff --git a/personal_data/__main__.py b/personal_data/__main__.py
index 208da55..564712c 100644
--- a/personal_data/__main__.py
+++ b/personal_data/__main__.py
@@ -1,6 +1,9 @@
 import argparse
 import logging
 
+logging.basicConfig()
+logging.getLogger('personal_data').setLevel('INFO')
+
 import personal_data.main
 from personal_data.notification import NotificationType
 
@@ -37,9 +40,6 @@ def parse_arguments():
 
 
 def main():
-    logging.basicConfig()
-    logging.getLogger('personal_data').setLevel('INFO')
-
     args = parse_arguments()
     scraper_filter = frozenset(args.fetchers)
 
diff --git a/personal_data/main.py b/personal_data/main.py
index 6006323..f3f330e 100644
--- a/personal_data/main.py
+++ b/personal_data/main.py
@@ -12,10 +12,10 @@
 from . import data, fetchers, notification, util
 logger = logging.getLogger(__name__)
 try:
-    import cfscrape
+    import cloudscraper
 except ImportError:
-    cfscrape = None
-    logger.exception('cfscrape not installed: Certain fetchers might not work')
+    cloudscraper = None
+    logger.exception('cloudscraper not installed: Certain fetchers might not work')
 
 try:
     import browser_cookie3
@@ -26,11 +26,6 @@ except ImportError:
 
 OUTPUT_PATH = Path('./output')
 
-logging.basicConfig(
-    format='%(asctime)s %(levelname)s %(module)s:%(lineno)d - %(message)s',
-)
-logger.setLevel('INFO')
-
 STANDARD_HEADERS = {
     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0',
 }
@@ -39,9 +34,9 @@ STANDARD_HEADERS = {
 
 
-if cfscrape:
+if cloudscraper:
 
-    class CachedCfScrape(requests_cache.CacheMixin, cfscrape.CloudflareScraper):
+    class CachedCfScrape(requests_cache.CacheMixin, cloudscraper.CloudScraper):
         pass
 
 
 
@@ -54,13 +49,21 @@ def get_session(
     cookiejar,
     *,
     with_cfscrape: bool,
     ignore_cache: bool,
 ) -> requests.Session:
-    if with_cfscrape and cfscrape:
-        session_class = CachedCfScrape
-        if ignore_cache:
-            logger.warning('HTTP cache disabled')
-            return cfscrape.create_scraper()
+    session_class = requests_cache.CachedSession
+    if with_cfscrape:
+        if cloudscraper:
+            session_class = CachedCfScrape
+
+            if ignore_cache:
+                logger.warning('HTTP cache disabled')
+                return cloudscraper.create_scraper(
+                    interpreter='js2py',
+                    delay=5,
+                    debug=False,
+                )
+        else:
+            logger.error('Expected cloudscraper, but not defined!')
     else:
-        session_class = requests_cache.CachedSession
         if ignore_cache:
             logger.warning('HTTP cache disabled')
             return requests.Session()
@@ -100,6 +103,7 @@
         browser_cookie3.firefox(
             '/home/jmaa/.cachy/mbui5xg7.default-release/cookies.sqlite',
         )
+        logger.warning('Cookiejar has %s cookies', len(cookiejar))
         if len(cookiejar) > 10:
             return cookiejar
     logger.warning('No cookiejar is used')
@@ -114,23 +118,23 @@ def main(
     notification_types: frozenset[notification.NotificationType],
 ) -> None:
     cookiejar = get_cookiejar(use_cookiejar)
-    logger.warning('Cookiejar has %s cookies', len(cookiejar))
 
     if len(notification_types) == 0:
         logger.info('No notifications enabled: Notifications will not be sent!')
 
     for scraper_cls in available_scrapers():
+        if scraper_cls.__name__ not in scraper_filter:
+            continue
         session = get_session(
-            cookiejar,
+            cookiejar=cookiejar,
             with_cfscrape=scraper_cls.requires_cfscrape(),
             ignore_cache=ignore_cache,
         )
         scraper = scraper_cls(session)
-        if scraper_cls.__name__ not in scraper_filter:
-            continue
         logger.info(
-            'Running %s, appending to "%s"',
+            'Running %s (%s), appending to "%s"',
             scraper_cls.__name__,
+            type(session).__name__,
             scraper.dataset_name,
         )
         result_rows = []
@@ -137,9 +141,10 @@ def main(
         try:
             for result in scraper.scrape():
                 result_rows.append(result)
                 del result
-        except requests.exceptions.HTTPError:
+        except requests.exceptions.HTTPError as e:
             logger.exception('Failed in running %s', scraper_cls.__name__)
+            logger.error('User-Agent: %s', e.request.headers['user-agent'])
             continue
         status = util.extend_csv_file(
             OUTPUT_PATH / f'{scraper.dataset_name}.csv',
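The heart of the patch is the CachedCfScrape class: requests-cache's CacheMixin composed with cloudscraper's CloudScraper yields a single requests.Session subclass that both solves Cloudflare's JavaScript challenge and transparently caches responses. Below is a minimal, self-contained sketch of that same composition, assuming the cloudscraper and requests-cache packages are installed; the class name and cache name are illustrative, not taken from the repository:

    import cloudscraper
    import requests_cache

    # CacheMixin is listed first so that, in method resolution order, the
    # caching layer wraps CloudScraper's challenge-solving request handling.
    class CachedCloudflareSession(requests_cache.CacheMixin, cloudscraper.CloudScraper):
        """A session that clears Cloudflare checks and caches what it fetches."""

    session = CachedCloudflareSession('demo_cache')  # illustrative cache name
    first = session.get('https://example.com')
    again = session.get('https://example.com')
    print(first.from_cache, again.from_cache)  # expected: False True

On the ignore_cache path the patch instead returns a bare cloudscraper.create_scraper(...) with no caching layer; interpreter='js2py' selects the JavaScript engine used to solve the challenge, and delay=5 overrides how long the scraper waits before submitting the challenge answer.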