1
0

Compare commits

...

10 Commits

6 changed files with 102 additions and 35 deletions

View File

@ -261,17 +261,22 @@ PATH_WATCHED = Path('output/show_episodes_watched.csv')
PATH_PLAYED = Path('output/games_played.csv') PATH_PLAYED = Path('output/games_played.csv')
PATH_WORKOUT = Path('/home/jmaa/Notes/workout.csv') PATH_WORKOUT = Path('/home/jmaa/Notes/workout.csv')
PATH_STEP_COUNTS = Path( PATH_STEP_COUNTS = Path(
'/home/jmaa/personal-archive/misc-data/step_counts_2023-07-26_to_2024-09-21.csv', '/home/jmaa/Notes/Rawbackupdata/Steps/exportStepCount_2025-03-15_22-58-20',
) )
PATH_STEPMANIA = Path('output/stepmania.csv') PATH_STEPMANIA = Path('output/stepmania.csv')
IMPORTERS = [ IMPORTERS = [
{'path': PATH_WORKOUT, 'import_rows': import_workout_csv}, {'path': PATH_WORKOUT, 'standard_variant': True, 'import_rows': import_workout_csv},
{'path': PATH_STEP_COUNTS, 'import_rows': import_step_counts_csv}, {'path': PATH_STEP_COUNTS, 'import_rows': import_step_counts_csv},
{'path': PATH_STEPMANIA, 'import_rows': import_stepmania_steps_csv}, {
'path': PATH_STEPMANIA,
'standard_variant': True,
'import_rows': import_stepmania_steps_csv,
},
{ {
'path': PATH_PLAYED, 'path': PATH_PLAYED,
'standard_variant': True,
'import_rows': lambda vault, rows: import_activity_sample_csv( 'import_rows': lambda vault, rows: import_activity_sample_csv(
vault, vault,
rows, rows,
@ -281,6 +286,7 @@ IMPORTERS = [
}, },
{ {
'path': PATH_WATCHED, 'path': PATH_WATCHED,
'standard_variant': True,
'import_rows': lambda vault, rows: import_activity_sample_csv( 'import_rows': lambda vault, rows: import_activity_sample_csv(
vault, vault,
rows, rows,
@ -301,7 +307,9 @@ def import_data(obsidian_path: Path, dry_run=True):
import_def['path'], import_def['path'],
) )
continue continue
rows = load_csv_file(import_def['path']) rows = load_csv_file(
import_def['path'], sniff=not import_def.get('standard_variant'),
)
logger.info('Loaded CSV with %d lines', len(rows)) logger.info('Loaded CSV with %d lines', len(rows))
num_files_updated = import_def['import_rows'](vault, rows) num_files_updated = import_def['import_rows'](vault, rows)
logger.info('Updated %d files', num_files_updated) logger.info('Updated %d files', num_files_updated)

View File

@ -2,6 +2,7 @@ import csv
import dataclasses import dataclasses
import datetime import datetime
import decimal import decimal
import logging
import typing import typing
import urllib.parse import urllib.parse
from collections.abc import Callable from collections.abc import Callable
@ -11,6 +12,8 @@ from typing import Any
from frozendict import frozendict from frozendict import frozendict
logger = logging.getLogger(__name__)
CSV_DIALECT = 'one_true_dialect' CSV_DIALECT = 'one_true_dialect'
csv.register_dialect(CSV_DIALECT, lineterminator='\n', skipinitialspace=True) csv.register_dialect(CSV_DIALECT, lineterminator='\n', skipinitialspace=True)
@ -86,10 +89,12 @@ def load_csv_file(csv_file: Path, sniff=False) -> list[frozendict[str, typing.An
dicts: list[frozendict] = [] dicts: list[frozendict] = []
with open(csv_file) as csvfile: with open(csv_file) as csvfile:
if sniff: if sniff:
dialect = csv.Sniffer().sniff(csvfile.read(1024)) logger.warning('Sniffing CSV variant: %s', csv_file)
dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=',;')
csvfile.seek(0) csvfile.seek(0)
else: else:
dialect = CSV_DIALECT dialect = CSV_DIALECT
logger.warning('Loading CSV file: %s', csv_file)
reader = csv.DictReader(csvfile, dialect=dialect) reader = csv.DictReader(csvfile, dialect=dialect)
for row in reader: for row in reader:
for k in list(row.keys()): for k in list(row.keys()):

View File

@ -4,9 +4,9 @@ from collections.abc import Iterator, Mapping
from typing import Any from typing import Any
from personal_data.data import DeduplicateMode, Scraper from personal_data.data import DeduplicateMode, Scraper
from ..util import safe_del
from .. import secrets from .. import secrets
from ..util import safe_del
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)

View File

@ -1,41 +1,87 @@
import csv import datetime
import json import json
import logging import logging
import subprocess import subprocess
from dataclasses import dataclass from dataclasses import dataclass
from typing import ClassVar
from personal_data.data import DeduplicateMode, Scraper from personal_data.data import DeduplicateMode, Scraper
from ..util import safe_del from ..util import safe_del
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
PLAYLIST_ID = 'PLAfDVJvDKCvOMvfoTL7eW8GkWNJwd90eV' PLAYLIST_ID = 'PLAfDVJvDKCvOMvfoTL7eW8GkWNJwd90eV'
#PLAYLIST_ID='LL'
@dataclass(frozen=True)
class YoutubeFavoritesScraper(Scraper):
dataset_name: str = 'youtube_favorites'
deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
deduplicate_ignore_columns = []
def scrape(self) -> list[dict]: def scrape(watch_history: bool) -> list[dict[str, str]]:
"""Use yt-dlp to fetch the list of favorited videos. This is a placeholder for invoking yt-dlp and parsing its output.""" """Use yt-dlp to fetch the list of favorited videos. This is a placeholder for invoking yt-dlp and parsing its output."""
result = subprocess.run( if watch_history:
[ url = 'https://www.youtube.com/feed/history'
ytdlp_args = [
'yt-dlp',
url,
'--dump-json',
'--cookies-from-browser',
'firefox:/home/jmaa/.cachy/mbui5xg7.default-release',
]
else:
url = f'https://www.youtube.com/playlist?list={PLAYLIST_ID}'
ytdlp_args = [
'yt-dlp', 'yt-dlp',
'--flat-playlist', '--flat-playlist',
'--dump-json', '--dump-json',
f'https://www.youtube.com/playlist?list={PLAYLIST_ID}', url,
], ]
result = subprocess.run(
ytdlp_args,
capture_output=True, capture_output=True,
text=True, text=True,
) )
if result.returncode != 0: if result.returncode != 0:
raise RuntimeError(f'Non-zero returncode in command: {result.returncode}\n\n{result.stderr}') message = (
'Non-zero returncode in command: '
+ str(result.returncode)
+ '\n\n'
+ result.stderr
)
raise RuntimeError(message)
output = []
for line in result.stdout.splitlines(): for line in result.stdout.splitlines():
data = json.loads(line) data = json.loads(line)
if watch_history:
if data.get('thumbnails'):
data['thumbnail'] = data['thumbnails'][-1]['url']
if data.get('timestamp'):
data['watch_datetime'] = datetime.datetime.fromtimestamp(
int(data['timestamp']),
tz=datetime.timezone.utc,
).isoformat()
else:
data['thumbnail'] = data['thumbnails'][-1]['url'] data['thumbnail'] = data['thumbnails'][-1]['url']
safe_del(data, '_type', '_version', 'thumbnails') safe_del(data, '_type', '_version', 'thumbnails')
yield data output.append(data)
return output
@dataclass(frozen=True)
class YoutubeFavoritesScraper(Scraper):
    """Scraper for the favorites playlist, stored as ``youtube_favorites``.

    Delegates the actual fetching to the module-level ``scrape`` function
    with ``watch_history=False``.
    """

    dataset_name: str = 'youtube_favorites'
    deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
    # Class-level (not a dataclass field): no columns are excluded when
    # deduplicating.
    deduplicate_ignore_columns: ClassVar[list[str]] = []

    def scrape(self) -> list[dict]:
        """Yield every entry of the favorites playlist.

        Because of ``yield from`` this method is a generator: nothing is
        fetched until the result is iterated.
        """
        # Inside a method body the bare name ``scrape`` resolves to the
        # module-level function, not to this method.
        favorites = scrape(watch_history=False)
        yield from favorites
@dataclass(frozen=True)
class YoutubeWatchHistoryScraper(Scraper):
    """Scraper for the watch history, stored as ``youtube_watch_history``.

    Delegates the actual fetching to the module-level ``scrape`` function
    with ``watch_history=True``.
    """

    dataset_name: str = 'youtube_watch_history'
    deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
    # Class-level (not a dataclass field): no columns are excluded when
    # deduplicating.
    deduplicate_ignore_columns: ClassVar[list[str]] = []

    def scrape(self) -> list[dict]:
        """Yield every entry of the watch history.

        Because of ``yield from`` this method is a generator: nothing is
        fetched until the result is iterated.
        """
        # Inside a method body the bare name ``scrape`` resolves to the
        # module-level function, not to this method.
        history = scrape(watch_history=True)
        yield from history

View File

@ -91,6 +91,19 @@ def available_scraper_names() -> list[str]:
return [scraper_cls.__name__ for scraper_cls in available_scrapers()] return [scraper_cls.__name__ for scraper_cls in available_scrapers()]
def get_cookiejar(use_cookiejar: bool):
    """Load the Firefox cookie jar, falling back to an explicit profile.

    Args:
        use_cookiejar: When false, no browser cookies are read at all.

    Returns:
        The first cookie jar that holds more than 10 cookies: the default
        Firefox lookup is tried first, then the hard-coded profile database.
        An empty list when cookies are disabled or neither source yields
        more than 10 cookies.
    """
    if use_cookiejar:
        # NOTE(review): this message is emitted before the jar is actually
        # loaded; kept as-is so existing log output does not change.
        logger.warning('Got cookiejar from firefox')
        cookiejar = browsercookie.firefox()
        if len(cookiejar) > 10:
            return cookiejar
        # Fallback: read the specific profile database. The result must be
        # assigned; previously it was discarded, so the check below re-tested
        # the first (too small) jar and the fallback could never succeed.
        cookiejar = browsercookie.firefox(
            ['/home/jmaa/.cachy/mbui5xg7.default-release/cookies.sqlite'],
        )
        if len(cookiejar) > 10:
            return cookiejar
    logger.warning('No cookiejar is used')
    return []
def main( def main(
scraper_filter: frozenset[str], scraper_filter: frozenset[str],
*, *,
@ -98,12 +111,8 @@ def main(
ignore_cache: bool, ignore_cache: bool,
notification_types: frozenset[notification.NotificationType], notification_types: frozenset[notification.NotificationType],
) -> None: ) -> None:
if use_cookiejar: cookiejar = get_cookiejar(use_cookiejar)
cookiejar = browsercookie.firefox() logger.warning('Cookiejar has %s cookies', len(cookiejar))
logger.info('Got cookiejar from firefox: %s cookies', len(cookiejar))
else:
cookiejar = []
logger.warning('No cookiejar is used')
if len(notification_types) == 0: if len(notification_types) == 0:
logger.info('No notifications enabled: Notifications will not be sent!') logger.info('No notifications enabled: Notifications will not be sent!')

View File

@ -20,7 +20,6 @@ def safe_del(d: dict, *keys: str):
del d[key] del d[key]
def equals_without_fields( def equals_without_fields(
a: Mapping[str, Any], a: Mapping[str, Any],
b: Mapping[str, Any], b: Mapping[str, Any],