1
0

Compare commits

..

No commits in common. "e787779b5873ec2b0ae62c53045a7946716b8218" and "9058279b4e2d8398a150624f25a212375fff0315" have entirely different histories.

6 changed files with 35 additions and 102 deletions

View File

@ -261,22 +261,17 @@ PATH_WATCHED = Path('output/show_episodes_watched.csv')
PATH_PLAYED = Path('output/games_played.csv')
PATH_WORKOUT = Path('/home/jmaa/Notes/workout.csv')
PATH_STEP_COUNTS = Path(
'/home/jmaa/Notes/Rawbackupdata/Steps/exportStepCount_2025-03-15_22-58-20',
'/home/jmaa/personal-archive/misc-data/step_counts_2023-07-26_to_2024-09-21.csv',
)
PATH_STEPMANIA = Path('output/stepmania.csv')
IMPORTERS = [
{'path': PATH_WORKOUT, 'standard_variant': True, 'import_rows': import_workout_csv},
{'path': PATH_WORKOUT, 'import_rows': import_workout_csv},
{'path': PATH_STEP_COUNTS, 'import_rows': import_step_counts_csv},
{
'path': PATH_STEPMANIA,
'standard_variant': True,
'import_rows': import_stepmania_steps_csv,
},
{'path': PATH_STEPMANIA, 'import_rows': import_stepmania_steps_csv},
{
'path': PATH_PLAYED,
'standard_variant': True,
'import_rows': lambda vault, rows: import_activity_sample_csv(
vault,
rows,
@ -286,7 +281,6 @@ IMPORTERS = [
},
{
'path': PATH_WATCHED,
'standard_variant': True,
'import_rows': lambda vault, rows: import_activity_sample_csv(
vault,
rows,
@ -307,9 +301,7 @@ def import_data(obsidian_path: Path, dry_run=True):
import_def['path'],
)
continue
rows = load_csv_file(
import_def['path'], sniff=not import_def.get('standard_variant'),
)
rows = load_csv_file(import_def['path'])
logger.info('Loaded CSV with %d lines', len(rows))
num_files_updated = import_def['import_rows'](vault, rows)
logger.info('Updated %d files', num_files_updated)

View File

@ -2,7 +2,6 @@ import csv
import dataclasses
import datetime
import decimal
import logging
import typing
import urllib.parse
from collections.abc import Callable
@ -12,8 +11,6 @@ from typing import Any
from frozendict import frozendict
logger = logging.getLogger(__name__)
CSV_DIALECT = 'one_true_dialect'
csv.register_dialect(CSV_DIALECT, lineterminator='\n', skipinitialspace=True)
@ -89,12 +86,10 @@ def load_csv_file(csv_file: Path, sniff=False) -> list[frozendict[str, typing.An
dicts: list[frozendict] = []
with open(csv_file) as csvfile:
if sniff:
logger.warning('Sniffing CSV variant: %s', csv_file)
dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=',;')
dialect = csv.Sniffer().sniff(csvfile.read(1024))
csvfile.seek(0)
else:
dialect = CSV_DIALECT
logger.warning('Loading CSV file: %s', csv_file)
reader = csv.DictReader(csvfile, dialect=dialect)
for row in reader:
for k in list(row.keys()):

View File

@ -4,9 +4,9 @@ from collections.abc import Iterator, Mapping
from typing import Any
from personal_data.data import DeduplicateMode, Scraper
from ..util import safe_del
from .. import secrets
from ..util import safe_del
logger = logging.getLogger(__name__)

View File

@ -1,87 +1,41 @@
import datetime
import csv
import json
import logging
import subprocess
from dataclasses import dataclass
from typing import ClassVar
from personal_data.data import DeduplicateMode, Scraper
from ..util import safe_del
logger = logging.getLogger(__name__)
PLAYLIST_ID = 'PLAfDVJvDKCvOMvfoTL7eW8GkWNJwd90eV'
def scrape(watch_history: bool) -> list[dict[str, str]]:
    """Fetch YouTube video metadata as dicts by shelling out to yt-dlp.

    When *watch_history* is true, dump the logged-in watch-history feed
    (authenticated via Firefox cookies); otherwise flat-list the
    favorites playlist identified by PLAYLIST_ID.  One dict is produced
    per JSON line emitted by yt-dlp.

    Raises RuntimeError when yt-dlp exits non-zero.
    """
    if watch_history:
        ytdlp_args = [
            'yt-dlp',
            'https://www.youtube.com/feed/history',
            '--dump-json',
            # Watch history requires a logged-in session.
            '--cookies-from-browser',
            'firefox:/home/jmaa/.cachy/mbui5xg7.default-release',
        ]
    else:
        ytdlp_args = [
            'yt-dlp',
            '--flat-playlist',
            '--dump-json',
            f'https://www.youtube.com/playlist?list={PLAYLIST_ID}',
        ]

    proc = subprocess.run(ytdlp_args, capture_output=True, text=True)
    if proc.returncode != 0:
        raise RuntimeError(
            f'Non-zero returncode in command: {proc.returncode}\n\n{proc.stderr}',
        )

    videos: list[dict[str, str]] = []
    for json_line in proc.stdout.splitlines():
        entry = json.loads(json_line)
        if watch_history:
            # History entries may lack thumbnails/timestamps, so guard both.
            if entry.get('thumbnails'):
                entry['thumbnail'] = entry['thumbnails'][-1]['url']
            if entry.get('timestamp'):
                entry['watch_datetime'] = datetime.datetime.fromtimestamp(
                    int(entry['timestamp']),
                    tz=datetime.timezone.utc,
                ).isoformat()
        else:
            entry['thumbnail'] = entry['thumbnails'][-1]['url']
        # Strip yt-dlp bookkeeping fields before handing the row onward.
        safe_del(entry, '_type', '_version', 'thumbnails')
        videos.append(entry)
    return videos
@dataclass(frozen=True)
class YoutubeFavoritesScraper(Scraper):
    """Scrape the favorites playlist by delegating to the module-level scrape()."""

    dataset_name: str = 'youtube_favorites'
    deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
    # ClassVar so this shared config list is not treated as a dataclass field.
    deduplicate_ignore_columns: ClassVar[list[str]] = []

    def scrape(self) -> list[dict]:
        # NOTE(review): annotated list[dict] but implemented as a generator,
        # matching the sibling scraper class — callers appear to iterate it.
        yield from scrape(watch_history=False)
@dataclass(frozen=True)
class YoutubeWatchHistoryScraper(Scraper):
    """Scrape the logged-in watch history by delegating to the module-level scrape()."""

    dataset_name: str = 'youtube_watch_history'
    deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
    # ClassVar so this shared config list is not treated as a dataclass field.
    deduplicate_ignore_columns: ClassVar[list[str]] = []

    def scrape(self) -> list[dict]:
        # Removed dead trailing loop that referenced an undefined `result`
        # (leftover from the pre-refactor inline yt-dlp invocation).
        yield from scrape(watch_history=True)

View File

@ -91,19 +91,6 @@ def available_scraper_names() -> list[str]:
return [scraper_cls.__name__ for scraper_cls in available_scrapers()]
def get_cookiejar(use_cookiejar: bool):
if use_cookiejar:
logger.warning('Got cookiejar from firefox')
cookiejar = browsercookie.firefox()
if len(cookiejar) > 10:
return cookiejar
browsercookie.firefox(['/home/jmaa/.cachy/mbui5xg7.default-release/cookies.sqlite'])
if len(cookiejar) > 10:
return cookiejar
logger.warning('No cookiejar is used')
return []
def main(
scraper_filter: frozenset[str],
*,
@ -111,8 +98,12 @@ def main(
ignore_cache: bool,
notification_types: frozenset[notification.NotificationType],
) -> None:
cookiejar = get_cookiejar(use_cookiejar)
logger.warning('Cookiejar has %s cookies', len(cookiejar))
if use_cookiejar:
cookiejar = browsercookie.firefox()
logger.info('Got cookiejar from firefox: %s cookies', len(cookiejar))
else:
cookiejar = []
logger.warning('No cookiejar is used')
if len(notification_types) == 0:
logger.info('No notifications enabled: Notifications will not be sent!')

View File

@ -20,6 +20,7 @@ def safe_del(d: dict, *keys: str):
del d[key]
def equals_without_fields(
a: Mapping[str, Any],
b: Mapping[str, Any],