diff --git a/personal_data/fetchers/youtube.py b/personal_data/fetchers/youtube.py
index a3b7c7a..41bcd2b 100644
--- a/personal_data/fetchers/youtube.py
+++ b/personal_data/fetchers/youtube.py
@@ -2,6 +2,7 @@ import csv
 import json
 import logging
 import subprocess
+import datetime
 from dataclasses import dataclass
 
 from personal_data.data import DeduplicateMode, Scraper
@@ -17,25 +18,60 @@ class YoutubeFavoritesScraper(Scraper):
     dataset_name: str = 'youtube_favorites'
     deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
     deduplicate_ignore_columns = []
+    watch_history: bool = False
 
     def scrape(self) -> list[dict]:
-        """Use yt-dlp to fetch the list of favorited videos. This is a placeholder for invoking yt-dlp and parsing its output."""
-        result = subprocess.run(
-            [
+        """Fetch video metadata from YouTube by running yt-dlp.
+
+        Dumps the watch-history feed when ``watch_history`` is set, otherwise
+        the flat listing of the ``PLAYLIST_ID`` playlist, and parses one JSON
+        object per line of yt-dlp output.
+
+        Returns:
+            One dict per video. ``thumbnail`` is the URL of the last entry of
+            ``thumbnails`` when any exist; in watch-history mode
+            ``watch_datetime`` is ``timestamp`` as an ISO 8601 UTC string.
+
+        Raises:
+            RuntimeError: If yt-dlp exits with a non-zero return code.
+        """
+        if self.watch_history:
+            ytdlp_args = [
+                'yt-dlp',
+                '--dump-json',
+                'https://www.youtube.com/feed/history',
+            ]
+        else:
+            ytdlp_args = [
                 'yt-dlp',
                 '--flat-playlist',
                 '--dump-json',
                 f'https://www.youtube.com/playlist?list={PLAYLIST_ID}',
-            ],
+            ]
+        result = subprocess.run(
+            ytdlp_args,
             capture_output=True,
             text=True,
         )

         if result.returncode != 0:
             raise RuntimeError(f'Non-zero returncode in command: {result.returncode}\n\n{result.stderr}')

+        output = []
         for line in result.stdout.splitlines():
             data = json.loads(line)
-            data['thumbnail'] = data['thumbnails'][-1]['url']
+            # Entries may lack thumbnails; leave the field unset instead of raising.
+            if data.get('thumbnails'):
+                data['thumbnail'] = data['thumbnails'][-1]['url']
+            # Skip a null timestamp as well as a missing one; int(None) would raise.
+            if self.watch_history and data.get('timestamp') is not None:
+                # NOTE(review): yt-dlp documents `timestamp` as when the video
+                # became available, not when it was watched -- confirm.
+                # Convert in UTC so the value does not depend on the local zone.
+                data['watch_datetime'] = datetime.datetime.fromtimestamp(
+                    int(data['timestamp']),
+                    tz=datetime.timezone.utc,
+                ).isoformat()
             safe_del(data, '_type', '_version', 'thumbnails')
-            yield data
+            output.append(data)
+        return output