import csv
import json
import logging
import subprocess
from collections.abc import Iterator
from dataclasses import dataclass

from personal_data.data import DeduplicateMode, Scraper

from ..util import safe_del

logger = logging.getLogger(__name__)

# Playlist whose entries are scraped by YoutubeFavoritesScraper.
PLAYLIST_ID = 'PLAfDVJvDKCvOMvfoTL7eW8GkWNJwd90eV'
# Alternative kept from the original author (presumably the account's
# "liked videos" playlist -- TODO confirm before enabling):
# PLAYLIST_ID = 'LL'


@dataclass(frozen=True)
class YoutubeFavoritesScraper(Scraper):
    """Scraper that lists the videos of the playlist ``PLAYLIST_ID`` via yt-dlp."""

    dataset_name: str = 'youtube_favorites'
    deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
    # Unannotated, so this is a plain class attribute rather than a dataclass
    # field; no columns are excluded from deduplication.
    deduplicate_ignore_columns = []

    def scrape(self) -> Iterator[dict]:
        """Yield one dict per playlist entry reported by yt-dlp.

        Runs ``yt-dlp --flat-playlist --dump-json`` on the playlist URL and
        parses every non-empty line of its standard output as a JSON object.

        Each yielded dict is the parsed object with a ``thumbnail`` key added
        (the ``url`` of the last element of ``thumbnails``, or ``None`` when
        the entry carries no thumbnails) and with ``_type``, ``_version`` and
        ``thumbnails`` passed to ``safe_del`` for removal.

        This is a generator: the yt-dlp process is only started once
        iteration begins.

        Raises:
            RuntimeError: If yt-dlp exits with a non-zero return code; the
                message includes the return code and captured stderr.
            json.JSONDecodeError: If a non-empty output line is not valid JSON.
        """
        result = subprocess.run(
            [
                'yt-dlp',
                '--flat-playlist',
                '--dump-json',
                f'https://www.youtube.com/playlist?list={PLAYLIST_ID}',
            ],
            capture_output=True,
            text=True,
            # The return code is inspected below so the error can carry stderr.
            check=False,
        )
        if result.returncode != 0:
            raise RuntimeError(
                f'Non-zero returncode in command: {result.returncode}\n\n{result.stderr}',
            )

        for line in result.stdout.splitlines():
            # Tolerate blank lines instead of failing in json.loads.
            if not line.strip():
                continue
            data = json.loads(line)
            # An entry without thumbnails must not abort the whole scrape.
            thumbnails = data.get('thumbnails')
            data['thumbnail'] = thumbnails[-1]['url'] if thumbnails else None
            safe_del(data, '_type', '_version', 'thumbnails')
            yield data