diff --git a/personal_data/fetchers/gitea.py b/personal_data/fetchers/gitea.py
index 037993d..d7fd76a 100644
--- a/personal_data/fetchers/gitea.py
+++ b/personal_data/fetchers/gitea.py
@@ -4,9 +4,9 @@ from collections.abc import Iterator, Mapping
 from typing import Any
 
 from personal_data.data import DeduplicateMode, Scraper
-from ..util import safe_del
 
 from .. import secrets
+from ..util import safe_del
 
 logger = logging.getLogger(__name__)
 
diff --git a/personal_data/fetchers/youtube.py b/personal_data/fetchers/youtube.py
index fe12fb7..2c0dc50 100644
--- a/personal_data/fetchers/youtube.py
+++ b/personal_data/fetchers/youtube.py
@@ -11,7 +11,67 @@ from ..util import safe_del
 logger = logging.getLogger(__name__)
 
 PLAYLIST_ID = 'PLAfDVJvDKCvOMvfoTL7eW8GkWNJwd90eV'
-# PLAYLIST_ID='LL'
+
+
+def scrape(watch_history: bool) -> list[dict]:
+    """Fetch video metadata from YouTube by invoking yt-dlp.
+
+    When ``watch_history`` is true the watch history feed is dumped;
+    otherwise the flat listing of the ``PLAYLIST_ID`` playlist is dumped.
+
+    Returns one dict per JSON line printed by yt-dlp. A ``thumbnail`` URL is
+    added when thumbnails are present, history entries with a ``timestamp``
+    get a ``watch_datetime``, and the ``_type``, ``_version`` and
+    ``thumbnails`` keys are stripped with ``safe_del``.
+
+    Raises RuntimeError if yt-dlp exits with a non-zero return code.
+    """
+    if watch_history:
+        url = 'https://www.youtube.com/feed/history'
+        ytdlp_args = [
+            'yt-dlp',
+            '--dump-json',
+            url,
+        ]
+    else:
+        url = f'https://www.youtube.com/playlist?list={PLAYLIST_ID}'
+        ytdlp_args = [
+            'yt-dlp',
+            '--flat-playlist',
+            '--dump-json',
+            url,
+        ]
+
+    logger.debug('Running command: %s', ytdlp_args)
+    result = subprocess.run(
+        ytdlp_args,
+        capture_output=True,
+        text=True,
+    )
+
+    if result.returncode != 0:
+        raise RuntimeError(
+            f'Non-zero returncode in command: {result.returncode}\n\n{result.stderr}',
+        )
+
+    if result.stderr:
+        logger.debug('yt-dlp stderr:\n%s', result.stderr)
+
+    output = []
+    for line in result.stdout.splitlines():
+        data = json.loads(line)
+        # Entries without thumbnails are kept, just without a thumbnail URL.
+        if data.get('thumbnails'):
+            data['thumbnail'] = data['thumbnails'][-1]['url']
+        if watch_history and 'timestamp' in data:
+            # NOTE(review): naive local time, so the stored value depends on
+            # the host timezone.
+            data['watch_datetime'] = datetime.datetime.fromtimestamp(
+                int(data['timestamp']),
+            ).isoformat()
+        safe_del(data, '_type', '_version', 'thumbnails')
+        output.append(data)
+    return output
 
 
 @dataclass(frozen=True)
@@ -19,48 +79,18 @@ class YoutubeFavoritesScraper(Scraper):
     dataset_name: str = 'youtube_favorites'
     deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
     deduplicate_ignore_columns = []
-    watch_history: bool = False
 
     def scrape(self) -> list[dict]:
-        """Use yt-dlp to fetch the list of favorited videos. This is a placeholder for invoking yt-dlp and parsing its output."""
-        if self.watch_history:
-            url = 'https://www.youtube.com/feed/history'
-            ytdlp_args = [
-                'yt-dlp',
-                '--dump-json',
-                url,
-            ]
-        else:
-            url = f'https://www.youtube.com/playlist?list={PLAYLIST_ID}'
-            ytdlp_args = [
-                'yt-dlp',
-                '--flat-playlist',
-                '--dump-json',
-                url,
-            ]
-        result = subprocess.run(
-            ytdlp_args,
-            capture_output=True,
-            text=True,
-        )
+        """Return the videos listed in the ``PLAYLIST_ID`` playlist."""
+        return scrape(watch_history=False)
 
-        if result.returncode != 0:
-            raise RuntimeError(
-                f'Non-zero returncode in command: {result.returncode}\n\n{result.stderr}',
-            )
 
-        output = []
-        for line in result.stdout.splitlines():
-            data = json.loads(line)
-            if self.watch_history:
-                if 'thumbnails' in data and data['thumbnails']:
-                    data['thumbnail'] = data['thumbnails'][-1]['url']
-                if 'timestamp' in data:
-                    data['watch_datetime'] = datetime.datetime.fromtimestamp(
-                        int(data['timestamp']),
-                    ).isoformat()
-            else:
-                data['thumbnail'] = data['thumbnails'][-1]['url']
-            safe_del(data, '_type', '_version', 'thumbnails')
-            output.append(data)
-        return output
+@dataclass(frozen=True)
+class YoutubeWatchHistoryScraper(Scraper):
+    dataset_name: str = 'youtube_watch_history'
+    deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
+    deduplicate_ignore_columns = []
+
+    def scrape(self) -> list[dict]:
+        """Return the entries of the YouTube watch history feed."""
+        return scrape(watch_history=True)
diff --git a/personal_data/util.py b/personal_data/util.py
index 1267557..9a2f9d8 100644
--- a/personal_data/util.py
+++ b/personal_data/util.py
@@ -20,7 +20,6 @@ def safe_del(d: dict, *keys: str):
             del d[key]
 
 
-
 def equals_without_fields(
     a: Mapping[str, Any],
     b: Mapping[str, Any],