diff --git a/personal_data/fetchers/gitea.py b/personal_data/fetchers/gitea.py index 760e301..037993d 100644 --- a/personal_data/fetchers/gitea.py +++ b/personal_data/fetchers/gitea.py @@ -4,18 +4,13 @@ from collections.abc import Iterator, Mapping from typing import Any from personal_data.data import DeduplicateMode, Scraper +from ..util import safe_del from .. import secrets logger = logging.getLogger(__name__) -def safe_del(d: dict, *keys: str): - for key in keys: - if key in d: - del d[key] - - def to_data_point(p: dict[str, Any]) -> Mapping[str, Any]: p['owner'] = p['owner']['login'] safe_del(p, 'permissions', 'internal_tracker') diff --git a/personal_data/fetchers/youtube.py b/personal_data/fetchers/youtube.py index a193bf2..a3b7c7a 100644 --- a/personal_data/fetchers/youtube.py +++ b/personal_data/fetchers/youtube.py @@ -5,57 +5,38 @@ import subprocess from dataclasses import dataclass from personal_data.data import DeduplicateMode, Scraper +from ..util import safe_del +from collections.abc import Iterator logger = logging.getLogger(__name__) +PLAYLIST_ID = 'PLAfDVJvDKCvOMvfoTL7eW8GkWNJwd90eV' +# PLAYLIST_ID = 'LL' @dataclass(frozen=True) class YoutubeFavoritesScraper(Scraper): dataset_name: str = 'youtube_favorites' deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS + deduplicate_ignore_columns = [] - def fetch_data(self) -> list[dict]: + def scrape(self) -> Iterator[dict]: """Use yt-dlp to fetch the list of favorited videos. This is a placeholder for invoking yt-dlp and parsing its output.""" - try: - # Replace 'YOUR_FAVORITES_ID' with your actual favorites playlist ID. 
- result = subprocess.run( - [ - 'yt-dlp', - '--flat-playlist', - '--dump-json', - 'https://www.youtube.com/playlist?list=YOUR_FAVORITES_ID', - ], - capture_output=True, - check=True, - text=True, - ) - return [json.loads(line) for line in result.stdout.splitlines()] - except Exception: - logger.exception('Failed to fetch YouTube favorites') - raise - - def to_csv(self, videos: list[dict]) -> str: - """Convert the list of videos to CSV format.""" - headers = ['id', 'title', 'url', 'upload_date'] - rows = [headers] + [ + result = subprocess.run( [ - video.get('id'), - video.get('title'), - video.get('url'), - video.get('upload_date'), - ] - for video in videos - ] - from io import StringIO + 'yt-dlp', + '--flat-playlist', + '--dump-json', + f'https://www.youtube.com/playlist?list={PLAYLIST_ID}', + ], + capture_output=True, + text=True, + ) - sio = StringIO() - csv.writer(sio).writerows(rows) - return sio.getvalue() + if result.returncode != 0: + raise RuntimeError(f'Non-zero returncode in command: {result.returncode}\n\n{result.stderr}') - def run(self) -> None: - videos = self.fetch_data() - csv_data = self.to_csv(videos) - logger.info('Fetched and converted %d videos to CSV', len(videos)) - with open('youtube_favorites.csv', 'w', encoding='utf-8') as f: - f.write(csv_data) - logger.info('CSV file written to youtube_favorites.csv') + for line in result.stdout.splitlines(): + data = json.loads(line) + data['thumbnail'] = data['thumbnails'][-1]['url'] + safe_del(data, '_type', '_version', 'thumbnails') + yield data diff --git a/personal_data/util.py b/personal_data/util.py index f013280..1267557 100644 --- a/personal_data/util.py +++ b/personal_data/util.py @@ -14,6 +14,13 @@ from . import csv_import, data logger = logging.getLogger(__name__) +def safe_del(d: dict, *keys: str) -> None: + for key in keys: + if key in d: + del d[key] + + + def equals_without_fields( a: Mapping[str, Any], b: Mapping[str, Any],