1
0

feat: Add support for downloading YouTube watch history with timestamps

This commit is contained in:
Jon Michael Aanes (aider) 2025-03-15 22:26:19 +01:00
parent 9058279b4e
commit 552b2ea365

View File

@@ -2,6 +2,7 @@ import csv
import json import json
import logging import logging
import subprocess import subprocess
import datetime
from dataclasses import dataclass from dataclasses import dataclass
from personal_data.data import DeduplicateMode, Scraper from personal_data.data import DeduplicateMode, Scraper
@@ -17,25 +18,44 @@ class YoutubeFavoritesScraper(Scraper):
dataset_name: str = 'youtube_favorites' dataset_name: str = 'youtube_favorites'
deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
deduplicate_ignore_columns = [] deduplicate_ignore_columns = []
watch_history: bool = False
def scrape(self) -> list[dict]: def scrape(self) -> list[dict]:
"""Use yt-dlp to fetch the list of favorited videos. This is a placeholder for invoking yt-dlp and parsing its output.""" """Use yt-dlp to fetch the list of favorited videos. This is a placeholder for invoking yt-dlp and parsing its output."""
result = subprocess.run( if self.watch_history:
[ url = 'https://www.youtube.com/feed/history'
ytdlp_args = [
'yt-dlp',
'--dump-json',
url,
]
else:
url = f'https://www.youtube.com/playlist?list={PLAYLIST_ID}'
ytdlp_args = [
'yt-dlp', 'yt-dlp',
'--flat-playlist', '--flat-playlist',
'--dump-json', '--dump-json',
f'https://www.youtube.com/playlist?list={PLAYLIST_ID}', url,
], ]
result = subprocess.run(
ytdlp_args,
capture_output=True, capture_output=True,
text=True, text=True,
) )
if result.returncode != 0: if result.returncode != 0:
raise RuntimeError(f'Non-zero returncode in command: {result.returncode}\n\n{result.stderr}') raise RuntimeError(f'Non-zero returncode in command: {result.returncode}\n\n{result.stderr}')
output = []
for line in result.stdout.splitlines(): for line in result.stdout.splitlines():
data = json.loads(line) data = json.loads(line)
data['thumbnail'] = data['thumbnails'][-1]['url'] if self.watch_history:
if 'thumbnails' in data and data['thumbnails']:
data['thumbnail'] = data['thumbnails'][-1]['url']
if 'timestamp' in data:
data['watch_datetime'] = datetime.datetime.fromtimestamp(int(data['timestamp'])).isoformat()
else:
data['thumbnail'] = data['thumbnails'][-1]['url']
safe_del(data, '_type', '_version', 'thumbnails') safe_del(data, '_type', '_version', 'thumbnails')
yield data output.append(data)
return output