feat: Add support for downloading YouTube watch history with timestamps
This commit is contained in:
parent
9058279b4e
commit
552b2ea365
|
@ -2,6 +2,7 @@ import csv
|
|||
import json
|
||||
import logging
|
||||
import subprocess
|
||||
import datetime
|
||||
from dataclasses import dataclass
|
||||
|
||||
from personal_data.data import DeduplicateMode, Scraper
|
||||
|
@ -17,16 +18,27 @@ class YoutubeFavoritesScraper(Scraper):
|
|||
dataset_name: str = 'youtube_favorites'
|
||||
deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
|
||||
deduplicate_ignore_columns = []
|
||||
watch_history: bool = False
|
||||
|
||||
def scrape(self) -> list[dict]:
|
||||
"""Use yt-dlp to fetch the watch history (when `watch_history` is set) or the favorites playlist, and parse its JSON-lines output into a list of dicts. Raises RuntimeError if yt-dlp exits with a non-zero return code."""
|
||||
result = subprocess.run(
|
||||
[
|
||||
if self.watch_history:
|
||||
url = 'https://www.youtube.com/feed/history'
|
||||
ytdlp_args = [
|
||||
'yt-dlp',
|
||||
'--dump-json',
|
||||
url,
|
||||
]
|
||||
else:
|
||||
url = f'https://www.youtube.com/playlist?list={PLAYLIST_ID}'
|
||||
ytdlp_args = [
|
||||
'yt-dlp',
|
||||
'--flat-playlist',
|
||||
'--dump-json',
|
||||
f'https://www.youtube.com/playlist?list={PLAYLIST_ID}',
|
||||
],
|
||||
url,
|
||||
]
|
||||
result = subprocess.run(
|
||||
ytdlp_args,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
@ -34,8 +46,16 @@ class YoutubeFavoritesScraper(Scraper):
|
|||
if result.returncode != 0:
|
||||
raise RuntimeError(f'Non-zero returncode in command: {result.returncode}\n\n{result.stderr}')
|
||||
|
||||
output = []
|
||||
for line in result.stdout.splitlines():
|
||||
data = json.loads(line)
|
||||
if self.watch_history:
|
||||
if 'thumbnails' in data and data['thumbnails']:
|
||||
data['thumbnail'] = data['thumbnails'][-1]['url']
|
||||
if 'timestamp' in data:
|
||||
data['watch_datetime'] = datetime.datetime.fromtimestamp(int(data['timestamp'])).isoformat()
|
||||
else:
|
||||
data['thumbnail'] = data['thumbnails'][-1]['url']
|
||||
safe_del(data, '_type', '_version', 'thumbnails')
|
||||
yield data
|
||||
output.append(data)
|
||||
return output
|
||||
|
|
Loading…
Reference in New Issue
Block a user