1
0

Restructure youtube

This commit is contained in:
Jon Michael Aanes 2025-03-15 22:33:10 +01:00
parent 103235759c
commit f7894f9d05
3 changed files with 60 additions and 44 deletions

View File

@ -4,9 +4,9 @@ from collections.abc import Iterator, Mapping
from typing import Any
from personal_data.data import DeduplicateMode, Scraper
from ..util import safe_del
from .. import secrets
from ..util import safe_del
logger = logging.getLogger(__name__)

View File

@ -11,19 +11,11 @@ from ..util import safe_del
logger = logging.getLogger(__name__)
PLAYLIST_ID = 'PLAfDVJvDKCvOMvfoTL7eW8GkWNJwd90eV'
# PLAYLIST_ID='LL'
@dataclass(frozen=True)
class YoutubeFavoritesScraper(Scraper):
dataset_name: str = 'youtube_favorites'
deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
deduplicate_ignore_columns = []
watch_history: bool = False
def scrape(self) -> list[dict]:
def scrape(watch_history: bool) -> list[dict[str, str]]:
"""Use yt-dlp to fetch the list of favorited videos. This is a placeholder for invoking yt-dlp and parsing its output."""
if self.watch_history:
if watch_history:
url = 'https://www.youtube.com/feed/history'
ytdlp_args = [
'yt-dlp',
@ -38,6 +30,8 @@ class YoutubeFavoritesScraper(Scraper):
'--dump-json',
url,
]
print(ytdlp_args)
result = subprocess.run(
ytdlp_args,
capture_output=True,
@ -49,10 +43,13 @@ class YoutubeFavoritesScraper(Scraper):
f'Non-zero returncode in command: {result.returncode}\n\n{result.stderr}',
)
print(result.stderr)
print(result.stdout)
output = []
for line in result.stdout.splitlines():
data = json.loads(line)
if self.watch_history:
if watch_history:
if 'thumbnails' in data and data['thumbnails']:
data['thumbnail'] = data['thumbnails'][-1]['url']
if 'timestamp' in data:
@ -64,3 +61,23 @@ class YoutubeFavoritesScraper(Scraper):
safe_del(data, '_type', '_version', 'thumbnails')
output.append(data)
return output
@dataclass(frozen=True)
class YoutubeFavoritesScraper(Scraper):
    """Scraper whose dataset is named ``youtube_favorites``.

    All work is delegated to the module-level ``scrape`` function, called
    with ``watch_history=False``.
    """

    dataset_name: str = 'youtube_favorites'
    deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
    # Unannotated, so this is a plain class attribute, not a dataclass field.
    deduplicate_ignore_columns = []

    def scrape(self) -> list[dict]:
        """Yield each record produced by ``scrape(watch_history=False)``.

        NOTE(review): the body yields, so calling this method returns a
        generator rather than the ``list[dict]`` named in the annotation.
        """
        # Inside a method body the bare name ``scrape`` resolves to the
        # module-level function, not to this method.
        for record in scrape(watch_history=False):
            yield record
@dataclass(frozen=True)
class YoutubeWatchHistoryScraper(Scraper):
    """Scraper whose dataset is named ``youtube_watch_history``.

    All work is delegated to the module-level ``scrape`` function, called
    with ``watch_history=True``.
    """

    dataset_name: str = 'youtube_watch_history'
    deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
    # Unannotated, so this is a plain class attribute, not a dataclass field.
    deduplicate_ignore_columns = []

    def scrape(self) -> list[dict]:
        """Yield each record produced by ``scrape(watch_history=True)``.

        NOTE(review): the body yields, so calling this method returns a
        generator rather than the ``list[dict]`` named in the annotation.
        """
        # Inside a method body the bare name ``scrape`` resolves to the
        # module-level function, not to this method.
        for record in scrape(watch_history=True):
            yield record

View File

@ -20,7 +20,6 @@ def safe_del(d: dict, *keys: str):
del d[key]
def equals_without_fields(
a: Mapping[str, Any],
b: Mapping[str, Any],