YouTube fixed
This commit is contained in:
parent
3d9c694fe8
commit
9058279b4e
|
@ -4,18 +4,13 @@ from collections.abc import Iterator, Mapping
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from personal_data.data import DeduplicateMode, Scraper
|
from personal_data.data import DeduplicateMode, Scraper
|
||||||
|
from ..util import safe_del
|
||||||
|
|
||||||
from .. import secrets
|
from .. import secrets
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def safe_del(d: dict, *keys: str) -> None:
    """Remove each of ``keys`` from ``d`` in place, ignoring absent keys.

    Args:
        d: Dictionary to mutate.
        *keys: Keys to remove; a key that is not present (or is listed
            more than once) is silently skipped.
    """
    for key in keys:
        # pop() with a default never raises for a missing key.
        d.pop(key, None)
|
|
||||||
|
|
||||||
|
|
||||||
def to_data_point(p: dict[str, Any]) -> Mapping[str, Any]:
|
def to_data_point(p: dict[str, Any]) -> Mapping[str, Any]:
|
||||||
p['owner'] = p['owner']['login']
|
p['owner'] = p['owner']['login']
|
||||||
safe_del(p, 'permissions', 'internal_tracker')
|
safe_del(p, 'permissions', 'internal_tracker')
|
||||||
|
|
|
@ -5,57 +5,37 @@ import subprocess
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
from personal_data.data import DeduplicateMode, Scraper
|
from personal_data.data import DeduplicateMode, Scraper
|
||||||
|
from ..util import safe_del
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ID of the YouTube playlist to list. The previously used alternative value
# was 'LL'.
PLAYLIST_ID = 'PLAfDVJvDKCvOMvfoTL7eW8GkWNJwd90eV'
|
||||||
|
|
||||||
@dataclass(frozen=True)
class YoutubeFavoritesScraper(Scraper):
    """Scraper that yields one record per video of a YouTube playlist.

    The playlist is selected by the module-level ``PLAYLIST_ID`` and is
    listed by running the external ``yt-dlp`` executable.
    """

    dataset_name: str = 'youtube_favorites'
    deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
    # NOTE(review): unannotated, so this is a plain class attribute rather
    # than a dataclass field; the same list object is shared by all instances.
    deduplicate_ignore_columns = []

    # NOTE(review): this method is a generator, so the accurate return
    # annotation is Iterator[dict]; left unchanged because Iterator is not
    # visibly imported in this module.
    def scrape(self) -> list[dict]:
        """Yield the playlist's videos as dicts parsed from yt-dlp output.

        Runs ``yt-dlp --flat-playlist --dump-json`` on the playlist URL and
        parses every non-blank stdout line as one JSON object. In each
        object the ``thumbnails`` list is collapsed into a single
        ``thumbnail`` value (the ``url`` of the last list entry, or ``None``
        when the entry has no thumbnails), and the ``_type``, ``_version``
        and ``thumbnails`` keys are removed.

        Raises:
            RuntimeError: If yt-dlp exits with a non-zero return code; the
                message includes the captured stderr.
        """
        result = subprocess.run(
            [
                'yt-dlp',
                '--flat-playlist',
                '--dump-json',
                f'https://www.youtube.com/playlist?list={PLAYLIST_ID}',
            ],
            capture_output=True,
            text=True,
            # The return code is inspected below so stderr can be reported.
            check=False,
        )

        if result.returncode != 0:
            raise RuntimeError(f'Non-zero returncode in command: {result.returncode}\n\n{result.stderr}')

        for line in result.stdout.splitlines():
            if not line.strip():
                # A blank line carries no JSON document.
                continue
            data = json.loads(line)
            # Entries without thumbnails must not abort the whole scrape.
            thumbnails = data.get('thumbnails')
            data['thumbnail'] = thumbnails[-1]['url'] if thumbnails else None
            safe_del(data, '_type', '_version', 'thumbnails')
            yield data
|
|
||||||
|
|
|
@ -14,6 +14,13 @@ from . import csv_import, data
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def safe_del(d: dict, *keys: str) -> None:
    """Delete the given keys from ``d`` in place, skipping any that are absent.

    Args:
        d: Dictionary to mutate.
        *keys: Keys to delete. Missing or repeated keys are not an error.
    """
    for key in keys:
        # pop() with a default is a single lookup and never raises KeyError.
        d.pop(key, None)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def equals_without_fields(
|
def equals_without_fields(
|
||||||
a: Mapping[str, Any],
|
a: Mapping[str, Any],
|
||||||
b: Mapping[str, Any],
|
b: Mapping[str, Any],
|
||||||
|
|
Loading…
Reference in New Issue
Block a user