Restructure youtube
This commit is contained in:
parent
103235759c
commit
f7894f9d05
|
@ -4,9 +4,9 @@ from collections.abc import Iterator, Mapping
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from personal_data.data import DeduplicateMode, Scraper
|
from personal_data.data import DeduplicateMode, Scraper
|
||||||
from ..util import safe_del
|
|
||||||
|
|
||||||
from .. import secrets
|
from .. import secrets
|
||||||
|
from ..util import safe_del
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,56 @@ from ..util import safe_del
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
PLAYLIST_ID = 'PLAfDVJvDKCvOMvfoTL7eW8GkWNJwd90eV'
|
PLAYLIST_ID = 'PLAfDVJvDKCvOMvfoTL7eW8GkWNJwd90eV'
|
||||||
# PLAYLIST_ID='LL'
|
|
||||||
|
|
||||||
|
def scrape(watch_history: bool) -> list[dict[str, str]]:
    """Fetch video metadata from YouTube by shelling out to yt-dlp.

    When ``watch_history`` is true, dumps the authenticated watch-history
    feed; otherwise dumps the playlist identified by ``PLAYLIST_ID`` in
    flat (no per-video page fetch) mode.

    Returns:
        One dict per video, as parsed from yt-dlp's ``--dump-json`` line
        output, with ``thumbnail`` flattened to the last entry of the
        ``thumbnails`` list and (history mode only) ``watch_datetime``
        derived from the ``timestamp`` field.

    Raises:
        RuntimeError: if yt-dlp exits with a non-zero return code.
    """
    if watch_history:
        url = 'https://www.youtube.com/feed/history'
        ytdlp_args = [
            'yt-dlp',
            '--dump-json',
            url,
        ]
    else:
        url = f'https://www.youtube.com/playlist?list={PLAYLIST_ID}'
        ytdlp_args = [
            'yt-dlp',
            '--flat-playlist',
            '--dump-json',
            url,
        ]

    logger.debug('Running command: %s', ytdlp_args)
    result = subprocess.run(
        ytdlp_args,
        capture_output=True,
        text=True,
    )

    if result.returncode != 0:
        raise RuntimeError(
            f'Non-zero returncode in command: {result.returncode}\n\n{result.stderr}',
        )

    # Surface yt-dlp's own diagnostics through the module logger instead
    # of raw print() calls.
    logger.debug('yt-dlp stderr: %s', result.stderr)
    logger.debug('yt-dlp stdout: %s', result.stdout)

    output = []
    # --dump-json emits one JSON object per line (JSON Lines).
    for line in result.stdout.splitlines():
        data = json.loads(line)
        # Flatten the last listed thumbnail into a single URL column.
        # Guard against a missing/empty 'thumbnails' key in BOTH modes;
        # the original only guarded the watch-history branch and would
        # raise KeyError/IndexError on playlist entries without one.
        if 'thumbnails' in data and data['thumbnails']:
            data['thumbnail'] = data['thumbnails'][-1]['url']
        if watch_history and 'timestamp' in data:
            # NOTE(review): fromtimestamp() interprets the epoch value in
            # the local timezone; presumably yt-dlp timestamps are UTC
            # epoch seconds — confirm and consider tz=datetime.UTC.
            data['watch_datetime'] = datetime.datetime.fromtimestamp(
                int(data['timestamp']),
            ).isoformat()
        # Drop yt-dlp bookkeeping fields and the raw thumbnail list.
        safe_del(data, '_type', '_version', 'thumbnails')
        output.append(data)
    return output
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
class YoutubeFavoritesScraper(Scraper):
    """Scraper for the videos of the favorites playlist (``PLAYLIST_ID``)."""

    dataset_name: str = 'youtube_favorites'
    deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
    # NOTE(review): unannotated, so NOT a dataclass field — a class-level
    # shared list. Assumed read-only by the framework; confirm, else
    # annotate with ClassVar and a per-class tuple.
    deduplicate_ignore_columns = []

    def scrape(self) -> Iterator[dict]:
        """Yield one row per favorited video via the module-level scrape().

        Annotated as Iterator[dict] because ``yield from`` makes this a
        generator; the previous ``list[dict]`` annotation was wrong.
        """
        yield from scrape(watch_history=False)
|
||||||
if self.watch_history:
|
|
||||||
url = 'https://www.youtube.com/feed/history'
|
|
||||||
ytdlp_args = [
|
|
||||||
'yt-dlp',
|
|
||||||
'--dump-json',
|
|
||||||
url,
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
url = f'https://www.youtube.com/playlist?list={PLAYLIST_ID}'
|
|
||||||
ytdlp_args = [
|
|
||||||
'yt-dlp',
|
|
||||||
'--flat-playlist',
|
|
||||||
'--dump-json',
|
|
||||||
url,
|
|
||||||
]
|
|
||||||
result = subprocess.run(
|
|
||||||
ytdlp_args,
|
|
||||||
capture_output=True,
|
|
||||||
text=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
if result.returncode != 0:
|
|
||||||
raise RuntimeError(
|
|
||||||
f'Non-zero returncode in command: {result.returncode}\n\n{result.stderr}',
|
|
||||||
)
|
|
||||||
|
|
||||||
output = []
|
@dataclass(frozen=True)
class YoutubeWatchHistoryScraper(Scraper):
    """Scraper for the authenticated YouTube watch-history feed."""

    dataset_name: str = 'youtube_watch_history'
    deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
    # NOTE(review): unannotated, so NOT a dataclass field — a class-level
    # shared list. Assumed read-only by the framework; confirm, else
    # annotate with ClassVar and a per-class tuple.
    deduplicate_ignore_columns = []

    def scrape(self) -> Iterator[dict]:
        """Yield one row per watched video via the module-level scrape().

        Annotated as Iterator[dict] because ``yield from`` makes this a
        generator; the previous ``list[dict]`` annotation was wrong.
        """
        yield from scrape(watch_history=True)
|
||||||
int(data['timestamp']),
|
|
||||||
).isoformat()
|
|
||||||
else:
|
|
||||||
data['thumbnail'] = data['thumbnails'][-1]['url']
|
|
||||||
safe_del(data, '_type', '_version', 'thumbnails')
|
|
||||||
output.append(data)
|
|
||||||
return output
|
|
||||||
|
|
|
@ -20,7 +20,6 @@ def safe_del(d: dict, *keys: str):
|
||||||
del d[key]
|
del d[key]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def equals_without_fields(
|
def equals_without_fields(
|
||||||
a: Mapping[str, Any],
|
a: Mapping[str, Any],
|
||||||
b: Mapping[str, Any],
|
b: Mapping[str, Any],
|
||||||
|
|
Loading…
Reference in New Issue
Block a user