Compare commits

10 commits: 9058279b4e ... e787779b58

Commits:
e787779b58
af42e3ba90
857be3cf2f
c3d2b98bb5
638a3ae842
c4291f0b60
f5446d9093
f7894f9d05
103235759c
552b2ea365
@@ -261,17 +261,22 @@ PATH_WATCHED = Path('output/show_episodes_watched.csv')
 PATH_PLAYED = Path('output/games_played.csv')
 PATH_WORKOUT = Path('/home/jmaa/Notes/workout.csv')
 PATH_STEP_COUNTS = Path(
-    '/home/jmaa/personal-archive/misc-data/step_counts_2023-07-26_to_2024-09-21.csv',
+    '/home/jmaa/Notes/Rawbackupdata/Steps/exportStepCount_2025-03-15_22-58-20',
 )
 PATH_STEPMANIA = Path('output/stepmania.csv')


 IMPORTERS = [
-    {'path': PATH_WORKOUT, 'import_rows': import_workout_csv},
+    {'path': PATH_WORKOUT, 'standard_variant': True, 'import_rows': import_workout_csv},
     {'path': PATH_STEP_COUNTS, 'import_rows': import_step_counts_csv},
-    {'path': PATH_STEPMANIA, 'import_rows': import_stepmania_steps_csv},
+    {
+        'path': PATH_STEPMANIA,
+        'standard_variant': True,
+        'import_rows': import_stepmania_steps_csv,
+    },
     {
         'path': PATH_PLAYED,
+        'standard_variant': True,
         'import_rows': lambda vault, rows: import_activity_sample_csv(
             vault,
             rows,
@@ -281,6 +286,7 @@ IMPORTERS = [
     },
     {
         'path': PATH_WATCHED,
+        'standard_variant': True,
         'import_rows': lambda vault, rows: import_activity_sample_csv(
             vault,
             rows,
@@ -301,7 +307,9 @@ def import_data(obsidian_path: Path, dry_run=True):
                 import_def['path'],
             )
             continue
-        rows = load_csv_file(import_def['path'])
+        rows = load_csv_file(
+            import_def['path'], sniff=not import_def.get('standard_variant'),
+        )
         logger.info('Loaded CSV with %d lines', len(rows))
         num_files_updated = import_def['import_rows'](vault, rows)
         logger.info('Updated %d files', num_files_updated)

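The new `standard_variant` flag marks CSV files produced by this codebase's own writer, so only files of unknown origin get dialect-sniffed. A minimal, self-contained sketch of the dispatch pattern (names such as `import_workout_csv` and the file paths are illustrative stand-ins, not the project's real importers):

    from pathlib import Path

    def import_workout_csv(vault: dict, rows: list[dict]) -> int:
        # Hypothetical stand-in for a real importer callback.
        vault['workout'] = rows
        return 1

    IMPORTERS = [
        # standard_variant=True: written by our own CSV writer, no sniffing needed.
        {'path': Path('workout.csv'), 'standard_variant': True,
         'import_rows': import_workout_csv},
        # No flag: origin unknown, so the loader will sniff the dialect.
        {'path': Path('steps.csv'), 'import_rows': import_workout_csv},
    ]

    for import_def in IMPORTERS:
        sniff = not import_def.get('standard_variant')
        print(import_def['path'], 'sniff =', sniff)

Using `not import_def.get('standard_variant')` keeps the flag optional: entries that omit it default to sniffing.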
@@ -2,6 +2,7 @@ import csv
 import dataclasses
 import datetime
 import decimal
+import logging
 import typing
 import urllib.parse
 from collections.abc import Callable
@@ -11,6 +12,8 @@ from typing import Any

 from frozendict import frozendict

+logger = logging.getLogger(__name__)
-
+CSV_DIALECT = 'one_true_dialect'
+csv.register_dialect(CSV_DIALECT, lineterminator='\n', skipinitialspace=True)


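Registering a named dialect pins down exactly how this project's own CSV files are written and read: '\n' line endings, and tolerance for a space after the delimiter. A small round-trip sketch using the same registration:

    import csv
    import io

    CSV_DIALECT = 'one_true_dialect'
    csv.register_dialect(CSV_DIALECT, lineterminator='\n', skipinitialspace=True)

    buf = io.StringIO()
    writer = csv.writer(buf, dialect=CSV_DIALECT)
    writer.writerow(['date', 'steps'])
    writer.writerow(['2025-03-15', '1234'])

    # Reading back with the same dialect recovers the rows exactly.
    buf.seek(0)
    rows = list(csv.DictReader(buf, dialect=CSV_DIALECT))
    assert rows == [{'date': '2025-03-15', 'steps': '1234'}]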
@@ -86,10 +89,12 @@ def load_csv_file(csv_file: Path, sniff=False) -> list[frozendict[str, typing.Any]]:
     dicts: list[frozendict] = []
     with open(csv_file) as csvfile:
         if sniff:
-            dialect = csv.Sniffer().sniff(csvfile.read(1024))
+            logger.warning('Sniffing CSV variant: %s', csv_file)
+            dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=',;')
             csvfile.seek(0)
         else:
             dialect = CSV_DIALECT
+        logger.warning('Loading CSV file: %s', csv_file)
         reader = csv.DictReader(csvfile, dialect=dialect)
         for row in reader:
             for k in list(row.keys()):

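Passing delimiters=',;' restricts csv.Sniffer to plausible candidates instead of letting it guess from any punctuation in the 1024-byte sample; the seek(0) afterwards rewinds so DictReader re-reads from the top. A tiny demonstration on invented data:

    import csv

    sample = 'date;steps\n2025-03-15;1234\n'
    dialect = csv.Sniffer().sniff(sample, delimiters=',;')
    assert dialect.delimiter == ';'

Without the restriction, characters like the '-' in ISO dates can mislead the sniffer on short samples.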
@@ -4,9 +4,9 @@ from collections.abc import Iterator, Mapping
 from typing import Any

 from personal_data.data import DeduplicateMode, Scraper
-from ..util import safe_del

 from .. import secrets
+from ..util import safe_del

 logger = logging.getLogger(__name__)

@@ -1,41 +1,87 @@
 import csv
+import datetime
 import json
 import logging
 import subprocess
 from dataclasses import dataclass
+from typing import ClassVar

 from personal_data.data import DeduplicateMode, Scraper

 from ..util import safe_del

 logger = logging.getLogger(__name__)

-PLAYLIST_ID='PLAfDVJvDKCvOMvfoTL7eW8GkWNJwd90eV'
-#PLAYLIST_ID='LL'
+PLAYLIST_ID = 'PLAfDVJvDKCvOMvfoTL7eW8GkWNJwd90eV'


+def scrape(watch_history: bool) -> list[dict[str, str]]:
+    """Use yt-dlp to fetch the list of favorited videos. This is a placeholder for invoking yt-dlp and parsing its output."""
+    if watch_history:
+        url = 'https://www.youtube.com/feed/history'
+        ytdlp_args = [
+            'yt-dlp',
+            url,
+            '--dump-json',
+            '--cookies-from-browser',
+            'firefox:/home/jmaa/.cachy/mbui5xg7.default-release',
+        ]
+    else:
+        url = f'https://www.youtube.com/playlist?list={PLAYLIST_ID}'
+        ytdlp_args = [
+            'yt-dlp',
+            '--flat-playlist',
+            '--dump-json',
+            url,
+        ]
+
+    result = subprocess.run(
+        ytdlp_args,
+        capture_output=True,
+        text=True,
+    )
+
+    if result.returncode != 0:
+        message = (
+            'Non-zero returncode in command: '
+            + str(result.returncode)
+            + '\n\n'
+            + result.stderr
+        )
+        raise RuntimeError(message)
+
+    output = []
+    for line in result.stdout.splitlines():
+        data = json.loads(line)
+        if watch_history:
+            if data.get('thumbnails'):
+                data['thumbnail'] = data['thumbnails'][-1]['url']
+            if data.get('timestamp'):
+                data['watch_datetime'] = datetime.datetime.fromtimestamp(
+                    int(data['timestamp']),
+                    tz=datetime.timezone.utc,
+                ).isoformat()
+        else:
+            data['thumbnail'] = data['thumbnails'][-1]['url']
+        safe_del(data, '_type', '_version', 'thumbnails')
+        output.append(data)
+    return output
+
+
 @dataclass(frozen=True)
 class YoutubeFavoritesScraper(Scraper):
     dataset_name: str = 'youtube_favorites'
     deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
-    deduplicate_ignore_columns = []
+    deduplicate_ignore_columns: ClassVar[list[str]] = []

     def scrape(self) -> list[dict]:
         """Use yt-dlp to fetch the list of favorited videos. This is a placeholder for invoking yt-dlp and parsing its output."""
-        result = subprocess.run(
-            [
-                'yt-dlp',
-                '--flat-playlist',
-                '--dump-json',
-                f'https://www.youtube.com/playlist?list={PLAYLIST_ID}',
-            ],
-            capture_output=True,
-            text=True,
-        )
+        yield from scrape(watch_history=False)

-        if result.returncode != 0:
-            raise RuntimeError(f'Non-zero returncode in command: {result.returncode}\n\n{result.stderr}')

-        for line in result.stdout.splitlines():
-            data = json.loads(line)
-            data['thumbnail'] = data['thumbnails'][-1]['url']
-            safe_del(data, '_type', '_version', 'thumbnails')
-            yield data
+
+
+@dataclass(frozen=True)
+class YoutubeWatchHistoryScraper(Scraper):
+    dataset_name: str = 'youtube_watch_history'
+    deduplicate_mode: DeduplicateMode = DeduplicateMode.BY_ALL_COLUMNS
+    deduplicate_ignore_columns: ClassVar[list[str]] = []
+
+    def scrape(self) -> list[dict]:
+        yield from scrape(watch_history=True)

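yt-dlp's --dump-json prints one JSON object per line (one per video), which is why the loop splits stdout on newlines rather than parsing it as a single document. A self-contained sketch of the same normalization, run on fabricated lines (all field values invented for illustration; real entries carry many more keys):

    import datetime
    import json

    stdout = '\n'.join([
        json.dumps({'id': 'abc123', 'title': 'First', 'timestamp': 1742076000,
                    'thumbnails': [{'url': 'https://example.org/small.jpg'},
                                   {'url': 'https://example.org/large.jpg'}]}),
        json.dumps({'id': 'def456', 'title': 'Second', 'thumbnails': []}),
    ])

    for line in stdout.splitlines():
        data = json.loads(line)
        # The last thumbnail entry is typically the largest variant.
        if data.get('thumbnails'):
            data['thumbnail'] = data['thumbnails'][-1]['url']
        # Watch-history entries carry a Unix timestamp; normalize to ISO 8601 UTC.
        if data.get('timestamp'):
            data['watch_datetime'] = datetime.datetime.fromtimestamp(
                int(data['timestamp']), tz=datetime.timezone.utc,
            ).isoformat()
        print(data['id'], data.get('watch_datetime'))

The `.get(...)` guards matter for history entries, which can lack thumbnails or timestamps entirely.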
@@ -91,6 +91,19 @@ def available_scraper_names() -> list[str]:
     return [scraper_cls.__name__ for scraper_cls in available_scrapers()]


+def get_cookiejar(use_cookiejar: bool):
+    if use_cookiejar:
+        logger.warning('Got cookiejar from firefox')
+        cookiejar = browsercookie.firefox()
+        if len(cookiejar) > 10:
+            return cookiejar
+        browsercookie.firefox(['/home/jmaa/.cachy/mbui5xg7.default-release/cookies.sqlite'])
+        if len(cookiejar) > 10:
+            return cookiejar
+    logger.warning('No cookiejar is used')
+    return []
+
+
 def main(
     scraper_filter: frozenset[str],
     *,
@@ -98,12 +111,8 @@ def main(
     ignore_cache: bool,
     notification_types: frozenset[notification.NotificationType],
 ) -> None:
-    if use_cookiejar:
-        cookiejar = browsercookie.firefox()
-        logger.info('Got cookiejar from firefox: %s cookies', len(cookiejar))
-    else:
-        cookiejar = []
-        logger.warning('No cookiejar is used')
+    cookiejar = get_cookiejar(use_cookiejar)
+    logger.warning('Cookiejar has %s cookies', len(cookiejar))

     if len(notification_types) == 0:
         logger.info('No notifications enabled: Notifications will not be sent!')

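Note that in get_cookiejar as committed, the second browsercookie.firefox(...) call discards its return value, so the retry length check re-tests the original jar and the explicit-profile fallback can never succeed. A sketch of the presumably intended behavior (same profile path as in the diff; that browsercookie accepts an explicit cookie-file list is assumed from the call above):

    import browsercookie

    def get_cookiejar(use_cookiejar: bool):
        if use_cookiejar:
            cookiejar = browsercookie.firefox()
            if len(cookiejar) > 10:
                return cookiejar
            # Assign the fallback result so the second check can see it.
            cookiejar = browsercookie.firefox(
                ['/home/jmaa/.cachy/mbui5xg7.default-release/cookies.sqlite'],
            )
            if len(cookiejar) > 10:
                return cookiejar
        return []

The `> 10` threshold treats a near-empty jar as a failed lookup rather than a usable session.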
@@ -20,7 +20,6 @@ def safe_del(d: dict, *keys: str):
         del d[key]

-

 def equals_without_fields(
     a: Mapping[str, Any],
     b: Mapping[str, Any],
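The hunk above shows only the last line of safe_del's body. From its call sites, such as safe_del(data, '_type', '_version', 'thumbnails'), the helper presumably deletes each key that is present and silently skips the rest; a sketch consistent with that usage:

    def safe_del(d: dict, *keys: str) -> None:
        # Remove each key if present; silently skip missing keys.
        for key in keys:
            if key in d:
                del d[key]

    row = {'_type': 'url', 'title': 'Example'}
    safe_del(row, '_type', '_version')
    assert row == {'title': 'Example'}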