1
0

Improved csv sniffing

This commit is contained in:
Jon Michael Aanes 2025-03-16 00:02:08 +01:00
parent 857be3cf2f
commit af42e3ba90
3 changed files with 20 additions and 6 deletions

View File

@@ -261,17 +261,22 @@ PATH_WATCHED = Path('output/show_episodes_watched.csv')
PATH_PLAYED = Path('output/games_played.csv') PATH_PLAYED = Path('output/games_played.csv')
PATH_WORKOUT = Path('/home/jmaa/Notes/workout.csv') PATH_WORKOUT = Path('/home/jmaa/Notes/workout.csv')
PATH_STEP_COUNTS = Path( PATH_STEP_COUNTS = Path(
'/home/jmaa/personal-archive/misc-data/step_counts_2023-07-26_to_2024-09-21.csv', '/home/jmaa/Notes/Rawbackupdata/Steps/exportStepCount_2025-03-15_22-58-20',
) )
PATH_STEPMANIA = Path('output/stepmania.csv') PATH_STEPMANIA = Path('output/stepmania.csv')
IMPORTERS = [ IMPORTERS = [
{'path': PATH_WORKOUT, 'import_rows': import_workout_csv}, {'path': PATH_WORKOUT, 'standard_variant': True, 'import_rows': import_workout_csv},
{'path': PATH_STEP_COUNTS, 'import_rows': import_step_counts_csv}, {'path': PATH_STEP_COUNTS, 'import_rows': import_step_counts_csv},
{'path': PATH_STEPMANIA, 'import_rows': import_stepmania_steps_csv}, {
'path': PATH_STEPMANIA,
'standard_variant': True,
'import_rows': import_stepmania_steps_csv,
},
{ {
'path': PATH_PLAYED, 'path': PATH_PLAYED,
'standard_variant': True,
'import_rows': lambda vault, rows: import_activity_sample_csv( 'import_rows': lambda vault, rows: import_activity_sample_csv(
vault, vault,
rows, rows,
@@ -281,6 +286,7 @@ IMPORTERS = [
}, },
{ {
'path': PATH_WATCHED, 'path': PATH_WATCHED,
'standard_variant': True,
'import_rows': lambda vault, rows: import_activity_sample_csv( 'import_rows': lambda vault, rows: import_activity_sample_csv(
vault, vault,
rows, rows,
@@ -301,7 +307,9 @@ def import_data(obsidian_path: Path, dry_run=True):
import_def['path'], import_def['path'],
) )
continue continue
rows = load_csv_file(import_def['path']) rows = load_csv_file(
import_def['path'], sniff=not import_def.get('standard_variant'),
)
logger.info('Loaded CSV with %d lines', len(rows)) logger.info('Loaded CSV with %d lines', len(rows))
num_files_updated = import_def['import_rows'](vault, rows) num_files_updated = import_def['import_rows'](vault, rows)
logger.info('Updated %d files', num_files_updated) logger.info('Updated %d files', num_files_updated)

View File

@@ -2,6 +2,7 @@ import csv
import dataclasses import dataclasses
import datetime import datetime
import decimal import decimal
import logging
import typing import typing
import urllib.parse import urllib.parse
from collections.abc import Callable from collections.abc import Callable
@@ -11,6 +12,8 @@ from typing import Any
from frozendict import frozendict from frozendict import frozendict
logger = logging.getLogger(__name__)
CSV_DIALECT = 'one_true_dialect' CSV_DIALECT = 'one_true_dialect'
csv.register_dialect(CSV_DIALECT, lineterminator='\n', skipinitialspace=True) csv.register_dialect(CSV_DIALECT, lineterminator='\n', skipinitialspace=True)
@@ -86,10 +89,12 @@ def load_csv_file(csv_file: Path, sniff=False) -> list[frozendict[str, typing.An
dicts: list[frozendict] = [] dicts: list[frozendict] = []
with open(csv_file) as csvfile: with open(csv_file) as csvfile:
if sniff: if sniff:
dialect = csv.Sniffer().sniff(csvfile.read(1024)) logger.warning('Sniffing CSV variant: %s', csv_file)
dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=',;')
csvfile.seek(0) csvfile.seek(0)
else: else:
dialect = CSV_DIALECT dialect = CSV_DIALECT
logger.warning('Loading CSV file: %s', csv_file)
reader = csv.DictReader(csvfile, dialect=dialect) reader = csv.DictReader(csvfile, dialect=dialect)
for row in reader: for row in reader:
for k in list(row.keys()): for k in list(row.keys()):

View File

@@ -22,7 +22,8 @@ def scrape(watch_history: bool) -> list[dict[str, str]]:
'yt-dlp', 'yt-dlp',
url, url,
'--dump-json', '--dump-json',
'--cookies-from-browser', 'firefox:/home/jmaa/.cachy/mbui5xg7.default-release', '--cookies-from-browser',
'firefox:/home/jmaa/.cachy/mbui5xg7.default-release',
] ]
else: else:
url = f'https://www.youtube.com/playlist?list={PLAYLIST_ID}' url = f'https://www.youtube.com/playlist?list={PLAYLIST_ID}'