personal-data/obsidian_import/__init__.py

"""Obsidian Import.

Sub-module for importing time-based data into Obsidian.
"""

import dataclasses
import datetime
from collections.abc import Iterator
from logging import getLogger
from pathlib import Path
from typing import Any

from personal_data.activity import (
    ActivitySample,
    Label,
    RealizedActivitySample,
    heuristically_realize_samples,
    merge_adjacent_samples,
)
from personal_data.csv_import import determine_possible_keys, load_csv_file, start_end

from .obsidian import Event, ObsidianVault

logger = getLogger(__name__)

# A single CSV row: column name mapped to that row's value.
Row = dict[str, Any]
# A whole CSV file as a list of rows (what the import functions below take).
Rows = list[Row]


def iterate_samples_from_rows(rows: Rows) -> Iterator[ActivitySample]:
    """Yield one ``ActivitySample`` for every row in *rows*.

    The row in the middle of *rows* is handed to ``determine_possible_keys``
    and the resulting keys are applied to every row: ``start_end`` supplies
    the start/end pair, and each ``misc`` key that is present in the row
    becomes a ``Label``.

    *rows* must be non-empty and at least one start or end time key must
    have been detected; both conditions are checked with ``assert``.
    """
    assert len(rows) > 0

    # Hopefully select a useful representative.
    representative = rows[len(rows) // 2]
    possible_keys = determine_possible_keys(representative)
    logger.info('Found possible keys: %s', possible_keys)

    assert len(possible_keys.time_start) + len(possible_keys.time_end) >= 1
    assert len(possible_keys.image) >= 0

    for row in rows:
        start_at, end_at = start_end(row, possible_keys)
        row_labels = tuple(
            Label(key, row.get(key)) for key in possible_keys.misc if key in row
        )
        yield ActivitySample(labels=row_labels, start_at=start_at, end_at=end_at)


def import_workout_csv(vault: ObsidianVault, rows: Rows) -> int:
    """Write the workout columns of every row into *vault* as statistics.

    Each row is filed under its ``'Date'`` value. Columns that are renamed
    on the way in additionally get their old name written with ``None``
    (presumably clearing the old statistic -- confirm against
    ``ObsidianVault.add_statistic``).

    Returns the number of rows for which at least one ``add_statistic``
    call reported an update.
    """
    # Input column -> (statistic name, unit suffix). An empty unit means the
    # value is stored as-is; otherwise it is stringified with the unit appended.
    column_mapping = {
        'Cycling (mins)': ('Cycling (Duration)', 'minutes'),
        'Cycling (kcals)': ('Cycling (kcals)', ''),
        'Weight (Kg)': ('Weight (Kg)', ''),
    }

    num_updated = 0
    for row in rows:
        date = row['Date']
        row_changed = False

        for input_key, (output_key, unit) in column_mapping.items():
            value = row.get(input_key)
            if value is not None and unit:
                value = f'{value!s} {unit}'
            row_changed |= vault.add_statistic(date, output_key, value)
            if input_key != output_key:
                row_changed |= vault.add_statistic(date, input_key, None)

        if row_changed:
            num_updated += 1
    return num_updated


def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:
    """Sum the ``'Steps'`` of *rows* per day and store the totals in *vault*.

    Rows are bucketed by the date of their ``'Start'`` value. Days whose
    total is below ``MINIMUM`` are skipped entirely.

    Returns the number of days for which ``add_statistic`` reported an
    update.
    """
    MINIMUM = 300

    # Bucket the rows by calendar day first ...
    grouped = {}
    for row in rows:
        grouped.setdefault(row['Start'].date(), []).append(row)

    # ... then total every day before touching the vault.
    totals = {
        day: sum(entry['Steps'] for entry in day_rows)
        for day, day_rows in grouped.items()
    }

    num_updated = 0
    for day, total in totals.items():
        if total < MINIMUM:
            continue
        if vault.add_statistic(day, 'Steps', total):
            num_updated += 1

    return num_updated


def escape_for_obsidian_link(link: str) -> str:
    """Return *link* with ``:`` and ``/`` replaced and space runs collapsed.

    Every ``:`` and ``/`` becomes a single space; afterwards any run of
    consecutive spaces is reduced to exactly one space. Leading and trailing
    spaces are otherwise left alone.
    """
    escaped = link.replace(':', ' ').replace('/', ' ')
    # One ``replace('  ', ' ')`` pass only halves a run of spaces (e.g.
    # 'a: / b' would keep a double space), so repeat until none remain.
    while '  ' in escaped:
        escaped = escaped.replace('  ', ' ')
    return escaped


@dataclasses.dataclass(frozen=True)
class EventContent:
    """Textual parts of an event, as returned by a content mapper.

    ``import_activity_sample_csv`` copies ``verb`` and ``comment`` into the
    resulting ``Event`` unchanged and passes ``subject`` through
    ``escape_for_obsidian_link`` first.
    """

    # Action word, e.g. 'Watched' or 'Played'.
    verb: str
    # What the action was applied to (series name, game name, ...).
    subject: str
    # Free-form extra text; may be the empty string.
    comment: str


def import_activity_sample_csv(
    vault: ObsidianVault,
    rows: Rows,
    content_mapper,
    group_category: str | None = None,
    expected_tz: datetime.tzinfo | None = None,
) -> int:
    """Convert *rows* into events and add them to *vault*, one call per day.

    Args:
        vault: Vault that receives the events through ``add_events``.
        rows: CSV rows; turned into samples by ``iterate_samples_from_rows``
            and ``heuristically_realize_samples``.
        content_mapper: Callable taking a ``RealizedActivitySample`` and
            returning an object with ``verb``, ``subject`` and ``comment``
            attributes (see ``EventContent``).
        group_category: When given, the samples are additionally passed
            through ``merge_adjacent_samples`` with this category.
        expected_tz: Timezone in which event dates and times are expressed.
            Defaults to a fixed UTC+02:00 offset.

    Returns:
        The number of days for which ``add_events`` reported an update.
    """
    if expected_tz is None:
        # TODO: Determine this in a more intelligent manner
        expected_tz = datetime.timezone(datetime.timedelta(hours=2))

    samples = heuristically_realize_samples(list(iterate_samples_from_rows(rows)))

    if group_category is not None:
        samples = merge_adjacent_samples(list(samples), group_category)

    def to_local_minute(moment: datetime.datetime) -> datetime.datetime:
        # Express the moment in the target timezone, truncated to the minute.
        return moment.astimezone(expected_tz).replace(second=0, microsecond=0)

    # Group by the date in ``expected_tz`` -- the same timezone the event
    # times are rendered in -- so an event close to midnight is filed under
    # the day that matches its displayed start time.
    samples_per_date: dict[datetime.date, list[RealizedActivitySample]] = {}
    for sample in samples:
        local_date = sample.start_at.astimezone(expected_tz).date()
        samples_per_date.setdefault(local_date, []).append(sample)

    def map_to_event(sample: RealizedActivitySample) -> Event:
        content = content_mapper(sample)
        return Event(
            to_local_minute(sample.start_at).time(),
            to_local_minute(sample.end_at).time(),
            verb=content.verb,
            subject=escape_for_obsidian_link(content.subject),
            comment=content.comment,
        )

    num_updated = 0
    for date, date_samples in samples_per_date.items():
        events = [map_to_event(sample) for sample in date_samples]
        if vault.add_events(date, events):
            num_updated += 1

    return num_updated


def import_activity_sample_csv_from_file(
    vault: ObsidianVault,
    data_path: Path,
    content_mapper,
    **kwargs,
) -> int:
    """Load the CSV at *data_path* and import it as activity events.

    Args:
        vault: Vault that receives the events.
        data_path: CSV file, read with ``load_csv_file``.
        content_mapper: Forwarded to ``import_activity_sample_csv``.
        **kwargs: Forwarded unchanged to ``import_activity_sample_csv``
            (e.g. ``group_category``).

    Returns:
        The count reported by ``import_activity_sample_csv``.
    """
    rows = load_csv_file(data_path)
    logger.info('Loaded CSV with %d lines (%s)', len(rows), data_path)
    num_updated = import_activity_sample_csv(vault, rows, content_mapper, **kwargs)
    logger.info('Updated %d files', num_updated)
    # Previously fell off the end and returned None despite the ``-> int``
    # annotation; callers return this value onwards.
    return num_updated


def map_watched_series_content(sample: RealizedActivitySample) -> EventContent:
    """Build the 'Watched' event content for a watched-episode sample.

    The subject is the sample's ``series.name`` label; the comment combines
    the ``season.name``, ``episode.index`` and ``episode.name`` labels.
    """
    series = sample.single_label_with_category('series.name')
    season = sample.single_label_with_category('season.name')
    episode_index = sample.single_label_with_category('episode.index')
    episode_name = sample.single_label_with_category('episode.name')
    return EventContent(
        verb='Watched',
        subject=series,
        comment=f'{season} Episode {episode_index}: *{episode_name}*',
    )


def map_games_played_content(sample: RealizedActivitySample) -> EventContent:
    """Build the 'Played' event content for a played-game sample.

    The subject is the sample's ``game.name`` label; the comment is empty.
    """
    return EventContent(
        verb='Played',
        subject=sample.single_label_with_category('game.name'),
        comment='',
    )


def import_watched_series_csv_from_file(vault: ObsidianVault) -> int:
    """Import ``output/show_episodes_watched.csv`` into *vault* as 'Watched' events.

    The path is relative to the current working directory.
    """
    return import_activity_sample_csv_from_file(
        vault,
        Path('output/show_episodes_watched.csv'),
        map_watched_series_content,
    )


def import_played_games_csv_from_file(vault: ObsidianVault) -> int:
    """Import ``output/games_played.csv`` into *vault* as 'Played' events.

    Samples are grouped on the ``game.name`` category before import. The
    path is relative to the current working directory.
    """
    return import_activity_sample_csv_from_file(
        vault,
        Path('output/games_played.csv'),
        map_games_played_content,
        group_category='game.name',
    )


def import_data(obsidian_path: Path, dry_run=True):
    """Run the enabled importers against the vault at *obsidian_path*.

    With ``dry_run`` set the vault is opened with ``read_only='silent'`` and
    the cache is never flushed; otherwise ``read_only`` is ``None`` and
    ``flush_cache`` is called once all importers have run. The number of
    dirty and clean cached files is logged in both cases.
    """
    read_only_mode = 'silent' if dry_run else None
    vault = ObsidianVault(obsidian_path, read_only=read_only_mode)

    # Workout import -- currently switched off.
    if False:
        data_path = Path('/home/jmaa/Notes/workout.csv')
        rows = load_csv_file(data_path)
        logger.info('Loaded CSV with %d lines', len(rows))
        num_updated = import_workout_csv(vault, rows)
        logger.info('Updated %d files', num_updated)

    # Step count import -- currently switched off.
    if False:
        data_path = Path(
            '/home/jmaa/personal-archive/misc-data/step_counts_2023-07-26_to_2024-09-21.csv',
        )
        rows = load_csv_file(data_path)
        logger.info('Loaded CSV with %d lines', len(rows))
        num_updated = import_step_counts_csv(vault, rows)
        logger.info('Updated %d files', num_updated)

    import_watched_series_csv_from_file(vault)
    import_played_games_csv_from_file(vault)

    # Report how much of the cache would be (or is about to be) written.
    num_dirty = sum(
        1 for cached in vault.internal_file_text_cache.values() if cached.is_dirty
    )
    logger.info('dirty files in cache: %d', num_dirty)
    logger.info('clean files in cache: %d', len(vault.internal_file_text_cache) - num_dirty)

    if not dry_run:
        vault.flush_cache()
Obsidian import initial attempt 2024-10-03 21:23:47 +00:00			`"""Obsidian Import.`

			`Sub-module for importing time-based data into Obsidian.`
			`"""`

Improving event import 2024-10-21 21:38:07 +00:00			`import dataclasses`
Obsidian import initial attempt 2024-10-03 21:23:47 +00:00			`import datetime`
Ruff 2024-10-23 19:30:23 +00:00			`from collections.abc import Iterator`
Obsidian import initial attempt 2024-10-03 21:23:47 +00:00			`from logging import getLogger`
Ruff 2024-10-03 21:24:12 +00:00			`from pathlib import Path`
Step counts 2024-10-08 19:22:18 +00:00			`from typing import Any`
Ruff 2024-10-03 21:24:12 +00:00
Ruff 2024-10-23 19:30:23 +00:00			`from personal_data.activity import (`
			`ActivitySample,`
			`Label,`
			`RealizedActivitySample,`
			`heuristically_realize_samples,`
			`merge_adjacent_samples,`
Games played import works 2024-10-21 22:04:34 +00:00			`)`
Ruff 2024-10-23 19:30:23 +00:00			`from personal_data.csv_import import determine_possible_keys, load_csv_file, start_end`
Ruff 2024-10-03 21:24:12 +00:00
Ruff 2024-10-10 22:54:01 +00:00			`from .obsidian import Event, ObsidianVault`
Ruff 2024-10-03 21:24:12 +00:00
Obsidian import initial attempt 2024-10-03 21:23:47 +00:00			`logger = getLogger(__name__)`

Ruff 2024-10-10 22:54:01 +00:00			`Row = dict[str, Any]`
Support importing series events 2024-10-10 21:50:48 +00:00			`Rows = list[Row]`

Ruff 2024-10-23 19:30:23 +00:00
obsidian_import use imported functionality 2024-10-14 18:38:35 +00:00			`def iterate_samples_from_rows(rows: Rows) -> Iterator[ActivitySample]:`
			`assert len(rows) > 0`

			`if True:`
			`event_data = rows[len(rows) // 2] # Hopefully select a useful representative.`
			`possible_keys = determine_possible_keys(event_data)`
			`logger.info('Found possible keys: %s', possible_keys)`
			`del event_data`

			`assert len(possible_keys.time_start) + len(possible_keys.time_end) >= 1`
			`assert len(possible_keys.image) >= 0`

			`for event_data in rows:`
			`(start_at, end_at) = start_end(event_data, possible_keys)`
Added cache layer for texts. Allows for waiting until all changes have been effectuated before saving to disk. 2024-11-17 15:51:31 +00:00			`labels = [Label(k, event_data.get(k)) for k in possible_keys.misc if k in event_data]`
obsidian_import use imported functionality 2024-10-14 18:38:35 +00:00
			`# Create event`
			`yield ActivitySample(`
			`labels=tuple(labels),`
			`start_at=start_at,`
			`end_at=end_at,`
			`)`

			`del event_data`


Support importing series events 2024-10-10 21:50:48 +00:00			`def import_workout_csv(vault: ObsidianVault, rows: Rows) -> int:`
Obsidian import initial attempt 2024-10-03 21:23:47 +00:00			`num_updated = 0`
			`for row in rows:`
			`date = row['Date']`
			`was_updated = False`
			`mapping = {`
			`'Cycling (mins)': ('Cycling (Duration)', 'minutes'),`
			`'Cycling (kcals)': ('Cycling (kcals)', ''),`
			`'Weight (Kg)': ('Weight (Kg)', ''),`
			`}`

			`for input_key, (output_key, unit) in mapping.items():`
			`v = row.get(input_key)`
Remove mapped fields 2024-10-03 21:32:30 +00:00			`if v is not None:`
			`if unit:`
			`v = str(v) + ' ' + unit`
			`was_updated \|= vault.add_statistic(date, output_key, v)`
			`if input_key != output_key:`
			`was_updated \|= vault.add_statistic(date, input_key, None)`
Obsidian import initial attempt 2024-10-03 21:23:47 +00:00			`del input_key, output_key, unit, v`

			`if was_updated:`
			`num_updated += 1`
			`del row, date`
Step counts 2024-10-08 19:22:18 +00:00			`return num_updated`

Ruff 2024-10-10 22:54:01 +00:00
Support importing series events 2024-10-10 21:50:48 +00:00			`def import_step_counts_csv(vault: ObsidianVault, rows: Rows) -> int:`
Step counts 2024-10-08 19:22:18 +00:00			`MINIMUM = 300`

			`num_updated = 0`

Support importing series events 2024-10-10 21:50:48 +00:00			`rows_per_date = {}`
Step counts 2024-10-08 19:22:18 +00:00			`for row in rows:`
			`date = row['Start'].date()`
Support importing series events 2024-10-10 21:50:48 +00:00			`rows_per_date.setdefault(date, [])`
			`rows_per_date[date].append(row)`
Step counts 2024-10-08 19:22:18 +00:00			`del date, row`

Ruff 2024-10-10 22:54:01 +00:00			`steps_per_date = {`
			`date: sum(row['Steps'] for row in rows) for date, rows in rows_per_date.items()`
			`}`
Step counts 2024-10-08 19:22:18 +00:00
Support importing series events 2024-10-10 21:50:48 +00:00			`for date, steps in steps_per_date.items():`
Step counts 2024-10-08 19:22:18 +00:00			`if steps < MINIMUM:`
			`continue`
			`was_updated = vault.add_statistic(date, 'Steps', steps)`
			`if was_updated:`
			`num_updated += 1`
			`del date, steps, was_updated`

			`return num_updated`

Ruff 2024-10-23 19:30:23 +00:00
Data improvements 2024-10-20 16:27:32 +00:00			`def escape_for_obsidian_link(link: str) -> str:`
			`return link.replace(':', ' ').replace('/', ' ').replace(' ', ' ')`


Improving event import 2024-10-21 21:38:07 +00:00			`@dataclasses.dataclass(frozen=True)`
			`class EventContent:`
			`verb: str`
			`subject: str`
			`comment: str`
Ruff 2024-10-10 22:54:01 +00:00
Support importing series events 2024-10-10 21:50:48 +00:00
Ruff 2024-10-23 19:30:23 +00:00			`def import_activity_sample_csv(`
			`vault: ObsidianVault,`
			`rows: Rows,`
			`content_mapper,`
			`group_category: str \| None = None,`
			`) -> int:`
obsidian_import use imported functionality 2024-10-14 18:38:35 +00:00			`samples = heuristically_realize_samples(list(iterate_samples_from_rows(rows)))`
Support importing series events 2024-10-10 21:50:48 +00:00
Games played import works 2024-10-21 22:04:34 +00:00			`if group_category is not None:`
			`samples = merge_adjacent_samples(list(samples), group_category)`

obsidian_import use imported functionality 2024-10-14 18:38:35 +00:00			`samples_per_date: dict[datetime.date, list[RealizedActivitySample]] = {}`
			`for sample in samples:`
			`date: datetime.date = sample.start_at.date()`
			`samples_per_date.setdefault(date, [])`
			`samples_per_date[date].append(sample)`
			`del date, sample`
Support importing series events 2024-10-10 21:50:48 +00:00			`del rows`

obsidian_import use imported functionality 2024-10-14 18:38:35 +00:00			`def map_to_event(sample: RealizedActivitySample) -> Event:`
Improving event import 2024-10-21 21:38:07 +00:00			`content = content_mapper(sample)`
Ruff 2024-10-23 19:30:23 +00:00			`expected_tz = datetime.timezone(`
			`datetime.timedelta(hours=2),`
			`) # TODO: Determine this in a more intelligent manner`
			`return Event(`
			`sample.start_at.astimezone(expected_tz)`
			`.replace(second=0, microsecond=0)`
			`.time(),`
			`sample.end_at.astimezone(expected_tz)`
			`.replace(second=0, microsecond=0)`
			`.time(),`
			`verb=content.verb,`
			`subject=escape_for_obsidian_link(content.subject),`
			`comment=content.comment,`
Ruff 2024-10-10 22:54:01 +00:00			`)`
ObsidianVault now supports events 2024-10-08 20:57:41 +00:00
obsidian_import use imported functionality 2024-10-14 18:38:35 +00:00			`num_updated = 0`

			`for date, samples in samples_per_date.items():`
			`events = [map_to_event(sample) for sample in samples]`
Support importing series events 2024-10-10 21:50:48 +00:00			`was_updated = vault.add_events(date, events)`

			`if was_updated:`
			`num_updated += 1`
			`del date, was_updated`

			`return num_updated`
ObsidianVault now supports events 2024-10-08 20:57:41 +00:00
Ruff 2024-10-23 19:30:23 +00:00
			`def import_activity_sample_csv_from_file(`
			`vault: ObsidianVault,`
			`data_path: Path,`
			`content_mapper,`
			`**kwargs,`
			`) -> int:`
Improving event import 2024-10-21 21:38:07 +00:00			`rows = load_csv_file(data_path)`
			`logger.info('Loaded CSV with %d lines (%s)', len(rows), data_path)`
Games played import works 2024-10-21 22:04:34 +00:00			`num_updated = import_activity_sample_csv(vault, rows, content_mapper, **kwargs)`
Improving event import 2024-10-21 21:38:07 +00:00			`logger.info('Updated %d files', num_updated)`

Ruff 2024-10-23 19:30:23 +00:00
Improving event import 2024-10-21 21:38:07 +00:00			`def map_watched_series_content(sample: RealizedActivitySample) -> EventContent:`
			`subject = sample.single_label_with_category('series.name')`
			`comment = '{} Episode {}: {}'.format(`
			`sample.single_label_with_category('season.name'),`
			`sample.single_label_with_category('episode.index'),`
			`sample.single_label_with_category('episode.name'),`
			`)`
			`return EventContent(`
Ruff 2024-10-23 19:30:23 +00:00			`verb='Watched',`
			`subject=subject,`
			`comment=comment,`
Improving event import 2024-10-21 21:38:07 +00:00			`)`

Ruff 2024-10-23 19:30:23 +00:00
Improving event import 2024-10-21 21:38:07 +00:00			`def map_games_played_content(sample: RealizedActivitySample) -> EventContent:`
			`subject = sample.single_label_with_category('game.name')`
Games played import works 2024-10-21 22:04:34 +00:00			`comment = ''`
Improving event import 2024-10-21 21:38:07 +00:00			`return EventContent(`
Ruff 2024-10-23 19:30:23 +00:00			`verb='Played',`
			`subject=subject,`
			`comment=comment,`
Improving event import 2024-10-21 21:38:07 +00:00			`)`

Ruff 2024-10-23 19:30:23 +00:00
Improving event import 2024-10-21 21:38:07 +00:00			`def import_watched_series_csv_from_file(vault: ObsidianVault) -> int:`
			`data_path = Path('output/show_episodes_watched.csv')`
Ruff 2024-10-23 19:30:23 +00:00			`return import_activity_sample_csv_from_file(`
			`vault,`
			`data_path,`
			`map_watched_series_content,`
			`)`

Improving event import 2024-10-21 21:38:07 +00:00
			`def import_played_games_csv_from_file(vault: ObsidianVault) -> int:`
Merged all dataset outputs 2024-10-25 19:47:44 +00:00			`data_path = Path('output/games_played.csv')`
Ruff 2024-10-23 19:30:23 +00:00			`return import_activity_sample_csv_from_file(`
			`vault,`
			`data_path,`
			`map_games_played_content,`
			`group_category='game.name',`
			`)`

Ruff 2024-10-10 22:54:01 +00:00
Step counts 2024-10-08 19:22:18 +00:00			`def import_data(obsidian_path: Path, dry_run=True):`
			`vault = ObsidianVault(obsidian_path, read_only=dry_run and 'silent' or None)`

ObsidianVault now supports events 2024-10-08 20:57:41 +00:00			`if False:`
			`data_path = Path('/home/jmaa/Notes/workout.csv')`
			`rows = load_csv_file(data_path)`
			`logger.info('Loaded CSV with %d lines', len(rows))`
			`num_updated = import_workout_csv(vault, rows)`
			`logger.info('Updated %d files', num_updated)`

			`if False:`
Ruff 2024-10-10 22:54:01 +00:00			`data_path = Path(`
			`'/home/jmaa/personal-archive/misc-data/step_counts_2023-07-26_to_2024-09-21.csv',`
			`)`
ObsidianVault now supports events 2024-10-08 20:57:41 +00:00			`rows = load_csv_file(data_path)`
			`logger.info('Loaded CSV with %d lines', len(rows))`
			`num_updated = import_step_counts_csv(vault, rows)`
			`logger.info('Updated %d files', num_updated)`

Improving event import 2024-10-21 21:38:07 +00:00			`import_watched_series_csv_from_file(vault)`
			`import_played_games_csv_from_file(vault)`
Added cache layer for texts. Allows for waiting until all changes have been effectuated before saving to disk. 2024-11-17 15:51:31 +00:00
			`num_dirty = len([f for f in vault.internal_file_text_cache.values() if f.is_dirty])`
			`logger.info('dirty files in cache: %d', num_dirty)`
			`logger.info('clean files in cache: %d', len(vault.internal_file_text_cache) - num_dirty)`
			`if not dry_run:`
			`vault.flush_cache()`