From 99513481649665af16f389c2995bf77653a4c35b Mon Sep 17 00:00:00 2001 From: Jon Michael Aanes Date: Thu, 31 Oct 2024 22:37:50 +0100 Subject: [PATCH] Automatically select period key --- libpurple_to_markdown/__main__.py | 61 +++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/libpurple_to_markdown/__main__.py b/libpurple_to_markdown/__main__.py index d8c3829..3c7f9dc 100644 --- a/libpurple_to_markdown/__main__.py +++ b/libpurple_to_markdown/__main__.py @@ -1,7 +1,8 @@ import argparse import logging from pathlib import Path -from collections.abc import Iterable +from collections.abc import Iterable, Iterator +import dataclasses from . import ( filter_useless_messages, @@ -10,32 +11,60 @@ from . import ( merge_adjacent_messages, synctech_sms, ) -from .data import Message +from .data import Message, MYSELF logger = logging.getLogger(__name__) +def group_messages(messages: Iterable[Message], key) -> dict[str, list[Message]]: + by_key: dict[str, list[Message]] = {} + for msg in messages: + by_key.setdefault(key(msg), []).append(msg) + del msg + return by_key + def group_messages_by_chat_id(messages: Iterable[Message]) -> dict[str, list[Message]]: - by_period: dict[str, list[Message]] = {} - for msg in messages: - by_period.setdefault(msg.chat_id, []).append(msg) - del msg - return by_period + return group_messages(messages, key=lambda msg: msg.chat_id) + +def year_and_month_period_key(msg: Message): + return f'{msg.sent_at.year}-{msg.sent_at.month:02}' + +def year_period_key(msg: Message): + return f'{msg.sent_at.year}' + +def year_quarter_period_key(msg: Message): + quarter = int((msg.sent_at.month-1)/3)+1 + return f'{msg.sent_at.year}-Q{quarter:01}' + +MAX_AVERAGE_MESSAGES_PER_PERIOD = 120 def group_messages_by_period(messages: Iterable[Message]) -> dict[str, list[Message]]: - by_period: dict[str, list[Message]] = {} - for msg in messages: - period_key = f'{msg.sent_at.year}-{msg.sent_at.month:02}' - by_period.setdefault(period_key, []).append(msg) - del msg - return by_period + possible_period_keys = [(lambda msg: 'Full History'), year_period_key, year_quarter_period_key, year_and_month_period_key] + + for period_key in possible_period_keys: + grouped = group_messages(messages, key=period_key) + average_num_messages = sum(len(grouped[k]) for k in grouped) / len(grouped) + if average_num_messages <= MAX_AVERAGE_MESSAGES_PER_PERIOD: + break + + del period_key, average_num_messages + + return grouped + +def replace_myself(messages: Iterable[Message], myself: str) -> Iterator[Message]: + for msg in messages: + if msg.sender == MYSELF: + yield dataclasses.replace(msg, sender = myself) + else: + yield msg def parse_args(): parser = argparse.ArgumentParser() parser.add_argument('--purple', type=Path, dest='purple_folder') parser.add_argument('--synctech', type=Path, dest='synctech_sms_backup_file') parser.add_argument('--output', type=Path) + parser.add_argument('--myself', type=str, default='Myself') return parser.parse_args() @@ -52,6 +81,7 @@ def main(): logger.fatal('No input file given!') return + all_messages = replace_myself(all_messages, myself=args.myself) all_messages = list(all_messages) logger.info('%d messages after loading', len(all_messages)) @@ -64,11 +94,12 @@ def main(): for chat_id, messages_in_chat_original in messages_by_chat_id.items(): messages_in_chat = merge_adjacent_messages(messages_in_chat_original ) - messages_by_period = group_messages_by_period(messages_in_chat) + logger.info(' "%s": %d messages, %d periods (%d msg/period avg)', chat_id, len(messages_in_chat_original), len(messages_by_period), len(messages_in_chat_original)/ len(messages_by_period)) for period_key, messages in messages_by_period.items(): - output_file = args.output / f'{chat_id} - {period_key}.md' + output_file = args.output / chat_id / f'{period_key}.md' + output_file.parent.mkdir(exist_ok=True) logger.info('Writing % 5d messages to %s', len(messages), output_file) with open(output_file, 'w') as f: f.write(