1
0
libpurple-to-markdown/libpurple_to_markdown/__main__.py

189 lines
5.9 KiB
Python
Raw Normal View History

2024-10-25 23:12:30 +00:00
import argparse
2024-10-31 21:38:22 +00:00
import dataclasses
2024-11-17 11:20:52 +00:00
import datetime
2024-11-17 11:21:08 +00:00
import logging
from collections.abc import Callable, Iterable, Iterator, Mapping
2024-10-31 21:38:22 +00:00
from pathlib import Path
2024-10-26 12:08:22 +00:00
2024-10-26 13:17:46 +00:00
from . import (
filter_useless_messages,
libpurple,
2024-10-26 13:17:46 +00:00
merge_adjacent_messages,
2024-10-31 19:36:38 +00:00
synctech_sms,
2024-10-26 13:17:46 +00:00
)
2024-10-31 21:38:22 +00:00
from .data import MYSELF, Message
2024-11-03 22:49:35 +00:00
from .markdown import format_messages
2024-10-26 12:08:22 +00:00
logger = logging.getLogger(__name__)
2024-10-26 11:47:39 +00:00
2024-10-31 21:38:22 +00:00
2024-11-17 11:21:08 +00:00
def group_messages(
    messages: Iterable[Message],
    key_fn: Callable[[Message], str],
) -> dict[str, list[Message]]:
    """Group messages into lists keyed by ``key_fn(message)``.

    Args:
        messages: The messages to group. Iterated exactly once.
        key_fn: Maps a message to the key of the group it belongs to.

    Returns:
        A dict from key to the list of messages with that key. Keys appear
        in order of first occurrence and every list keeps the input order.
        Empty input gives an empty dict.
    """
    by_key: dict[str, list[Message]] = {}
    for msg in messages:
        by_key.setdefault(key_fn(msg), []).append(msg)
    # Deliberately no `del msg`: the loop variable is unbound when
    # `messages` is empty, so deleting it after the loop would raise.
    return by_key
def group_messages_by_chat_id(messages: Iterable[Message]) -> dict[str, list[Message]]:
    """Group messages by the ``chat_id`` attribute of each message.

    Thin wrapper around ``group_messages``; see that function for the shape
    of the returned dict.
    """

    def chat_id_of(msg: Message) -> str:
        return msg.chat_id

    return group_messages(messages, key_fn=chat_id_of)
2024-10-31 21:37:50 +00:00
2024-10-31 21:38:22 +00:00
2024-10-31 21:37:50 +00:00
def year_and_month_period_key(msg: Message) -> str:
    """Return the period key ``<year>-<month>`` for when the message was sent.

    The month is zero-padded to two digits, e.g. ``2024-03``.
    """
    sent_at = msg.sent_at
    return '{}-{:02}'.format(sent_at.year, sent_at.month)
2024-10-31 21:38:22 +00:00
2024-10-31 21:37:50 +00:00
def year_period_key(msg: Message) -> str:
    """Return the period key for the year the message was sent, e.g. ``2024``."""
    return str(msg.sent_at.year)
2024-10-31 21:38:22 +00:00
2024-10-31 21:37:50 +00:00
def year_quarter_period_key(msg: Message) -> str:
    """Return the period key ``<year>-Q<quarter>`` for when the message was sent.

    Quarters are numbered 1 to 4: January-March is Q1, April-June is Q2,
    and so on, e.g. ``2024-Q2``.
    """
    sent_at = msg.sent_at
    # Months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
    quarter = (sent_at.month - 1) // 3 + 1
    return f'{sent_at.year}-Q{quarter}'
2024-10-31 21:38:22 +00:00
2024-10-31 21:37:50 +00:00
# Largest average number of messages per period that
# group_messages_by_period accepts before it tries the next period length.
MAX_AVERAGE_MESSAGES_PER_PERIOD = 120

# main() does not export chats with this many (merged) messages or fewer.
TOO_FEW_MESSAGES_TO_CARE = 2


def _full_period_key(msg: Message) -> str:
    """Return the constant key ``full``, putting every message in one period."""
    return 'full'


# Period key functions by the name accepted by --period. The insertion order
# (longest period first) is the order group_messages_by_period tries them in.
PERIOD_KEYS_BY_NAME: Mapping[str, Callable[[Message], str]] = {
    'full': _full_period_key,
    'year': year_period_key,
    'quarter': year_quarter_period_key,
    'month': year_and_month_period_key,
}
2024-11-17 11:21:08 +00:00
def group_messages_by_period(
    messages: Iterable[Message],
    period_key: str | None = None,
) -> tuple[dict[str, list[Message]], Callable[[Message], str]]:
    """Group messages into periods, choosing the period length if not given.

    When ``period_key`` is given, the matching key function from
    ``PERIOD_KEYS_BY_NAME`` is used unconditionally. Otherwise the key
    functions are tried in the order of that mapping and the first one whose
    average group size is at most ``MAX_AVERAGE_MESSAGES_PER_PERIOD`` wins;
    if none qualifies, the last one tried is used.

    Args:
        messages: The messages to group. May be any iterable, including a
            one-shot iterator.
        period_key: Name of an entry in ``PERIOD_KEYS_BY_NAME``, or ``None``
            to choose automatically.

    Returns:
        A tuple ``(grouped, period_key_fn)``: the messages grouped by period
        key, and the key function that produced the grouping. For empty
        input ``grouped`` is an empty dict.

    Raises:
        KeyError: If ``period_key`` is not a key of ``PERIOD_KEYS_BY_NAME``.
    """
    # Materialise once: the messages are grouped once per candidate key
    # function, and a one-shot iterator would be exhausted after the first.
    messages = list(messages)

    # Determine candidate key functions
    if period_key is not None:
        candidate_key_fns = [PERIOD_KEYS_BY_NAME[period_key]]
    else:
        candidate_key_fns = list(PERIOD_KEYS_BY_NAME.values())

    # Group by key, stopping at the first acceptable period length.
    # The candidate list is never empty, so both names are bound afterwards.
    for period_key_fn in candidate_key_fns:
        grouped = group_messages(messages, key_fn=period_key_fn)
        if not grouped:
            # No messages at all: nothing to average, any key function will do.
            break
        # Every message lands in exactly one group, so the group sizes sum
        # to len(messages).
        if len(messages) / len(grouped) <= MAX_AVERAGE_MESSAGES_PER_PERIOD:
            break

    return grouped, period_key_fn
2024-10-31 21:37:50 +00:00
2024-10-31 21:38:22 +00:00
2024-10-31 21:37:50 +00:00
def replace_myself(messages: Iterable[Message], myself: str) -> Iterator[Message]:
    """Yield the messages with the ``MYSELF`` placeholder sender substituted.

    A message whose ``sender`` equals ``MYSELF`` is yielded as a copy (made
    with ``dataclasses.replace``) whose ``sender`` is ``myself``; every other
    message is yielded unchanged. Messages are produced lazily, in input
    order.
    """
    for msg in messages:
        sent_by_me = msg.sender == MYSELF
        yield dataclasses.replace(msg, sender=myself) if sent_by_me else msg
2024-10-26 13:17:46 +00:00
2024-10-31 21:38:22 +00:00
2024-10-25 23:12:30 +00:00
def parse_args():
    """Parse the command line.

    Returns:
        The ``argparse.Namespace`` with the attributes ``purple_folder``,
        ``synctech_sms_backup_file``, ``output``, ``myself``,
        ``overwrite_files``, ``period_key`` and ``skip_this_period``.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    # Input sources and output location (all optional paths).
    add('--purple', dest='purple_folder', type=Path)
    add('--synctech', dest='synctech_sms_backup_file', type=Path)
    add('--output', type=Path)

    # Name written in place of the MYSELF placeholder sender.
    add('--myself', default='Myself')

    # Output behaviour.
    add('--overwrite', dest='overwrite_files', action='store_true')
    add('--period', dest='period_key', choices=list(PERIOD_KEYS_BY_NAME))
    add('--skip-this-period', action='store_true')

    return parser.parse_args()
2024-11-17 11:21:08 +00:00
def main():
    """Export chat logs as Markdown files, one per chat and period.

    Steps:
      1. Parse the command line and load messages from either a libpurple
         chat folder (``--purple``) or a SyncTech SMS backup (``--synctech``).
      2. Substitute the ``MYSELF`` sender, filter useless messages and group
         the rest by chat id.
      3. For every chat with more than ``TOO_FEW_MESSAGES_TO_CARE`` merged
         messages, split it into periods and write each period to
         ``<output>/<chat_id>/<chat-id>-<period>.md``.

    Existing files are only replaced with ``--overwrite``. With
    ``--skip-this-period`` the period containing the current date is not
    written.

    Logs a fatal message and returns without writing anything when no input
    or no output location was given.
    """
    logging.basicConfig()
    logging.getLogger().setLevel('INFO')

    args = parse_args()

    if not (args.purple_folder or args.synctech_sms_backup_file):
        logger.fatal('No input file given!')
        return
    if args.output is None:
        # Fail early with a clear message rather than with a TypeError when
        # the first output path is built from None.
        logger.fatal('No output folder given!')
        return

    # Load messages; --purple takes precedence when both inputs are given.
    if args.purple_folder:
        all_messages = libpurple.parse_messages_in_chat_folder(args.purple_folder)
    else:
        all_messages = synctech_sms.parse_messages_in_backup_xml_file(
            args.synctech_sms_backup_file,
        )

    all_messages = list(replace_myself(all_messages, myself=args.myself))
    logger.info('%d messages after loading', len(all_messages))
    all_messages = list(filter_useless_messages(all_messages))
    logger.info('%d messages after filtering', len(all_messages))
    messages_by_chat_id = group_messages_by_chat_id(all_messages)
    logger.info('%d message groups', len(messages_by_chat_id))
    del all_messages

    for chat_id, messages_in_chat_original in messages_by_chat_id.items():
        messages_in_chat = merge_adjacent_messages(messages_in_chat_original)
        if len(messages_in_chat) <= TOO_FEW_MESSAGES_TO_CARE:
            logger.info(' "%s": Skipped due to too few messages', chat_id)
            continue

        messages_by_period, period_key_fn = group_messages_by_period(
            messages_in_chat,
            args.period_key,
        )
        logger.info(
            ' "%s": %d messages, %d periods (%d msg/period avg)',
            chat_id,
            len(messages_in_chat_original),
            len(messages_by_period),
            len(messages_in_chat_original) / len(messages_by_period),
        )

        # Key of the period that contains the current date, computed with the
        # same key function as the grouping so the two are comparable.
        this_period_name = period_key_fn(Message(datetime.datetime.now(), '', '', ''))
        file_escaped_chat_id = chat_id.replace(' ', '-')
        # NOTE(review): chat_id is used verbatim as the folder name; a chat id
        # containing a path separator would create nested folders - confirm
        # this cannot happen for the supported inputs.

        for period_key_name, messages in messages_by_period.items():
            output_file = (
                args.output / chat_id / f'{file_escaped_chat_id}-{period_key_name}.md'
            )

            # Only skip the current period when the flag was actually given.
            if args.skip_this_period and period_key_name == this_period_name:
                logger.info('Skipping due to --skip-this-period: %s', output_file)
                continue
            if output_file.exists() and not args.overwrite_files:
                logger.info('Skipping existing file: %s', output_file)
                continue

            # Logged after the skip checks so it only appears for files that
            # are really written.
            logger.info('Writing % 5d messages to %s', len(messages), output_file)

            # Create folders and file. The encoding is explicit so the output
            # does not depend on the locale of the machine running the export.
            output_file.parent.mkdir(exist_ok=True, parents=True)
            output_file.write_text(
                format_messages(
                    messages,
                    title=f'{chat_id} - {period_key_name}',
                ),
                encoding='utf-8',
            )
2024-10-26 11:47:39 +00:00
# Entry point: run the exporter when this module is executed as a script.
if __name__ == '__main__':
    main()