2024-10-25 23:12:30 +00:00
|
|
|
import argparse
import dataclasses
import logging
from collections.abc import Callable, Iterable, Iterator
from pathlib import Path

from . import (
    filter_useless_messages,
    format_messages,
    libpurple,
    merge_adjacent_messages,
    synctech_sms,
)
from .data import MYSELF, Message
|
2024-10-25 22:34:53 +00:00
|
|
|
|
2024-10-26 12:08:22 +00:00
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
2024-10-26 11:47:39 +00:00
|
|
|
|
2024-10-31 21:38:22 +00:00
|
|
|
|
2024-10-31 21:37:50 +00:00
|
|
|
def group_messages(
    messages: Iterable[Message],
    key: Callable[[Message], str],
) -> dict[str, list[Message]]:
    """Bucket messages by the string that ``key`` returns for each of them.

    Args:
        messages: Messages to group. Iterated exactly once, so a one-shot
            iterator is fine.
        key: Callable mapping a message to the name of its group.

    Returns:
        A dict from group name to the messages in that group. Groups appear
        in order of first occurrence and the messages inside each group keep
        their input order. Empty input gives an empty dict.
    """
    by_key: dict[str, list[Message]] = {}
    for msg in messages:
        by_key.setdefault(key(msg), []).append(msg)
    # No `del msg` here: the name is unbound when `messages` is empty, so
    # deleting it after the loop would raise UnboundLocalError.
    return by_key
|
|
|
|
|
|
|
|
|
|
|
|
def group_messages_by_chat_id(messages: Iterable[Message]) -> dict[str, list[Message]]:
    """Split messages into one list per chat, keyed by each message's ``chat_id``."""

    def chat_id_of(message):
        return message.chat_id

    return group_messages(messages, key=chat_id_of)
|
|
|
|
|
2024-10-31 21:38:22 +00:00
|
|
|
|
2024-10-31 21:37:50 +00:00
|
|
|
def year_and_month_period_key(msg: Message) -> str:
    """Return the ``<year>-<month>`` period key for when the message was sent.

    The month is zero-padded to two digits, e.g. ``2024-03``.
    """
    sent_at = msg.sent_at
    return f'{sent_at.year}-{sent_at.month:02}'
|
|
|
|
|
2024-10-31 21:38:22 +00:00
|
|
|
|
2024-10-31 21:37:50 +00:00
|
|
|
def year_period_key(msg: Message) -> str:
    """Return the period key for the calendar year the message was sent in."""
    return str(msg.sent_at.year)
|
|
|
|
|
2024-10-31 21:38:22 +00:00
|
|
|
|
2024-10-31 21:37:50 +00:00
|
|
|
def year_quarter_period_key(msg: Message) -> str:
    """Return the ``<year>-Q<n>`` period key for when the message was sent.

    ``n`` is the calendar quarter (1-4) containing the message's month.
    """
    sent_at = msg.sent_at
    # Months 1-3 -> Q1, 4-6 -> Q2, 7-9 -> Q3, 10-12 -> Q4.
    quarter = (sent_at.month - 1) // 3 + 1
    return f'{sent_at.year}-Q{quarter}'
|
|
|
|
|
2024-10-31 21:38:22 +00:00
|
|
|
|
2024-10-31 21:37:50 +00:00
|
|
|
# Threshold used by group_messages_by_period: a candidate period granularity
# is accepted once its average number of messages per period is at most this.
MAX_AVERAGE_MESSAGES_PER_PERIOD = 120
|
2024-10-31 19:36:38 +00:00
|
|
|
|
2024-10-25 22:34:53 +00:00
|
|
|
|
2024-10-31 21:38:22 +00:00
|
|
|
def group_messages_by_period(messages: Iterable[Message]) -> dict[str, list[Message]]:
    """Group messages into time periods of a granularity suited to the volume.

    Candidate granularities are tried from coarsest to finest: everything in
    a single ``'full'`` group, then per year, per quarter and per month. The
    first candidate whose average group size is at most
    ``MAX_AVERAGE_MESSAGES_PER_PERIOD`` is returned. If none qualifies, the
    finest (per-month) grouping is returned.

    Args:
        messages: Messages to group. May be a one-shot iterator; it is
            materialized once so that every candidate sees all messages.

    Returns:
        A dict from period key to the messages in that period, or an empty
        dict when there are no messages.
    """
    all_messages = list(messages)
    if not all_messages:
        # Nothing to group; also avoids dividing by zero groups below.
        return {}

    possible_period_keys = [
        (lambda msg: 'full'),
        year_period_key,
        year_quarter_period_key,
        year_and_month_period_key,
    ]

    grouped: dict[str, list[Message]] = {}
    for period_key in possible_period_keys:
        grouped = group_messages(all_messages, key=period_key)
        # Every message lands in exactly one group, so the group sizes sum to
        # the total message count.
        average_num_messages = len(all_messages) / len(grouped)
        if average_num_messages <= MAX_AVERAGE_MESSAGES_PER_PERIOD:
            break

    return grouped
|
|
|
|
|
2024-10-31 21:38:22 +00:00
|
|
|
|
2024-10-31 21:37:50 +00:00
|
|
|
def replace_myself(messages: Iterable[Message], myself: str) -> Iterator[Message]:
    """Lazily yield the messages with the ``MYSELF`` sender swapped for ``myself``.

    Messages whose sender equals ``MYSELF`` are yielded as copies made with
    ``dataclasses.replace``; every other message is yielded unchanged.
    """
    for message in messages:
        sent_by_me = message.sender == MYSELF
        yield dataclasses.replace(message, sender=myself) if sent_by_me else message
|
2024-10-26 13:17:46 +00:00
|
|
|
|
2024-10-31 21:38:22 +00:00
|
|
|
|
2024-10-25 23:12:30 +00:00
|
|
|
def parse_args():
    """Parse the command-line options and return the resulting namespace.

    Options: ``--purple`` and ``--synctech`` (input paths, stored as
    ``purple_folder`` and ``synctech_sms_backup_file``), ``--output`` (path)
    and ``--myself`` (string, defaults to ``'Myself'``).
    """
    parser = argparse.ArgumentParser()

    # Input sources: (flag, attribute name on the parsed namespace).
    input_options = (
        ('--purple', 'purple_folder'),
        ('--synctech', 'synctech_sms_backup_file'),
    )
    for flag, dest in input_options:
        parser.add_argument(flag, type=Path, dest=dest)

    parser.add_argument('--output', type=Path)
    parser.add_argument('--myself', type=str, default='Myself')
    return parser.parse_args()
|
2024-10-25 22:34:53 +00:00
|
|
|
|
2024-10-26 11:47:39 +00:00
|
|
|
|
2024-10-25 22:34:53 +00:00
|
|
|
def main():
    """Command-line entry point: convert a chat export into Markdown files.

    Messages are loaded from the folder given with ``--purple`` or, failing
    that, the backup file given with ``--synctech``. The ``MYSELF`` sender is
    replaced with ``--myself``, useless messages are filtered out, and the
    remainder is grouped per chat. Each chat with more than two merged
    messages is split into periods and written as
    ``<output>/<chat_id>/<chat-id>-<period>.md``.

    Logs a fatal message and returns without writing anything when no input
    or no output location was given.
    """
    logging.basicConfig()
    logging.getLogger().setLevel('INFO')
    args = parse_args()

    if args.purple_folder:
        all_messages = libpurple.parse_messages_in_chat_folder(args.purple_folder)
    elif args.synctech_sms_backup_file:
        all_messages = synctech_sms.parse_messages_in_backup_xml_file(
            args.synctech_sms_backup_file,
        )
    else:
        logger.fatal('No input file given!')
        return

    # `--output` has no default; without this guard the first write would
    # fail with a TypeError on `None / chat_id` after all the parsing work.
    if args.output is None:
        logger.fatal('No output folder given!')
        return

    all_messages = list(replace_myself(all_messages, myself=args.myself))
    logger.info('%d messages after loading', len(all_messages))

    all_messages = list(filter_useless_messages(all_messages))
    logger.info('%d messages after filtering', len(all_messages))

    messages_by_chat_id = group_messages_by_chat_id(all_messages)
    logger.info('%d message groups', len(messages_by_chat_id))
    del all_messages  # The flat list is not used past this point.

    for chat_id, messages_in_chat_original in messages_by_chat_id.items():
        messages_in_chat = merge_adjacent_messages(messages_in_chat_original)
        if len(messages_in_chat) <= 2:
            logger.info(' "%s": Skipped due to too few messages', chat_id)
            continue

        messages_by_period = group_messages_by_period(messages_in_chat)
        logger.info(
            ' "%s": %d messages, %d periods (%d msg/period avg)',
            chat_id,
            len(messages_in_chat_original),
            len(messages_by_period),
            len(messages_in_chat_original) / len(messages_by_period),
        )

        # Per-chat values are the same for every period, so compute them once.
        # The folder keeps the raw chat id; only the file name has its spaces
        # replaced with dashes.
        chat_folder = args.output / chat_id
        # parents=True: the output root itself may not exist yet.
        chat_folder.mkdir(parents=True, exist_ok=True)
        file_escaped_chat_id = chat_id.replace(' ', '-')

        for period_key, messages in messages_by_period.items():
            output_file = chat_folder / f'{file_escaped_chat_id}-{period_key}.md'
            logger.info('Writing % 5d messages to %s', len(messages), output_file)
            # Format first so a formatting error cannot leave a truncated
            # file behind, and write as UTF-8 regardless of the platform's
            # default encoding.
            content = format_messages(
                messages,
                title=f'{chat_id} - {period_key}',
            )
            output_file.write_text(content, encoding='utf-8')
|
2024-10-25 22:34:53 +00:00
|
|
|
|
2024-10-26 11:47:39 +00:00
|
|
|
|
2024-10-25 22:34:53 +00:00
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()
|