From fa50a6970a42104ab9c9728e64bd5fff03f84731 Mon Sep 17 00:00:00 2001
From: Jon Michael Aanes
Date: Sat, 26 Oct 2024 14:53:31 +0200
Subject: [PATCH] Improvements

Parsing:
- datetime_sent: when the chat start time-of-day is later than the
  message time-of-day, move the message date one day forward instead of
  one day back.
- parse_messages_in_chat_file: when soup.body.p exists, iterate its
  children instead of logging a warning and returning no messages.
- Strip a leading '***' and a trailing '[m]' from sender names.
- Only append a message at a <br> when a sender has been seen.
- parse_messages_in_chat_folder: visit files in sorted path order.

Formatting:
- format_message_as_citation: wrap '<...>' tokens (word characters and
  spaces only) and '$$$' in backticks.
- format_messages: take the heading as a 'title' argument instead of the
  hard-coded 'Chat 2018'.

Filtering:
- Add is_useless_message() and filter_useless_messages(); __main__ now
  filters before merging adjacent messages.

Misc:
- Add type annotations; merge_adjacent_messages accepts any Iterable.
- Log the number of messages written to each output file.
---
Review notes (ignored by git am):
- This copy was rebuilt from a whitespace-collapsed patch. Indentation
  and the position of blank context lines are inferred from the hunk
  headers; compare with the original commit before applying.
- NOTE(review): is_useless_message() is shown with two empty string
  literals. str.endswith('') is true for every string, so as written
  every message is filtered out. The literals were presumably lost
  before this copy was made - confirm against the original commit.
- Changed relative to the collapsed copy: filter_useless_messages()
  reports dropped messages through the module logger at DEBUG level
  instead of print(), and 'title = ...' is written 'title=...'.

 libpurple_to_markdown/__init__.py | 38 +++++++++++++++++++++++++-------------
 libpurple_to_markdown/__main__.py |  8 +++++---
 2 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/libpurple_to_markdown/__init__.py b/libpurple_to_markdown/__init__.py
index c95b1f6..dd74c0c 100644
--- a/libpurple_to_markdown/__init__.py
+++ b/libpurple_to_markdown/__init__.py
@@ -1,5 +1,6 @@
 import dataclasses
 import datetime
+from collections.abc import Iterator, Iterable
 import logging
 import re
 from pathlib import Path
@@ -32,7 +33,7 @@ def datetime_sent(
         chat_start.date(), message_sent, chat_start.tzinfo,
     )
     if chat_start.time() > message_sent:
-        naive = naive - datetime.timedelta(days=1)
+        naive = naive + datetime.timedelta(days=1)
     return naive
 
 
@@ -63,9 +64,8 @@ def parse_messages_in_chat_file(path: Path) -> list[Message]:
     cur_sender: str | None = None
     cur_text: str = ''
 
-    if soup.p:
-        logger.warning('File indicates error message?')
-        return []  # TODO
+    if soup.body.p:
+        loglines = soup.body.p.children
     else:
         loglines = soup.body.children
 
@@ -77,7 +77,7 @@ def parse_messages_in_chat_file(path: Path) -> list[Message]:
         # Get sender
         if c.b:
             assert cur_sender is None
-            cur_sender = c.b.get_text().strip().removesuffix(':')
+            cur_sender = c.b.get_text().strip().removesuffix(':').removeprefix('***').removesuffix('[m]')
 
         elif c.name in {None, 'span', 'font'}:
             cur_text += c.get_text()
@@ -86,7 +86,8 @@ def parse_messages_in_chat_file(path: Path) -> list[Message]:
             cur_text += '<' + c['href'] + '>'
 
         elif c.name == 'br':
-            messages.append(Message(cur_sent_at, cur_sender, cur_text.strip()))
+            if cur_sender:
+                messages.append(Message(cur_sent_at, cur_sender, cur_text.strip()))
             cur_sent_at = None
             cur_sender = None
             cur_text = ''
@@ -106,23 +107,25 @@ def parse_messages_in_chat_file(path: Path) -> list[Message]:
 
 def parse_messages_in_chat_folder(chat_folder_path: Path) -> list[Message]:
     messages = []
-    for file_path in chat_folder_path.iterdir():
+    for file_path in sorted(chat_folder_path.iterdir()):
         messages.extend(parse_messages_in_chat_file(file_path))
 
     messages.sort()
     return messages
 
 
-def format_message_as_citation(out, msg):
+def format_message_as_citation(out: list[str], msg: Message) -> None:
     out.append(f'{msg.sent_at.date()} {msg.sent_at.time()} [[{msg.sender}]]:')
     out.append('\n')
     for line in msg.text.split('\n'):
+        line = re.sub(r'(<[\w ]+>)', r'`\1`', line)
+        line = re.sub(r'(\$\$\$)', r'`\1`', line)
         out.append(f'> {line}\n')
     del line
     out.append('\n')
 
 
-def format_message_as_table(out, msg):
+def format_message_as_table(out: list[str], msg: Message) -> None:
     out.append(f'| {msg.sent_at} | [[{msg.sender}]] | ')
     for line in msg.text.split('\n'):
         out.append(f'{line}')
@@ -130,8 +133,8 @@ def format_message_as_table(out, msg):
     out.append('|\n')
 
 
-def format_messages(messages: list[Message]) -> str:
-    out = ['# Chat 2018' '\n\n']
+def format_messages(messages: list[Message], title: str) -> str:
+    out = ['# ', title, '\n\n']
 
     as_table = False
 
@@ -154,8 +157,17 @@ def format_messages(messages: list[Message]) -> str:
 
 MSG_ADJACENTCY_DIST = datetime.timedelta(minutes=2)
 
+def is_useless_message(msg: Message) -> bool:
+    return msg.sender.endswith('') or msg.sender == ''
 
-def is_adjacent_messages(first, second):
+def filter_useless_messages(messages: Iterable[Message]) -> Iterator[Message]:
+    for msg in messages:
+        if not is_useless_message(msg):
+            yield msg
+        else:
+            logger.debug('Dropping useless message: %s', msg.text)
+
+def is_adjacent_messages(first: Message, second: Message) -> bool:
     return (
         first.sender == second.sender
         and second.sent_at - first.sent_at <= MSG_ADJACENTCY_DIST
@@ -168,7 +180,7 @@ def merge_texts(text1: str, text2: str) -> str:
     return text1 + (' ' if punctuated else '. ') + '\n' + text2
 
 
-def merge_adjacent_messages(messages: list[Message]) -> list[Message]:
+def merge_adjacent_messages(messages: Iterable[Message]) -> list[Message]:
     out = []
     for msg in messages:
         if out and is_adjacent_messages(out[-1], msg):
diff --git a/libpurple_to_markdown/__main__.py b/libpurple_to_markdown/__main__.py
index 4c6630d..79079b7 100644
--- a/libpurple_to_markdown/__main__.py
+++ b/libpurple_to_markdown/__main__.py
@@ -3,7 +3,8 @@ import logging
 from pathlib import Path
 import logging
 
-from . import (format_messages, merge_adjacent_messages, parse_messages_in_chat_folder, Message)
+from . import (format_messages, merge_adjacent_messages,
+               parse_messages_in_chat_folder, Message, filter_useless_messages)
 
 logger = logging.getLogger(__name__)
 
@@ -30,15 +31,16 @@ def main():
     receipient = args.path.name
 
     all_messages = parse_messages_in_chat_folder(args.path)
+    all_messages = filter_useless_messages(all_messages)
     all_messages = merge_adjacent_messages(all_messages)
     messages_by_year = group_messages_by_year(all_messages)
 
     for year, messages in messages_by_year.items():
         output_file = args.output / f'{server} - {receipient} - {year}.md'
-        logger.info("Writing to %s", output_file)
+        logger.info("Writing %d messages to %s", len(messages), output_file)
 
         with open(output_file, 'w') as f:
-            f.write(format_messages(messages))
+            f.write(format_messages(messages, title=f'{server} - {receipient} - {year}'))
 
         del year, messages, output_file
 