From 91ca8d66d8a67ca50b2d4684dcb5e3446521f062 Mon Sep 17 00:00:00 2001
From: Jon Michael Aanes
Date: Thu, 31 Oct 2024 18:58:55 +0100
Subject: [PATCH] Restructure by moving datastructures and libpurple parsing into own module

---
Review note: libpurple.py additionally imports `re` and `datetime_sent`.
The moved parse_timestamp() and parse_messages_in_chat_file() reference
both names; without these imports they raise NameError when called
(test_load only imports the module, so it does not catch this).

 libpurple_to_markdown/__init__.py  | 101 +-----------------------------
 libpurple_to_markdown/__main__.py  |   6 +-
 libpurple_to_markdown/data.py      |  14 ++++
 libpurple_to_markdown/libpurple.py |  94 +++++++++++++++++++++++++++
 test/test_init.py                  |   5 ++
 5 files changed, 118 insertions(+), 102 deletions(-)
 create mode 100644 libpurple_to_markdown/data.py
 create mode 100644 libpurple_to_markdown/libpurple.py

diff --git a/libpurple_to_markdown/__init__.py b/libpurple_to_markdown/__init__.py
index b5eeca7..968cd6c 100644
--- a/libpurple_to_markdown/__init__.py
+++ b/libpurple_to_markdown/__init__.py
@@ -30,29 +30,15 @@ import datetime
 import logging
 import re
 from collections.abc import Iterable, Iterator
-from pathlib import Path
-
-import bs4
 
 from ._version import __version__
+from .data import Message
 
 __all__ = ['__version__']
 
 logger = logging.getLogger(__name__)
 
 
-@dataclasses.dataclass(frozen=True, order=True)
-class Message:
-    sent_at: datetime.datetime
-    sender: str
-    text: str
-
-    def __post_init__(self):
-        assert self.sent_at is not None
-        assert self.sender is not None
-        assert self.text is not None
-
-
 def datetime_sent(
     chat_start: datetime.datetime,
     message_sent: datetime.time,
@@ -67,89 +53,6 @@ def datetime_sent(
     return naive
 
 
-def parse_timestamp(c) -> datetime.time:
-    timestamp_obj = c
-    if c.font is not None:
-        c = c.font
-    m = re.match(r'\((\d+):(\d+):(\d+)\)', timestamp_obj.get_text())
-    return datetime.time(int(m.group(1)), int(m.group(2)), int(m.group(3)))
-
-
-def parse_messages_in_chat_file(path: Path) -> list[Message]:
-    logger.info('Parsing %s', path)
-    chat_start = datetime.datetime.fromisoformat(
-        path.stem.removesuffix('CEST').removesuffix('CET'),
-    )
-
-    with open(path) as f:
-        soup = bs4.BeautifulSoup(f, 'lxml')
-
-    if len(soup.contents) == 0:
-        logger.warning('File is empty?')
-        return []
-
-    messages = []
-
-    cur_sent_at: datetime.datetime | None = None
-    cur_sender: str | None = None
-    cur_text: str = ''
-
-    if soup.body.p:
-        loglines = soup.body.p.children
-    else:
-        loglines = soup.body.children
-
-    for c in loglines:
-        if c.name in {'font', 'span'} and cur_sent_at is None:
-            # Get timestamp
-            cur_sent_at = datetime_sent(chat_start, parse_timestamp(c))
-
-            # Get sender
-            if c.b:
-                assert cur_sender is None
-                cur_sender = (
-                    c.b.get_text()
-                    .strip()
-                    .removesuffix(':')
-                    .removeprefix('***')
-                    .removesuffix('[m]')
-                )
-
-        elif c.name in {None, 'span', 'font'}:
-            cur_text += c.get_text()
-
-        elif c.name == 'a':
-            cur_text += '<' + c['href'] + '>'
-
-        elif c.name == 'br':
-            if cur_sender:
-                messages.append(Message(cur_sent_at, cur_sender, cur_text.strip()))
-            cur_sent_at = None
-            cur_sender = None
-            cur_text = ''
-
-        elif c.name == 'b':
-            # Indicates system message. Ignore
-            pass
-
-        elif c.name in {'h1', 'h3'}:
-            pass  # Ignore log header
-
-        else:
-            assert False, c
-
-    return messages
-
-
-def parse_messages_in_chat_folder(chat_folder_path: Path) -> list[Message]:
-    messages = []
-    for file_path in sorted(chat_folder_path.iterdir()):
-        messages.extend(parse_messages_in_chat_file(file_path))
-
-    messages.sort()
-    return messages
-
-
 def format_message_as_citation(out: list[str], msg: Message) -> None:
     out.append(f'{msg.sent_at.date()} {msg.sent_at.time()} [[{msg.sender}]]:')
     out.append('\n')
@@ -220,7 +123,7 @@ def merge_texts(text1: str, text2: str) -> str:
 
 
 def merge_adjacent_messages(messages: Iterable[Message]) -> list[Message]:
-    out = []
+    out: list[Message] = []
     for msg in messages:
         if out and is_adjacent_messages(out[-1], msg):
             out[-1] = dataclasses.replace(
diff --git a/libpurple_to_markdown/__main__.py b/libpurple_to_markdown/__main__.py
index 5ee2a21..1d05056 100644
--- a/libpurple_to_markdown/__main__.py
+++ b/libpurple_to_markdown/__main__.py
@@ -3,12 +3,12 @@ import logging
 from pathlib import Path
 
 from . import (
-    Message,
     filter_useless_messages,
     format_messages,
+    libpurple,
     merge_adjacent_messages,
-    parse_messages_in_chat_folder,
 )
+from .data import Message
 
 logger = logging.getLogger(__name__)
 
@@ -37,7 +37,7 @@ def main():
     server = args.path.parent.name
     receipient = args.path.name
 
-    all_messages = parse_messages_in_chat_folder(args.path)
+    all_messages = libpurple.parse_messages_in_chat_folder(args.path)
     all_messages = filter_useless_messages(all_messages)
     all_messages = merge_adjacent_messages(all_messages)
 
diff --git a/libpurple_to_markdown/data.py b/libpurple_to_markdown/data.py
new file mode 100644
index 0000000..7614de8
--- /dev/null
+++ b/libpurple_to_markdown/data.py
@@ -0,0 +1,14 @@
+import dataclasses
+import datetime
+
+
+@dataclasses.dataclass(frozen=True, order=True)
+class Message:
+    sent_at: datetime.datetime
+    sender: str
+    text: str
+
+    def __post_init__(self):
+        assert self.sent_at is not None
+        assert self.sender is not None
+        assert self.text is not None
diff --git a/libpurple_to_markdown/libpurple.py b/libpurple_to_markdown/libpurple.py
new file mode 100644
index 0000000..37ed6d7
--- /dev/null
+++ b/libpurple_to_markdown/libpurple.py
@@ -0,0 +1,94 @@
+import datetime
+import logging
+import re
+from pathlib import Path
+
+import bs4
+
+from . import datetime_sent
+from .data import Message
+
+logger = logging.getLogger(__name__)
+
+
+def parse_timestamp(c) -> datetime.time:
+    timestamp_obj = c
+    if c.font is not None:
+        c = c.font
+    m = re.match(r'\((\d+):(\d+):(\d+)\)', timestamp_obj.get_text())
+    return datetime.time(int(m.group(1)), int(m.group(2)), int(m.group(3)))
+
+
+def parse_messages_in_chat_file(path: Path) -> list[Message]:
+    logger.info('Parsing %s', path)
+    chat_start = datetime.datetime.fromisoformat(
+        path.stem.removesuffix('CEST').removesuffix('CET'),
+    )
+
+    with open(path) as f:
+        soup = bs4.BeautifulSoup(f, 'lxml')
+
+    if len(soup.contents) == 0:
+        logger.warning('File is empty?')
+        return []
+
+    messages = []
+
+    cur_sent_at: datetime.datetime | None = None
+    cur_sender: str | None = None
+    cur_text: str = ''
+
+    if soup.body.p:
+        loglines = soup.body.p.children
+    else:
+        loglines = soup.body.children
+
+    for c in loglines:
+        if c.name in {'font', 'span'} and cur_sent_at is None:
+            # Get timestamp
+            cur_sent_at = datetime_sent(chat_start, parse_timestamp(c))
+
+            # Get sender
+            if c.b:
+                assert cur_sender is None
+                cur_sender = (
+                    c.b.get_text()
+                    .strip()
+                    .removesuffix(':')
+                    .removeprefix('***')
+                    .removesuffix('[m]')
+                )
+
+        elif c.name in {None, 'span', 'font'}:
+            cur_text += c.get_text()
+
+        elif c.name == 'a':
+            cur_text += '<' + c['href'] + '>'
+
+        elif c.name == 'br':
+            if cur_sender:
+                messages.append(Message(cur_sent_at, cur_sender, cur_text.strip()))
+            cur_sent_at = None
+            cur_sender = None
+            cur_text = ''
+
+        elif c.name == 'b':
+            # Indicates system message. Ignore
+            pass
+
+        elif c.name in {'h1', 'h3'}:
+            pass  # Ignore log header
+
+        else:
+            assert False, c
+
+    return messages
+
+
+def parse_messages_in_chat_folder(chat_folder_path: Path) -> list[Message]:
+    messages = []
+    for file_path in sorted(chat_folder_path.iterdir()):
+        messages.extend(parse_messages_in_chat_file(file_path))
+
+    messages.sort()
+    return messages
diff --git a/test/test_init.py b/test/test_init.py
index bf34bc9..ba39a25 100644
--- a/test/test_init.py
+++ b/test/test_init.py
@@ -3,3 +3,8 @@ import libpurple_to_markdown
 
 def test_version():
     assert libpurple_to_markdown.__version__ is not None
+
+
+def test_load():
+    import libpurple_to_markdown.data
+    import libpurple_to_markdown.libpurple  # noqa: F401