# Reconstructed from a whitespace-collapsed `git format-patch` file:
#
#   From 21c61f7d027601ad4f1efde6717d88d858692d2c Mon Sep 17 00:00:00 2001
#   From: Jon Michael Aanes
#   Date: Sat, 26 Oct 2024 00:34:53 +0200
#   Subject: [PATCH] Initial commit for crazy log conversion project
#
# The patch adds three files: .gitignore (20 lines), an empty
# libpurple_to_markdown/__init__.py, and libpurple_to_markdown/__main__.py,
# which is the module below.
#
# .gitignore entries from the same patch:
#   Program specific: /output/ /deps/ /secrets /private_deps/ /data/ /config/
#   Python:           __pycache__/ /build/ /dist/ *.egg-info/ .mypy_cache/
#   Python, Testing:  /test/secrets.py /.coverage /.hypothesis/ /htmlcov/
"""Parse HTML chat logs into ``Message`` records and print them.

Each log file is expected to be named after the moment the chat started
(an ISO-8601 timestamp, optionally followed by a timezone abbreviation such
as ``CEST``) and to contain one ``<font>`` header, the message body and a
``<br>`` per message. Judging by the package name these are presumably
libpurple (Pidgin) logs -- confirm against real data.
"""

import argparse
import dataclasses
import datetime
import re
from pathlib import Path
from typing import Iterable, Optional, Sequence

# Message headers carry a wall-clock timestamp of the form "(HH:MM:SS)".
_TIMESTAMP_RE = re.compile(r'\((\d+):(\d+):(\d+)\)')

# Trailing timezone abbreviation in a log file name (e.g. "CEST", "CET").
# Two letters minimum so a bare ISO-8601 "Z" designator is left alone.
_TZ_ABBREVIATION_RE = re.compile(r'[A-Za-z]{2,}$')


@dataclasses.dataclass(frozen=True, order=True)
class Message:
    """A single chat message.

    Field order matters: ``order=True`` compares instances as
    ``(sent_at, sender, text)``, so sorting a list of messages is
    chronological first.
    """

    sent_at: datetime.datetime
    sender: str
    text: str

    def __post_init__(self) -> None:
        """Reject ``None`` in any field.

        Raises:
            ValueError: If a field is ``None``. A real exception is used
                instead of ``assert`` so the check survives ``python -O``.
        """
        for field in dataclasses.fields(self):
            if getattr(self, field.name) is None:
                raise ValueError(f'Message.{field.name} must not be None')


def datetime_sent(chat_start: datetime.datetime, message_sent: datetime.time) -> datetime.datetime:
    """Combine a message's time of day with the date the chat started.

    Args:
        chat_start: When the chat log was started; supplies date and tzinfo.
        message_sent: Time of day shown in the message header.

    Returns:
        The full timestamp of the message, carrying ``chat_start.tzinfo``.
        A time of day earlier than the chat start means the conversation
        ran past midnight, so the message is placed on the following day.
    """
    sent_at = datetime.datetime.combine(chat_start.date(), message_sent, chat_start.tzinfo)
    if message_sent < chat_start.time():
        # A message cannot precede the chat it belongs to: roll forward.
        sent_at += datetime.timedelta(days=1)
    return sent_at


def _parse_chat_start(path: Path) -> datetime.datetime:
    """Read the chat start time from the log file name (without extension)."""
    # fromisoformat() cannot parse timezone abbreviations, so drop any.
    stamp = _TZ_ABBREVIATION_RE.sub('', path.stem)
    return datetime.datetime.fromisoformat(stamp)


def _parse_header(node, chat_start: datetime.datetime) -> tuple[datetime.datetime, str]:
    """Extract ``(sent_at, sender)`` from a message's ``<font>`` header node."""
    stamp_node = node.font
    sender_node = node.b
    if stamp_node is None or sender_node is None:
        raise ValueError(f'Unrecognised message header: {node}')

    match = _TIMESTAMP_RE.match(stamp_node.get_text())
    if match is None:
        raise ValueError(f'Unrecognised timestamp in message header: {node}')

    hour, minute, second = (int(group) for group in match.groups())
    sent_at = datetime_sent(chat_start, datetime.time(hour, minute, second))
    sender = sender_node.get_text().strip().removesuffix(':')
    return sent_at, sender


def _collect_messages(nodes: Iterable, chat_start: datetime.datetime) -> list[Message]:
    """Turn the child nodes of a log's ``<body>`` into messages.

    A ``<font>`` node opens a message, text nodes and links form its body,
    and ``<br>`` closes it.
    """
    messages: list[Message] = []
    sent_at: Optional[datetime.datetime] = None
    sender: Optional[str] = None
    text_parts: list[str] = []

    for node in nodes:
        if node.name == 'font':
            sent_at, sender = _parse_header(node, chat_start)
            # Discard whitespace that sat between the previous <br> and
            # this header; it belongs to neither message.
            text_parts = []

        elif node.name is None:
            # Plain text node. Accumulate rather than overwrite so that
            # "text <a>link</a> more text" keeps every fragment.
            text_parts.append(node.get_text())

        elif node.name == 'a':
            href = node.get('href')
            text_parts.append(f'<{href}>' if href is not None else node.get_text())

        elif node.name == 'br':
            if sent_at is None or sender is None:
                raise ValueError(
                    'Message body without a "(HH:MM:SS) sender:" header: '
                    + repr(''.join(text_parts))
                )
            messages.append(Message(sent_at, sender, ''.join(text_parts)))
            sent_at = None
            sender = None
            text_parts = []

        elif node.name == 'h3':
            continue  # Ignore log header

        else:
            # Fail loudly on markup this parser has not been taught yet.
            raise ValueError(f'Unexpected element in chat log: {node}')

    return messages


def parse_messages_in_chat_file(path: Path) -> list[Message]:
    """Parse every message in one HTML chat log.

    Args:
        path: Log file whose stem encodes the chat start time.

    Returns:
        The messages in document order.

    Raises:
        ValueError: If the file name is not a parseable timestamp or the
            document does not have the expected structure.
    """
    # Imported here because this is the only function that needs the
    # third-party parser; the rest of the module stays importable without it.
    import bs4

    chat_start = _parse_chat_start(path)

    # Binary mode lets BeautifulSoup detect the document's own charset
    # instead of relying on the platform default encoding. The parser is
    # named explicitly so results do not depend on what else is installed.
    with path.open('rb') as f:
        soup = bs4.BeautifulSoup(f, 'html.parser')

    if soup.body is None:
        raise ValueError(f'No <body> element in chat log: {path}')

    return _collect_messages(soup.body.children, chat_start)


def parse_messages_in_chat_folder(chat_folder_path: Path) -> list[Message]:
    """Parse all ``.html`` chat logs directly inside a folder.

    Args:
        chat_folder_path: Directory to scan (not recursive).

    Returns:
        All messages from all logs, sorted chronologically. Sub-directories
        and files without an ``.html`` extension are skipped.
    """
    messages: list[Message] = []
    for file_path in sorted(chat_folder_path.iterdir()):
        if file_path.is_file() and file_path.suffix.lower() == '.html':
            messages.extend(parse_messages_in_chat_file(file_path))

    messages.sort()
    return messages


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Print every message found in a chat folder, one per line.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The single optional argument is the chat
            folder, which defaults to the current directory.
    """
    parser = argparse.ArgumentParser(
        prog='libpurple_to_markdown',
        description='Print the messages found in a folder of HTML chat logs.',
    )
    parser.add_argument(
        'chat_folder',
        nargs='?',
        type=Path,
        default=Path(),
        help='folder containing the .html chat logs (default: current directory)',
    )
    args = parser.parse_args(argv)

    for message in parse_messages_in_chat_folder(args.chat_folder):
        print(f'({message.sent_at}) {message.sender} : {message.text}')


if __name__ == '__main__':
    main()