libpurple_to_markdown: tolerate more libpurple HTML log variants

* add parse_timestamp() and use it for <font> and <span> header elements
* strip a trailing 'CET' as well as 'CEST' from the log file name
* parse with the lxml backend; return [] for empty files and files with a <p>
* accumulate message text across text/span/font nodes, append links as <href>
* ignore <b> (system message) and <h1> elements
* configure root logging at INFO in main()

NOTE(review): indentation and blank context lines were reconstructed from a
whitespace-collapsed copy of this patch, and the post-image hashes on the
"index" lines predate the two corrections listed here. Verify before applying.
Corrections: parse_timestamp now really reads the nested <font> when present,
and the <a> branch no longer appends cur_text to itself.

diff --git a/libpurple_to_markdown/__init__.py b/libpurple_to_markdown/__init__.py
index 2aa37dd..6785a14 100644
--- a/libpurple_to_markdown/__init__.py
+++ b/libpurple_to_markdown/__init__.py
@@ -24,13 +24,23 @@ def datetime_sent(chat_start: datetime.datetime, message_sent: datetime.time) ->
         naive = naive - datetime.timedelta(days=1)
     return naive

+def parse_timestamp(c) -> datetime.time:
+    timestamp_obj = c
+    if c.font is not None:
+        timestamp_obj = c.font  # read the nested <font> when present, as the old inline code did
+    m = re.match(r'\((\d+):(\d+):(\d+)\)', timestamp_obj.get_text())
+    return datetime.time(int(m.group(1)), int(m.group(2)), int(m.group(3)))
 def parse_messages_in_chat_file(path: Path) -> list[Message]:
     logger.info('Parsing %s', path)

-    chat_start = datetime.datetime.fromisoformat(path.stem.removesuffix('CEST'))
+    chat_start = datetime.datetime.fromisoformat(path.stem.removesuffix('CEST').removesuffix('CET'))

     with open(path) as f:
-        soup = bs4.BeautifulSoup(f)
+        soup = bs4.BeautifulSoup(f, 'lxml')
+
+    if len(soup.contents) == 0:
+        logger.warning('File is empty?')
+        return []

     messages = []

@@ -38,21 +48,26 @@ def parse_messages_in_chat_file(path: Path) -> list[Message]:
     cur_sender: str = 'NOT DEFINED'
     cur_text: str = ''

-    for c in soup.body.children:
-        if c.name == 'font':
+    if soup.p:
+        logger.warning('File indicates error message?')
+        return [] # TODO
+    else:
+        loglines = soup.body.children
+
+    for c in loglines:
+        if c.name in {'font','span'} and cur_sent_at is None:
             # Get timestamp
-            m = re.match(r'\((\d+):(\d+):(\d+)\)', c.font.get_text())
-            time_sent = datetime.time(int(m.group(1)), int(m.group(2)), int(m.group(3)))
-            cur_sent_at = datetime_sent(chat_start, time_sent)
+            cur_sent_at = datetime_sent(chat_start, parse_timestamp(c))

             # Get sender
-            cur_sender = c.b.get_text().strip().removesuffix(':')
+            if c.b:
+                cur_sender = c.b.get_text().strip().removesuffix(':')

-        elif c.name is None:
-            cur_text = c.get_text()
+        elif c.name in {None,'span','font'}:
+            cur_text += c.get_text()

         elif c.name == 'a':
-            cur_text = cur_text + '<' + c['href'] + '>'
+            cur_text += '<' + c['href'] + '>'  # append only the link; += already keeps the text so far

         elif c.name == 'br':
             messages.append(Message(cur_sent_at, cur_sender, cur_text.strip()))
@@ -60,7 +75,11 @@ def parse_messages_in_chat_file(path: Path) -> list[Message]:
             cur_sender = 'NOT DEFINED'
             cur_text = ''

-        elif c.name == 'h3':
+        elif c.name == 'b':
+            # Indicates system message. Ignore
+            pass
+
+        elif c.name in {'h1', 'h3'}:
             pass # Ignore log header


diff --git a/libpurple_to_markdown/__main__.py b/libpurple_to_markdown/__main__.py
index b1035bb..5b4ea0b 100644
--- a/libpurple_to_markdown/__main__.py
+++ b/libpurple_to_markdown/__main__.py
@@ -1,4 +1,5 @@
 import argparse
+import logging
 from pathlib import Path

 from . import (parse_messages_in_chat_folder, merge_adjacent_messages,
@@ -10,6 +11,8 @@ def parse_args():
     return parser.parse_args()

 def main():
+    logging.basicConfig()
+    logging.getLogger().setLevel('INFO')
     args = parse_args()

     messages = parse_messages_in_chat_folder(args.path)