"""Parse HTML chat logs into messages and render them as Markdown-style text."""

import dataclasses
import datetime
import logging
import re
from pathlib import Path

import bs4

logger = logging.getLogger(__name__)

# Matches the ``(HH:MM:SS)`` timestamp expected at the start of a log record.
_TIMESTAMP_RE = re.compile(r'\((\d+):(\d+):(\d+)\)')

# Characters after which merge_texts() does not insert an extra full stop.
_TERMINAL_PUNCTUATION = ('.', '?', '!', ',', ':')


@dataclasses.dataclass(frozen=True, order=True)
class Message:
    """A single chat message.

    Instances are immutable and compare by ``(sent_at, sender, text)``, so
    sorting a list of messages orders them by the time they were sent.

    Raises:
        ValueError: If any field is ``None``.
    """

    sent_at: datetime.datetime
    sender: str
    text: str

    def __post_init__(self):
        # Explicit raises rather than ``assert`` so the checks survive ``-O``.
        for field in dataclasses.fields(self):
            if getattr(self, field.name) is None:
                raise ValueError(f'Message.{field.name} must not be None')


def datetime_sent(chat_start: datetime.datetime, message_sent: datetime.time) -> datetime.datetime:
    """Combine a message's time of day with the date of the chat start.

    The date and ``tzinfo`` are taken from *chat_start*. If the message's
    time of day is earlier than the chat's start time, the result is moved
    back by one day.

    Args:
        chat_start: When the chat started.
        message_sent: Time of day attached to the message.

    Returns:
        The full datetime assigned to the message.
    """
    sent_at = datetime.datetime.combine(chat_start.date(), message_sent, chat_start.tzinfo)
    # NOTE(review): an earlier time of day is dated to the *previous* day. If
    # such messages are really ones sent after midnight in a chat that began
    # the evening before, this should add a day instead -- confirm against
    # real logs before changing.
    if chat_start.time() > message_sent:
        sent_at -= datetime.timedelta(days=1)
    return sent_at


def parse_timestamp(c) -> datetime.time:
    """Extract the leading ``(HH:MM:SS)`` timestamp from a log element.

    The element's own text is tried first; if it does not start with a
    timestamp, the text of its first nested ``<font>`` (if any) is tried.

    Args:
        c: Parsed element (a ``bs4`` tag) that opens a log record.

    Returns:
        The time of day encoded in the timestamp.

    Raises:
        ValueError: If no timestamp is found or its fields are out of range.
    """
    candidates = [c]
    if c.font is not None:
        candidates.append(c.font)
    for candidate in candidates:
        m = _TIMESTAMP_RE.match(candidate.get_text())
        if m is not None:
            hour, minute, second = (int(group) for group in m.groups())
            return datetime.time(hour, minute, second)
    raise ValueError(f'No (HH:MM:SS) timestamp found in {c!r}')


def parse_messages_in_chat_file(path: Path) -> list[Message]:
    """Parse one HTML chat log file into a list of messages.

    The file name (without extension, and without a trailing ``CEST``/``CET``)
    must be an ISO timestamp giving the start of the chat. The top-level
    children of ``<body>`` are walked in order:

    * the first ``<font>``/``<span>`` of a record carries the timestamp and,
      in a nested ``<b>``, the sender (a trailing ``:`` is removed);
    * text nodes and further ``<font>``/``<span>`` elements add to the text;
    * ``<a>`` elements contribute their link target as ``<href>``;
    * ``<br>`` ends the record;
    * top-level ``<b>`` (system messages) and ``<h1>``/``<h3>`` (log header)
      are ignored.

    Records that end without a sender or timestamp and without any text
    (system messages, blank lines) are skipped. Content after the last
    ``<br>`` is not returned.

    Args:
        path: Path of the log file.

    Returns:
        The messages in file order. Empty if the file is empty, has no
        ``<body>``, or contains a ``<p>`` element (treated as an error page).

    Raises:
        ValueError: If the file name is not a valid timestamp, a record has
            text but no timestamp or sender, a timestamp cannot be parsed,
            or an unexpected element is encountered.
    """
    logger.info('Parsing %s', path)
    chat_start = datetime.datetime.fromisoformat(path.stem.removesuffix('CEST').removesuffix('CET'))
    with open(path) as f:
        soup = bs4.BeautifulSoup(f, 'lxml')

    if not soup.contents:
        logger.warning('File is empty?')
        return []
    if soup.p is not None:
        logger.warning('File indicates error message?')
        return []  # TODO
    if soup.body is None:
        logger.warning('File has no body?')
        return []

    messages: list[Message] = []
    cur_sent_at: datetime.datetime | None = None
    cur_sender: str | None = None
    cur_text_parts: list[str] = []

    for c in soup.body.children:
        if c.name in {'font', 'span'} and cur_sent_at is None:
            # First styled element of a record: timestamp, optionally sender.
            cur_sent_at = datetime_sent(chat_start, parse_timestamp(c))
            if c.b is not None:
                cur_sender = c.b.get_text().strip().removesuffix(':')
        elif c.name in {None, 'span', 'font'}:
            # Plain text node (name is None) or styled message text.
            cur_text_parts.append(c.get_text())
        elif c.name == 'a':
            href = c.get('href')
            # Fall back to the link text when the anchor has no target.
            cur_text_parts.append(f'<{href}>' if href is not None else c.get_text())
        elif c.name == 'br':
            text = ''.join(cur_text_parts).strip()
            if cur_sent_at is not None and cur_sender is not None:
                messages.append(Message(cur_sent_at, cur_sender, text))
            elif text:
                # Refuse to silently drop text that cannot be attributed.
                raise ValueError(
                    f'Record in {path} has text but no timestamp/sender: {text!r}'
                )
            # Otherwise: system message or blank line, nothing to keep.
            cur_sent_at = None
            cur_sender = None
            cur_text_parts = []
        elif c.name == 'b':
            # Indicates system message. Ignore
            pass
        elif c.name in {'h1', 'h3'}:
            pass  # Ignore log header
        else:
            raise ValueError(f'Unexpected element in {path}: {c!r}')
    return messages


def parse_messages_in_chat_folder(chat_folder_path: Path) -> list[Message]:
    """Parse every log file in a folder and return all messages, sorted.

    Args:
        chat_folder_path: Directory whose entries are all chat log files.

    Returns:
        All messages from all files, sorted by ``Message`` ordering.
    """
    messages: list[Message] = []
    # Sorted only to make processing (and log output) order deterministic.
    for file_path in sorted(chat_folder_path.iterdir()):
        messages.extend(parse_messages_in_chat_file(file_path))
    messages.sort()
    return messages


def format_message_as_citation(out: list[str], msg: Message) -> None:
    """Append *msg* to *out* as a header line followed by a ``> `` quote block.

    Args:
        out: List of output fragments; extended in place.
        msg: Message to render.
    """
    out.append(f'{msg.sent_at.date()} {msg.sent_at.time()} [[{msg.sender}]]:')
    out.append('\n')
    out.extend(f'> {line}\n' for line in msg.text.split('\n'))
    out.append('\n')


def format_message_as_table(out: list[str], msg: Message) -> None:
    """Append *msg* to *out* as one table row.

    Newlines in the text are removed (lines are joined without a separator)
    so that the message stays on a single row.

    Args:
        out: List of output fragments; extended in place.
        msg: Message to render.
    """
    out.append(f'| {msg.sent_at} | [[{msg.sender}]] | ')
    out.extend(msg.text.split('\n'))
    out.append('|\n')


def format_messages(
    messages: list[Message],
    *,
    title: str = 'Chat 2018',
    as_table: bool = False,
) -> str:
    """Render messages as a document with one section per calendar day.

    A new section (``---`` rule plus ``## [[date]]`` heading) is started
    whenever the date differs from that of the previous message, so
    *messages* should already be sorted.

    Args:
        messages: Messages to render; may be empty.
        title: Text of the top-level heading.
        as_table: Render each day as a table instead of quote blocks.

    Returns:
        The rendered document.
    """
    out = [f'# {title}\n\n']
    format_message = format_message_as_table if as_table else format_message_as_citation
    prev_date = None
    for msg in messages:
        msg_date = msg.sent_at.date()
        if msg_date != prev_date:
            out.append('---\n')
            out.append(f'## [[{msg_date}]]\n\n')
            if as_table:
                out.append('| sent at | sender | text |\n')
                out.append('| ------- | ------ | ---- |\n')
            prev_date = msg_date
        format_message(out, msg)
    return ''.join(out)


MSG_ADJACENTCY_DIST = datetime.timedelta(minutes=2)


def is_adjacent_messages(first: Message, second: Message) -> bool:
    """Return whether *second* may be merged into *first*.

    True when both have the same sender and *second* was sent no more than
    ``MSG_ADJACENTCY_DIST`` after *first*.
    """
    return first.sender == second.sender and second.sent_at - first.sent_at <= MSG_ADJACENTCY_DIST


def merge_texts(text1: str, text2: str) -> str:
    """Join two message texts, putting *text2* on a new line.

    If *text1* does not already end in ``.``, ``?``, ``!``, ``,`` or ``:``, a
    full stop is added after it. If either text is empty, the other one is
    returned unchanged.
    """
    if not text1:
        return text2
    if not text2:
        return text1
    # endswith() needs a tuple to test several alternative suffixes; a plain
    # string would only match that exact character sequence.
    separator = ' ' if text1.endswith(_TERMINAL_PUNCTUATION) else '. '
    return text1 + separator + '\n' + text2


def merge_adjacent_messages(messages: list[Message]) -> list[Message]:
    """Collapse runs of adjacent messages from one sender into one message.

    A merged message keeps the timestamp of the first message of its run,
    so adjacency is always measured from that first message.

    Args:
        messages: Messages in the order they should be considered.

    Returns:
        A new list; the input list is not modified.
    """
    out: list[Message] = []
    for msg in messages:
        if out and is_adjacent_messages(out[-1], msg):
            out[-1] = dataclasses.replace(out[-1], text=merge_texts(out[-1].text, msg.text))
        else:
            out.append(msg)
    return out