"""Parse a folder of HTML chat logs into chronologically sorted messages.

Each log file is named after the moment its chat started: an ISO-8601
timestamp, optionally followed by a time-zone abbreviation such as ``CEST``.
Inside ``<body>`` every message is a run of nodes terminated by ``<br>``.

NOTE(review): the layout looks like Pidgin's HTML log format -- confirm.
"""

import dataclasses
import datetime
import re
from pathlib import Path

import bs4

# "(HH:MM:SS)" as it appears at the start of a message header.
_TIMESTAMP_PATTERN = re.compile(r'\((\d+):(\d+):(\d+)\)')

# Trailing alphabetic time-zone abbreviation in a log file stem ("CEST",
# "CET", ...). Two or more letters are required so that a lone ISO-8601 "Z"
# designator is left for ``fromisoformat`` to interpret.
_TZ_ABBREVIATION_PATTERN = re.compile(r'[A-Za-z]{2,}$')


@dataclasses.dataclass(frozen=True, order=True)
class Message:
    """A single chat message.

    Instances are immutable and compare by ``(sent_at, sender, text)``, so a
    list of messages sorts chronologically.

    Raises:
        ValueError: If any field is ``None``.
    """

    # Moment the message was sent.
    sent_at: datetime.datetime
    # Sender name as shown in the log, without the trailing colon.
    sender: str
    # Message body (or ``<url>`` when the body was a link).
    text: str

    def __post_init__(self) -> None:
        # Raise explicitly instead of using ``assert``: assertions are
        # stripped under ``python -O`` and incomplete messages would then
        # slip through unnoticed.
        for field in dataclasses.fields(self):
            if getattr(self, field.name) is None:
                raise ValueError(f'Message.{field.name} must not be None')


def datetime_sent(chat_start: datetime.datetime, message_sent: datetime.time) -> datetime.datetime:
    """Attach a date to a message's wall-clock time.

    The log only records the time of day of each message, so the date is
    taken from the chat start. A message cannot precede the chat it belongs
    to; a time of day earlier than the chat start therefore means the chat
    ran past midnight and the message belongs to the following day.

    Only a single midnight crossing can be inferred this way.

    Args:
        chat_start: Moment the chat started; its ``tzinfo`` is carried over.
        message_sent: Time of day shown next to the message.

    Returns:
        The first datetime at or after ``chat_start``'s date and time of day
        that has ``message_sent`` as its time of day.
    """
    sent_at = datetime.datetime.combine(chat_start.date(), message_sent, chat_start.tzinfo)
    if message_sent < chat_start.time():
        # Rolled over midnight: move forward, never back before the start.
        sent_at += datetime.timedelta(days=1)
    return sent_at


def _parse_message_header(header: bs4.Tag, chat_start: datetime.datetime, path: Path) -> tuple[datetime.datetime, str]:
    """Extract ``(sent_at, sender)`` from a message's ``<font>`` header."""
    if header.font is None or header.b is None:
        raise ValueError(f'{path}: malformed message header: {header}')

    match = _TIMESTAMP_PATTERN.match(header.font.get_text())
    if match is None:
        raise ValueError(f'{path}: unrecognised timestamp in message header: {header}')

    hour, minute, second = (int(part) for part in match.groups())
    sent_at = datetime_sent(chat_start, datetime.time(hour, minute, second))
    sender = header.b.get_text().strip().removesuffix(':')
    return sent_at, sender


def parse_messages_in_chat_file(path: Path) -> list[Message]:
    """Parse every message in a single HTML chat log.

    Args:
        path: Log file whose stem is the chat start as an ISO-8601 timestamp,
            optionally followed by a time-zone abbreviation.

    Returns:
        The messages in document order.

    Raises:
        ValueError: If the file name is not a valid timestamp, the document
            has no ``<body>``, a message header is malformed, an unexpected
            element is encountered, or a ``<br>`` closes a message that has
            no header or no text.
    """
    chat_start = datetime.datetime.fromisoformat(_TZ_ABBREVIATION_PATTERN.sub('', path.stem))

    # Binary mode lets BeautifulSoup detect the document's encoding itself
    # instead of relying on the locale's default text encoding.
    # NOTE: no parser is named, so BeautifulSoup picks the best one installed.
    with open(path, 'rb') as f:
        soup = bs4.BeautifulSoup(f)
    if soup.body is None:
        raise ValueError(f'{path}: no <body> element found')

    messages = []
    cur_sent_at: datetime.datetime | None = None
    cur_sender: str | None = None
    cur_text: str | None = None
    for node in soup.body.children:
        if node.name == 'font':
            # Header: nested <font> holds the timestamp, <b> holds the sender.
            cur_sent_at, cur_sender = _parse_message_header(node, chat_start, path)
        elif node.name is None:
            # Bare text node. NOTE: each text/link node overwrites the
            # previous one, so only the last node before <br> is kept.
            cur_text = node.get_text()
        elif node.name == 'a':
            cur_text = '<' + node['href'] + '>'
        elif node.name == 'br':
            # <br> terminates the current message.
            messages.append(Message(cur_sent_at, cur_sender, cur_text))
            cur_sent_at = None
            cur_sender = None
            cur_text = None
        elif node.name == 'h3':
            pass  # Ignore log header
        else:
            # Explicit raise so unknown markup is still rejected under -O.
            raise ValueError(f'{path}: unexpected element in chat log: {node}')
    return messages


def parse_messages_in_chat_folder(chat_folder_path: Path) -> list[Message]:
    """Parse every chat log directly inside a folder.

    Sub-directories are skipped; every regular file is parsed as a chat log.

    Args:
        chat_folder_path: Folder containing the log files.

    Returns:
        All messages from all files, sorted by ``(sent_at, sender, text)``.
    """
    messages: list[Message] = []
    for file_path in chat_folder_path.iterdir():
        if not file_path.is_file():
            continue
        messages.extend(parse_messages_in_chat_file(file_path))
    messages.sort()
    return messages


def main(chat_folder_path: Path | None = None) -> None:
    """Print every message found in a chat folder, oldest first.

    Args:
        chat_folder_path: Folder to read; defaults to the current directory.
    """
    if chat_folder_path is None:
        chat_folder_path = Path()
    for message in parse_messages_in_chat_folder(chat_folder_path):
        print(f'({message.sent_at}) {message.sender} : {message.text}')


if __name__ == '__main__':
    main()