"""Backend for Pidgin/LibPurple.

[Pidgin](https://pidgin.im/) is a multi-protocol instant messaging app. It
stores logs as either plain text files, or as HTML files (default). This
backend parses the HTML files, focusing on the IRC protocol-style logs.

**This backend is not actively maintained.**
"""

import datetime
import logging
import re
from pathlib import Path

import bs4

from .data import Message

logger = logging.getLogger(__name__)

# Every log line starts with a wall-clock timestamp such as ``(12:34:56)``.
_TIMESTAMP_RE = re.compile(r'\((\d+):(\d+):(\d+)\)')


def parse_timestamp(c: bs4.Tag) -> datetime.time:
    """Extract the ``(HH:MM:SS)`` timestamp that opens a log line.

    Args:
        c: The ``<font>``/``<span>`` element that starts a log line. The
            timestamp may sit directly in it or in a nested element;
            ``get_text()`` flattens nested text, so both layouts work.

    Returns:
        The time of day encoded in the timestamp.

    Raises:
        ValueError: If the element's text does not start with a timestamp.
    """
    text = c.get_text()
    m = _TIMESTAMP_RE.match(text)
    if m is None:
        raise ValueError(f'No (HH:MM:SS) timestamp at start of {text!r}')
    hour, minute, second = (int(group) for group in m.groups())
    return datetime.time(hour, minute, second)


def datetime_sent(
    chat_start: datetime.datetime,
    time_sent: datetime.time,
) -> datetime.datetime:
    """Combine a message's time of day with the date the chat started.

    NOTE(review): the original module called this helper without defining or
    importing it; this is a minimal implementation -- confirm the semantics.

    Args:
        chat_start: When the conversation started (from the log file name).
        time_sent: Time of day taken from the log line.

    Returns:
        A datetime on the chat's start date, carrying ``chat_start``'s
        timezone info (if any). A time of day earlier than the chat start is
        assumed to belong to the following day.
    """
    sent_at = datetime.datetime.combine(
        chat_start.date(),
        time_sent,
        tzinfo=chat_start.tzinfo,
    )
    # Log timestamps only have second resolution, so compare at that level.
    if sent_at < chat_start.replace(microsecond=0):
        sent_at += datetime.timedelta(days=1)
    return sent_at


def parse_messages_in_chat_file(path: Path, chat_id: str) -> list[Message]:
    """Parse all user messages from a single Pidgin HTML log file.

    Args:
        path: Log file. Its stem must be an ISO-8601 timestamp, optionally
            followed by a ``CET``/``CEST`` suffix.
        chat_id: Identifier stored on every produced message.

    Returns:
        The messages in file order. Lines without a sender (system
        messages) are skipped.

    Raises:
        ValueError: If the file name is not a parseable timestamp, a log
            line has a malformed timestamp, or an unexpected HTML element
            is encountered.
    """
    logger.info('Parsing %s', path)
    chat_start = datetime.datetime.fromisoformat(
        path.stem.removesuffix('CEST').removesuffix('CET'),
    )

    # Be explicit about the encoding instead of relying on the locale.
    # NOTE(review): assumes Pidgin writes its HTML logs as UTF-8 -- confirm.
    with open(path, encoding='utf-8') as f:
        soup = bs4.BeautifulSoup(f, 'lxml')

    if not soup.contents or soup.body is None:
        logger.warning('File is empty?')
        return []

    # Log lines are either wrapped in a single <p> or sit directly in <body>.
    loglines = soup.body.p.children if soup.body.p else soup.body.children

    messages: list[Message] = []
    cur_sent_at: datetime.datetime | None = None
    cur_sender: str | None = None
    text_parts: list[str] = []

    for c in loglines:
        if c.name in {'font', 'span'} and cur_sent_at is None:
            # First styled element of a line: carries the timestamp and,
            # for user messages, the sender in a nested <b>.
            cur_sent_at = datetime_sent(chat_start, parse_timestamp(c))
            if c.b:
                assert cur_sender is None
                cur_sender = (
                    c.b.get_text()
                    .strip()
                    .removesuffix(':')
                    .removeprefix('***')
                    .removesuffix('[m]')
                )
        elif c.name in {None, 'span', 'font'}:
            # Plain text node (name is None) or styled message text.
            text_parts.append(c.get_text())
        elif c.name == 'a':
            text_parts.append('<' + c['href'] + '>')
        elif c.name == 'br':
            # End of line: emit the message (if it had a sender) and reset.
            if cur_sender:
                messages.append(
                    Message(
                        cur_sent_at,
                        cur_sender,
                        ''.join(text_parts).strip(),
                        chat_id,
                    ),
                )
            cur_sent_at = None
            cur_sender = None
            text_parts = []
        elif c.name == 'b':
            # Indicates system message. Ignore
            pass
        elif c.name in {'h1', 'h3'}:
            pass  # Ignore log header
        else:
            # Explicit raise instead of ``assert False`` so the check
            # survives ``python -O``.
            raise ValueError(f'Unexpected element in {path}: {c!r}')

    return messages


def parse_messages_in_chat_folder(chat_folder_path: Path) -> list[Message]:
    """Parse every log file in one chat folder.

    The chat id is built as ``'<server> - <recipient>'`` from the name of the
    folder's parent and the folder's own name.

    Args:
        chat_folder_path: Folder holding the log files of a single chat.

    Returns:
        All messages from all files in the folder, sorted.
    """
    # The original read these from an undefined global ``args``; derive them
    # from the folder that is actually being parsed.
    server = chat_folder_path.parent.name
    recipient = chat_folder_path.name
    chat_id = f'{server} - {recipient}'

    messages: list[Message] = []
    for file_path in sorted(chat_folder_path.iterdir()):
        messages.extend(parse_messages_in_chat_file(file_path, chat_id))
    messages.sort()
    return messages