1
0
libpurple-to-markdown/libpurple_to_markdown/libpurple.py

108 lines
2.8 KiB
Python
Raw Normal View History

"""Backend for Pidgin/LibPurple.
[Pidgin](https://pidgin.im/) is a multi-protocol instant messaging app. It
stores logs as either plain text files, or as HTML files (default).
This backend parses the HTML files, focusing on the IRC protocol-style logs.
**This backend is not actively maintained.**
"""
2024-10-31 21:38:22 +00:00
import datetime
import logging
import re
from pathlib import Path

import bs4

from .data import Message
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
# Matches a leading "(H:M:S)" timestamp; each group is one numeric field.
_TIMESTAMP_RE = re.compile(r'\((\d+):(\d+):(\d+)\)')


def parse_timestamp(c) -> datetime.time:
    """Extract the leading ``(HH:MM:SS)`` timestamp from a log element.

    Args:
        c: Element exposing ``get_text()``. Its text must start with a
            timestamp of the form ``(HH:MM:SS)``; anything after the
            closing parenthesis is ignored.

    Returns:
        The parsed time of day (no date, no timezone).

    Raises:
        ValueError: If the text does not start with a timestamp, or if the
            numeric fields are out of range for ``datetime.time``.
    """
    # The text of the element itself is matched (not that of a nested
    # ``<font>``): ``get_text()`` already includes nested text, and the
    # pattern is anchored at the start of the string.
    text = c.get_text()
    m = _TIMESTAMP_RE.match(text)
    if m is None:
        # Fail with the offending text instead of an opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError(f'No "(HH:MM:SS)" timestamp at start of {text!r}')
    hour, minute, second = (int(field) for field in m.groups())
    return datetime.time(hour, minute, second)
2024-10-31 19:36:38 +00:00
def parse_messages_in_chat_file(path: Path, chat_id: str) -> list[Message]:
    """Parse a single HTML chat log file into messages.

    The start of the chat is read from the file name: the stem, minus a
    trailing ``CET``/``CEST``, must be an ISO-8601 date-time. Each log line
    is a run of sibling elements terminated by ``<br>``; the first
    ``<font>``/``<span>`` of a line carries the ``(HH:MM:SS)`` timestamp and,
    in a nested ``<b>``, the sender.

    Args:
        path: HTML log file to parse.
        chat_id: Identifier stored on every returned message.

    Returns:
        The messages in file order. Lines for which no sender was found are
        dropped. A file without any content yields an empty list.

    Raises:
        ValueError: If the file name is not an ISO-8601 date-time.
        AssertionError: If the log contains an element this parser does not
            know how to handle.
    """
    logger.info('Parsing %s', path)
    chat_start = datetime.datetime.fromisoformat(
        path.stem.removesuffix('CEST').removesuffix('CET'),
    )

    # Be explicit about the encoding rather than depending on the locale.
    # NOTE(review): assumes the logs are written as UTF-8 -- confirm.
    with open(path, encoding='utf-8') as f:
        soup = bs4.BeautifulSoup(f, 'lxml')

    if not soup.contents:
        logger.warning('File is empty?')
        return []

    messages: list[Message] = []
    cur_sent_at: datetime.datetime | None = None
    cur_sender: str | None = None
    text_parts: list[str] = []

    # The log lines are either wrapped in a single <p> or sit directly in
    # <body>.
    if soup.body.p:
        loglines = soup.body.p.children
    else:
        loglines = soup.body.children

    for node in loglines:
        if node.name in {'font', 'span'} and cur_sent_at is None:
            # First styled element of a line: timestamp, optionally sender.
            # NOTE(review): ``datetime_sent`` is not defined or imported in
            # the visible part of this module -- confirm it is in scope.
            cur_sent_at = datetime_sent(chat_start, parse_timestamp(node))
            if node.b:
                assert cur_sender is None
                cur_sender = (
                    node.b.get_text()
                    .strip()
                    .removesuffix(':')
                    .removeprefix('***')
                    .removesuffix('[m]')
                )
        elif node.name in {None, 'span', 'font'}:
            # Bare text (name is None) or further styled text: message body.
            text_parts.append(node.get_text())
        elif node.name == 'a':
            # Render links as <url>.
            text_parts.append('<' + node['href'] + '>')
        elif node.name == 'br':
            # End of the log line: emit it if it had a sender, then reset
            # the per-line state either way.
            if cur_sender:
                messages.append(
                    Message(
                        cur_sent_at,
                        cur_sender,
                        ''.join(text_parts).strip(),
                        chat_id,
                    ),
                )
            cur_sent_at = None
            cur_sender = None
            text_parts = []
        elif node.name == 'b':
            # Indicates system message. Ignore
            pass
        elif node.name in {'h1', 'h3'}:
            pass  # Ignore log header
        else:
            # Raised explicitly (not ``assert False``) so unknown elements
            # are still reported when Python runs with -O.
            raise AssertionError(node)

    return messages
def parse_messages_in_chat_folder(chat_folder_path: Path) -> list[Message]:
    """Parse every log file in a chat folder into one sorted message list.

    The chat id attached to each message is ``"<server> - <recipient>"``,
    where the recipient is the name of ``chat_folder_path`` and the server
    is the name of its parent directory.

    Args:
        chat_folder_path: Directory whose entries are the log files of one
            chat.

    Returns:
        All messages from all files in the folder, sorted with
        ``list.sort()`` (i.e. by ``Message``'s own ordering).
    """
    # Derive the id from the folder that was actually passed in, not from an
    # ``args`` object that does not exist in this scope.
    server = chat_folder_path.parent.name
    recipient = chat_folder_path.name
    chat_id = f'{server} - {recipient}'

    messages: list[Message] = []
    for file_path in sorted(chat_folder_path.iterdir()):
        messages.extend(parse_messages_in_chat_file(file_path, chat_id))
    messages.sort()
    return messages