2024-10-26 13:17:31 +00:00
|
|
|
"""Libpurple to markdown conversion script.
|
|
|
|
|
|
|
|
Conversion script for HTML-based logs from [Pidgin/Libpurple](https://pidgin.im/) chat program.
|
|
|
|
|
|
|
|
**This is a one-off script and is not actively maintained.**
|
|
|
|
|
|
|
|
## Motivation
|
|
|
|
|
|
|
|
I recently discovered [Obsidian](https://obsidian.md) and liked the prospect of
|
|
|
|
cross-referencing my notes with my old chat logs. Libpurple uses HTML logs if
|
|
|
|
you haven't configured it to use something else (which I hadn't).
|
|
|
|
|
|
|
|
I no longer use IRC or Pidgin as my entire friend group have switched to using
|
|
|
|
Matrix.
|
|
|
|
|
|
|
|
## Usage
|
|
|
|
|
|
|
|
From the repository root:
|
|
|
|
|
|
|
|
```bash
|
|
|
|
python -m libpurple_to_markdown LOG_DIRECTORY --output OUTPUT_FOLDER
|
|
|
|
```
|
|
|
|
|
|
|
|
It was made specifically for import into Obsidian, so it might not suit your
|
|
|
|
purposes, but it shouldn't be too difficult to adjust.
|
|
|
|
"""
|
2024-10-26 13:17:46 +00:00
|
|
|
|
2024-10-25 23:12:30 +00:00
|
|
|
import dataclasses
|
|
|
|
import datetime
|
2024-10-26 11:47:39 +00:00
|
|
|
import logging
|
|
|
|
import re
|
2024-10-26 13:17:46 +00:00
|
|
|
from collections.abc import Iterable, Iterator
|
2024-10-25 23:12:30 +00:00
|
|
|
from pathlib import Path
|
2024-10-26 11:47:39 +00:00
|
|
|
|
2024-10-25 23:12:30 +00:00
|
|
|
import bs4
|
|
|
|
|
2024-10-26 11:47:23 +00:00
|
|
|
from ._version import __version__
|
2024-10-26 11:47:39 +00:00
|
|
|
|
2024-10-26 11:47:23 +00:00
|
|
|
# Names exported by a star-import of this module: only the version string.
__all__ = ['__version__']
|
|
|
|
|
2024-10-25 23:12:30 +00:00
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
2024-10-26 11:47:39 +00:00
|
|
|
|
2024-10-25 23:12:30 +00:00
|
|
|
@dataclasses.dataclass(frozen=True, order=True)
class Message:
    """A single chat message.

    Instances are immutable.  They compare and sort field by field in
    declaration order, i.e. by ``sent_at`` first, then ``sender``, then
    ``text``.
    """

    sent_at: datetime.datetime  # when the message was sent
    sender: str  # who sent it
    text: str  # message body

    def __post_init__(self) -> None:
        # Sanity check: every field must have been given a real value.
        for field in dataclasses.fields(self):
            assert getattr(self, field.name) is not None
|
|
|
|
|
2024-10-26 11:47:39 +00:00
|
|
|
|
|
|
|
def datetime_sent(
    chat_start: datetime.datetime,
    message_sent: datetime.time,
) -> datetime.datetime:
    """Turn a message's time of day into a full timestamp.

    The time of day is placed on the date the chat started, using the chat
    start's ``tzinfo``.  A time of day earlier than the chat's own start time
    is taken to belong to the following day.  Only a single day rollover is
    accounted for.

    Args:
        chat_start: When the chat started.
        message_sent: Time of day at which the message was sent.

    Returns:
        The combined date and time for the message.
    """
    same_day = datetime.datetime.combine(
        chat_start.date(),
        message_sent,
        chat_start.tzinfo,
    )
    if message_sent < chat_start.time():
        # Earlier on the clock than the chat start: must be past midnight.
        return same_day + datetime.timedelta(days=1)
    return same_day
|
|
|
|
|
2024-10-26 11:47:39 +00:00
|
|
|
|
2024-10-25 23:30:51 +00:00
|
|
|
def parse_timestamp(c) -> datetime.time:
    """Parse the ``(H:M:S)`` timestamp that starts a log line element.

    Args:
        c: Object exposing ``get_text()`` (such as a BeautifulSoup tag) whose
            text begins with a parenthesised ``hours:minutes:seconds`` stamp.
            Text after the closing parenthesis is ignored.

    Returns:
        The parsed time of day.

    Raises:
        ValueError: If the text does not start with such a timestamp, or if
            its fields are out of range for a time of day.
    """
    text = c.get_text()
    m = re.match(r'\((\d+):(\d+):(\d+)\)', text)
    if m is None:
        # Fail with the offending text instead of an opaque AttributeError.
        raise ValueError(f'No (H:M:S) timestamp at start of {text!r}')
    hour, minute, second = (int(part) for part in m.groups())
    return datetime.time(hour, minute, second)
|
2024-10-25 23:12:30 +00:00
|
|
|
|
2024-10-26 11:47:39 +00:00
|
|
|
|
2024-10-25 23:12:30 +00:00
|
|
|
def parse_messages_in_chat_file(path: Path) -> list[Message]:
    """Parse one HTML chat log file into a list of messages.

    The start of the chat is read from the file name: its stem, minus a
    trailing ``CEST``/``CET`` marker, is parsed as an ISO 8601 timestamp.
    The log lines themselves only carry a time of day, which is combined
    with that start via ``datetime_sent``.

    A log line is finished by a ``<br>`` element.  Lines for which no sender
    was found (no bold element next to the timestamp, or an empty name) are
    dropped.

    Args:
        path: Path to a single HTML log file.

    Returns:
        The messages in the order they appear in the file; an empty list if
        the file has no content.

    Raises:
        ValueError: If the file name cannot be parsed as a timestamp.
        AssertionError: If the log contains an element this parser does not
            know how to handle.
    """
    logger.info('Parsing %s', path)
    chat_start = datetime.datetime.fromisoformat(
        path.stem.removesuffix('CEST').removesuffix('CET'),
    )

    # NOTE(review): the logs are assumed to be UTF-8 encoded; decode them
    # explicitly instead of depending on the locale's default encoding.
    with open(path, encoding='utf-8') as f:
        soup = bs4.BeautifulSoup(f, 'lxml')

    if not soup.contents:
        logger.warning('File is empty?')
        return []

    body = soup.body
    if body is None:
        # Nothing to iterate over; treat like an empty file.
        logger.warning('File has no body: %s', path)
        return []

    messages = []

    # State of the log line currently being assembled.
    cur_sent_at: datetime.datetime | None = None
    cur_sender: str | None = None
    cur_text: str = ''

    # The log lines are either wrapped in a single <p> or sit directly in
    # <body>.
    loglines = body.p.children if body.p else body.children

    for c in loglines:
        if c.name in {'font', 'span'} and cur_sent_at is None:
            # First font/span of a line: holds the timestamp ...
            cur_sent_at = datetime_sent(chat_start, parse_timestamp(c))

            # ... and, for regular messages, the sender in bold.
            if c.b:
                assert cur_sender is None
                cur_sender = (
                    c.b.get_text()
                    .strip()
                    .removesuffix(':')
                    .removeprefix('***')
                    .removesuffix('[m]')
                )

        elif c.name in {None, 'span', 'font'}:
            # Plain text node or further styled text: part of the message.
            cur_text += c.get_text()

        elif c.name == 'a':
            # Keep only the link target, written as a Markdown autolink.
            cur_text += '<' + c['href'] + '>'

        elif c.name == 'br':
            # End of the log line: emit it if it had a sender, then reset.
            if cur_sender:
                messages.append(Message(cur_sent_at, cur_sender, cur_text.strip()))
            cur_sent_at = None
            cur_sender = None
            cur_text = ''

        elif c.name == 'b':
            # Indicates system message. Ignore
            pass

        elif c.name in {'h1', 'h3'}:
            pass  # Ignore log header

        else:
            # Raised explicitly so the check also holds when asserts are
            # disabled (python -O).
            raise AssertionError(c)

    return messages
|
|
|
|
|
|
|
|
|
|
|
|
def parse_messages_in_chat_folder(chat_folder_path: Path) -> list[Message]:
    """Parse every log file in a chat folder into one sorted message list.

    Every directory entry is handed to ``parse_messages_in_chat_file``, in
    sorted path order.

    Args:
        chat_folder_path: Directory holding the log files of one chat.

    Returns:
        All messages of the folder, sorted by the ordering of ``Message``.
    """
    collected = [
        message
        for log_file in sorted(chat_folder_path.iterdir())
        for message in parse_messages_in_chat_file(log_file)
    ]
    collected.sort()
    return collected
|
|
|
|
|
|
|
|
|
2024-10-26 12:53:31 +00:00
|
|
|
def format_message_as_citation(out: list[str], msg: Message) -> None:
    """Append ``msg`` to ``out`` as a header line plus a Markdown blockquote.

    The header holds the date, the time and the sender as a ``[[wikilink]]``.
    Each line of the message text becomes one ``> `` quote line, and a blank
    line closes the quote.

    Args:
        out: List of output fragments; extended in place.
        msg: The message to render.
    """
    out.append(f'{msg.sent_at.date()} {msg.sent_at.time()} [[{msg.sender}]]:')
    out.append('\n')
    for raw_line in msg.text.split('\n'):
        # Wrap <tag-like> runs and "$$$" in backticks, presumably so the
        # Markdown renderer shows them literally.
        tags_quoted = re.sub(r'(<[\w ]+>)', r'`\1`', raw_line)
        fully_quoted = re.sub(r'(\$\$\$)', r'`\1`', tags_quoted)
        out.append(f'> {fully_quoted}\n')
    out.append('\n')
|
|
|
|
|
2024-10-26 11:47:39 +00:00
|
|
|
|
2024-10-26 12:53:31 +00:00
|
|
|
def format_message_as_table(out: list[str], msg: Message) -> None:
    """Append ``msg`` to ``out`` as one row of a Markdown table.

    The row has three cells: the timestamp, the sender as a ``[[wikilink]]``
    and the message text.  The lines of the text are concatenated without a
    separator so the row stays on a single line.

    Args:
        out: List of output fragments; extended in place.
        msg: The message to render.
    """
    out.append(f'| {msg.sent_at} | [[{msg.sender}]] | ')
    for line in msg.text.split('\n'):
        # A bare pipe would end the cell and corrupt the row, so escape it.
        out.append(line.replace('|', r'\|'))
    out.append('|\n')
|
|
|
|
|
2024-10-26 11:47:39 +00:00
|
|
|
|
2024-10-26 12:53:31 +00:00
|
|
|
def format_messages(
    messages: list[Message],
    title: str,
    *,
    as_table: bool = False,
) -> str:
    """Render messages as a Markdown document.

    The document starts with ``title`` as a level-1 heading.  Whenever the
    date changes from one message to the next (and before the first
    message), a horizontal rule and a level-2 heading with the date as a
    ``[[wikilink]]`` are inserted.

    Args:
        messages: Messages to render, in output order.
        title: Document title.
        as_table: If true, render each day as a Markdown table; otherwise
            (the default) render every message as a blockquote.

    Returns:
        The complete Markdown text.  For an empty ``messages`` list this is
        just the title heading.
    """
    out = ['# ', title, '\n\n']

    previous_date = None
    for msg in messages:
        sent_date = msg.sent_at.date()
        if sent_date != previous_date:
            # New day: start a new section.
            out.append('---\n')
            out.append(f'## [[{sent_date}]]\n\n')
            if as_table:
                out.append('| sent at | sender | text |\n')
                out.append('| ------- | ------ | ---- |\n')
            previous_date = sent_date

        if as_table:
            format_message_as_table(out, msg)
        else:
            format_message_as_citation(out, msg)

    return ''.join(out)
|
|
|
|
|
2024-10-26 11:47:39 +00:00
|
|
|
|
2024-10-25 23:12:30 +00:00
|
|
|
# Largest time gap between two messages for is_adjacent_messages to still
# treat them as adjacent.
MSG_ADJACENTCY_DIST = datetime.timedelta(minutes=2)
|
|
|
|
|
2024-10-26 13:17:46 +00:00
|
|
|
|
2024-10-26 12:53:31 +00:00
|
|
|
def is_useless_message(msg: Message) -> bool:
    """Tell whether ``msg`` should be left out of the output.

    A message counts as useless when its sender is the empty string or when
    the sender name ends with ``<AUTO-REPLY>``.
    """
    sender = msg.sender
    if sender == '':
        return True
    return sender.endswith('<AUTO-REPLY>')
|
2024-10-26 11:47:39 +00:00
|
|
|
|
2024-10-26 13:17:46 +00:00
|
|
|
|
2024-10-26 12:53:31 +00:00
|
|
|
def filter_useless_messages(messages: Iterable[Message]) -> Iterator[Message]:
    """Yield the messages that ``is_useless_message`` does not reject.

    Rejected messages are reported through the module logger at debug level
    rather than printed to standard output.

    Args:
        messages: Messages to filter.

    Yields:
        The kept messages, in their original order.
    """
    for msg in messages:
        if is_useless_message(msg):
            logger.debug('Dropping useless message: %s', msg.text)
            continue
        yield msg
|
|
|
|
|
2024-10-26 13:17:46 +00:00
|
|
|
|
2024-10-26 12:53:31 +00:00
|
|
|
def is_adjacent_messages(first: Message, second: Message) -> bool:
    """Tell whether ``second`` follows ``first`` closely enough to be merged.

    Two messages are adjacent when they have the same sender and ``second``
    was sent no more than ``MSG_ADJACENTCY_DIST`` after ``first``.
    """
    if first.sender != second.sender:
        return False
    gap = second.sent_at - first.sent_at
    return gap <= MSG_ADJACENTCY_DIST
|
|
|
|
|
2024-10-25 23:12:30 +00:00
|
|
|
|
2024-10-26 00:08:45 +00:00
|
|
|
def merge_texts(text1: str, text2: str) -> str:
    """Join two message texts into one, with ``text2`` on a new line.

    If ``text1`` does not already end in one of ``. ? ! , :``, a full stop
    is added after it so the merged text still reads as separate sentences.

    Args:
        text1: The earlier text.
        text2: The later text.

    Returns:
        ``text1``, a separator (``' '`` or ``'. '``), a newline and
        ``text2``.
    """
    # endswith() needs a tuple to test several alternative suffixes; a plain
    # string would be matched as one literal suffix.
    punctuated = text1.endswith(('.', '?', '!', ',', ':'))
    return text1 + (' ' if punctuated else '. ') + '\n' + text2
|
|
|
|
|
2024-10-26 11:47:39 +00:00
|
|
|
|
2024-10-26 12:53:31 +00:00
|
|
|
def merge_adjacent_messages(messages: Iterable[Message]) -> list[Message]:
    """Collapse runs of adjacent messages into single messages.

    Each message is compared with the last entry of the result using
    ``is_adjacent_messages``.  When adjacent, its text is folded into that
    entry with ``merge_texts``; the entry keeps its own timestamp and
    sender.  Otherwise the message starts a new entry.

    Args:
        messages: Messages in the order they should be considered.

    Returns:
        A new list with adjacent messages merged.
    """
    merged = []
    for current in messages:
        if not merged or not is_adjacent_messages(merged[-1], current):
            merged.append(current)
            continue
        previous = merged[-1]
        merged[-1] = dataclasses.replace(
            previous,
            text=merge_texts(previous.text, current.text),
        )
    return merged
|