1
0
libpurple-to-markdown/libpurple_to_markdown/__init__.py

136 lines
3.7 KiB
Python
Raw Normal View History

2024-10-26 13:17:31 +00:00
"""Libpurple to markdown conversion script.
Conversion script for HTML-based logs from [Pidgin/Libpurple](https://pidgin.im/) chat program.
**This is a one-off script, and is not actively maintained.**
## Motivation
I recently discovered [Obsidian](https://obsidian.md) and liked the prospect of
cross-referencing my notes with my old chat logs. Libpurple uses HTML logs if
you haven't configured it to something else (which I hadn't).
I no longer use IRC or Pidgin as my entire friend group have switched to using
Matrix.
## Usage
From the repository root:
```bash
python -m libpurple_to_markdown LOG_DIRECTORY --output OUTPUT_FOLDER
```
It was made specifically for import into Obsidian, so it might not suit your
purposes, but it shouldn't be too difficult to adjust.
"""
2024-10-26 13:17:46 +00:00
2024-10-25 23:12:30 +00:00
import dataclasses
import datetime
2024-10-26 11:47:39 +00:00
import logging
import re
2024-10-26 13:17:46 +00:00
from collections.abc import Iterable, Iterator
2024-10-25 23:12:30 +00:00
2024-10-26 11:47:23 +00:00
from ._version import __version__
from .data import Message
2024-10-26 11:47:39 +00:00
2024-10-26 11:47:23 +00:00
# Names exported on a star-import of this package: only the version string.
__all__ = ['__version__']
2024-10-25 23:12:30 +00:00
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
2024-10-26 11:47:39 +00:00
def datetime_sent(
    chat_start: datetime.datetime,
    message_sent: datetime.time,
) -> datetime.datetime:
    """Resolve a time-of-day taken from a chat log into a full timestamp.

    ``message_sent`` is placed on the calendar date of ``chat_start`` and
    given ``chat_start``'s ``tzinfo``. If the chat started later in the day
    than ``message_sent``, the message is treated as sent after midnight and
    the result is shifted one day forward.
    """
    on_start_day = datetime.datetime.combine(
        chat_start.date(), message_sent, chat_start.tzinfo,
    )
    # A clock time earlier than the chat's start means the day rolled over.
    if chat_start.time() > message_sent:
        return on_start_day + datetime.timedelta(days=1)
    return on_start_day
2024-10-26 11:47:39 +00:00
2024-10-26 12:53:31 +00:00
def format_message_as_citation(out: list[str], msg: Message) -> None:
    """Append ``msg`` to ``out`` as a Markdown block quote.

    A header line with the date, time and wiki-linked sender comes first.
    Each line of the message text follows, prefixed with ``> ``, and a blank
    line closes the entry. Angle-bracketed words such as ``<tag>`` and the
    sequence ``$$$`` are wrapped in backticks.
    """
    stamp = msg.sent_at
    out.extend((f'{stamp.date()} {stamp.time()} [[{msg.sender}]]:', '\n'))
    for raw_line in msg.text.split('\n'):
        tagged = re.sub(r'(<[\w ]+>)', r'`\1`', raw_line)
        quoted = re.sub(r'(\$\$\$)', r'`\1`', tagged)
        out.append(f'> {quoted}\n')
    out.append('\n')
2024-10-26 11:47:39 +00:00
2024-10-26 12:53:31 +00:00
def format_message_as_table(out: list[str], msg: Message) -> None:
    """Append ``msg`` to ``out`` as a single Markdown table row.

    The row has three cells: timestamp, wiki-linked sender and text. The
    text is split on newlines and the pieces are appended back to back, so
    a multi-line message ends up on one row with no separator added.
    """
    out.append(f'| {msg.sent_at} | [[{msg.sender}]] | ')
    out.extend(msg.text.split('\n'))
    out.append('|\n')
2024-10-26 11:47:39 +00:00
2024-10-26 12:53:31 +00:00
def format_messages(
    messages: list[Message],
    title: str,
    *,
    as_table: bool = False,
) -> str:
    """Render ``messages`` as one Markdown document.

    The document starts with ``title`` as a level-1 heading. Whenever the
    calendar date of a message differs from that of the message before it
    (and for the very first message), a horizontal rule and a level-2
    heading with the wiki-linked date are emitted.

    Args:
        messages: Messages in the order they should appear. May be empty,
            in which case only the title heading is returned.
        title: Text of the top-level heading.
        as_table: When true, each date section is a Markdown table built
            with ``format_message_as_table``; otherwise (the default) each
            message is rendered with ``format_message_as_citation``.

    Returns:
        The complete Markdown text.
    """
    out = ['# ', title, '\n\n']
    previous_date = None
    for msg in messages:
        current_date = msg.sent_at.date()
        # ``previous_date`` starts as None, so the first message always
        # opens a new date section.
        if current_date != previous_date:
            out.append('---\n')
            out.append(f'## [[{current_date}]]\n\n')
            if as_table:
                out.append('| sent at | sender | text |\n')
                out.append('| ------- | ------ | ---- |\n')
            previous_date = current_date
        if as_table:
            format_message_as_table(out, msg)
        else:
            format_message_as_citation(out, msg)
    # No ``del msg`` here: the loop variable is unbound for empty input and
    # deleting it would raise UnboundLocalError.
    return ''.join(out)
2024-10-26 11:47:39 +00:00
2024-10-25 23:12:30 +00:00
# Largest time gap between two messages from the same sender for which
# ``is_adjacent_messages`` still reports them as adjacent.
MSG_ADJACENTCY_DIST = datetime.timedelta(minutes=2)
2024-10-26 13:17:46 +00:00
2024-10-26 12:53:31 +00:00
def is_useless_message(msg: Message) -> bool:
    """Tell whether ``msg`` should be left out of the converted log.

    A message is useless when its sender ends with ``<AUTO-REPLY>`` or when
    its sender is the empty string.
    """
    sender = msg.sender
    if sender.endswith('<AUTO-REPLY>'):
        return True
    return sender == ''
2024-10-26 11:47:39 +00:00
2024-10-26 13:17:46 +00:00
2024-10-26 12:53:31 +00:00
def filter_useless_messages(messages: Iterable[Message]) -> Iterator[Message]:
    """Lazily yield the messages that ``is_useless_message`` does not reject.

    The text of every rejected message is reported through the module
    logger at INFO level instead of being printed, so standard output stays
    clean and the caller's logging configuration decides whether the
    dropped messages are shown.
    """
    for msg in messages:
        if is_useless_message(msg):
            logger.info('Dropping useless message: %s', msg.text)
            continue
        yield msg
2024-10-26 13:17:46 +00:00
2024-10-26 12:53:31 +00:00
def is_adjacent_messages(first: Message, second: Message) -> bool:
    """Tell whether ``second`` follows ``first`` closely enough to be merged.

    Both messages must have the same sender, and ``second`` must have been
    sent no more than ``MSG_ADJACENTCY_DIST`` after ``first``. The gap is not
    checked for being positive, so a ``second`` that is timestamped before
    ``first`` also counts as adjacent.
    """
    if first.sender != second.sender:
        return False
    gap = second.sent_at - first.sent_at
    return gap <= MSG_ADJACENTCY_DIST
2024-10-25 23:12:30 +00:00
2024-10-26 00:08:45 +00:00
def merge_texts(text1: str, text2: str) -> str:
    """Join two message texts into one, placing ``text2`` on a new line.

    If ``text1`` already ends with one of ``.``, ``?``, ``!``, ``,`` or
    ``:``, only a space is added before the line break. Otherwise ``'. '``
    is added so the first text reads as a finished sentence.

    Args:
        text1: Text of the earlier message.
        text2: Text of the later message.

    Returns:
        The combined text.
    """
    # ``str.endswith`` needs a tuple to test alternative suffixes; a single
    # string only matches when the text ends with all five characters in a
    # row.
    punctuated = text1.endswith(('.', '?', '!', ',', ':'))
    return text1 + (' ' if punctuated else '. ') + '\n' + text2
2024-10-26 11:47:39 +00:00
2024-10-26 12:53:31 +00:00
def merge_adjacent_messages(messages: Iterable[Message]) -> list[Message]:
    """Collapse runs of adjacent messages into single messages.

    Each incoming message is compared, via ``is_adjacent_messages``, with
    the last message collected so far. When they are adjacent, the last
    message is replaced by a copy whose text is the ``merge_texts``
    combination of both. All other fields, including the timestamp, stay
    those of the earlier message. Otherwise the incoming message is
    appended unchanged.
    """
    merged: list[Message] = []
    for current in messages:
        if not merged or not is_adjacent_messages(merged[-1], current):
            merged.append(current)
            continue
        previous = merged[-1]
        combined_text = merge_texts(previous.text, current.text)
        merged[-1] = dataclasses.replace(previous, text=combined_text)
    return merged