1
0
libpurple-to-markdown/libpurple_to_markdown/__init__.py

135 lines
3.9 KiB
Python
Raw Normal View History

"""# Markdown Message Conversion.
2024-10-26 13:17:31 +00:00
Conversion script from various messaging formats to markdown.
2024-10-26 13:17:31 +00:00
Supported input formats:
- [Pidgin/Libpurple](https://pidgin.im/) chat program HTML-based logs. **This
backend is not actively maintained.**
- [SyncTech Backup & Restore](https://www.synctech.com.au/sms-backup-restore/)
XML-based backup format.
2024-10-26 13:17:31 +00:00
## Motivation
Messaging applications are mostly good at sending real-time messages to other
people, but they generally do not possess any useful archival features. Most
messages are write-once read-once, and the apps were built for this use case.
More and more, though, I am attracted to the prospect of archival; of
understanding who I am and who I _was_ when I wrote those messages.
2024-10-26 13:17:31 +00:00
I recently discovered [Obsidian](https://obsidian.md) and liked the prospect of
cross-referencing my notes with my old chat logs. Libpurple uses HTML logs if
you haven't configured it to something else (which I hadn't).
I no longer use IRC or Pidgin as my entire friend group have switched to using
Matrix.
## Usage
2024-11-17 11:04:29 +00:00
There are two main import patterns:
- One-off archival import: For when you have a large set of messages to import
for a service that you don't use very much anymore.
- Recurring import: For when you are still using the service, and want to
import on a recurring basis.
This program will by default not overwrite existing files, as the user might
have modified them.
Special consideration must be taken for recurring imports if you expect to be
modifying the resulting files, for example if you are inserting links
using Obsidian's unlinked mentions feature. You might want to use the
`--skip-this-period` flag to avoid importing the current period until it has
become the last period. That way you won't accidentally modify the log, because
it has been finalized.
### One-off
This is the recommended command for the one-off case:
2024-10-26 13:17:31 +00:00
```bash
python -m libpurple_to_markdown LOG_DIRECTORY --output OUTPUT_FOLDER
```
2024-11-17 11:04:29 +00:00
### Recurring
This is the recommended command for the recurring import case:
```bash
python -m libpurple_to_markdown LOG_DIRECTORY --output OUTPUT_FOLDER --skip-this-period --period month
```
2024-11-03 16:24:38 +00:00
## TODO
2024-11-17 11:04:29 +00:00
- [ ] SyncTech: Decode MMS parts and reconstruct image attachments.
2024-10-26 13:17:31 +00:00
"""
2024-10-26 13:17:46 +00:00
2024-10-25 23:12:30 +00:00
import dataclasses
import datetime
2024-10-26 11:47:39 +00:00
import logging
2024-10-26 13:17:46 +00:00
from collections.abc import Iterable, Iterator
2024-10-25 23:12:30 +00:00
2024-10-26 11:47:23 +00:00
from ._version import __version__
from .data import Message
2024-10-26 11:47:39 +00:00
2024-10-26 11:47:23 +00:00
# Names exported by `from libpurple_to_markdown import *`: only the version.
__all__ = ['__version__']
2024-10-25 23:12:30 +00:00
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
2024-10-26 11:47:39 +00:00
def datetime_sent(
    chat_start: datetime.datetime,
    message_sent: datetime.time,
) -> datetime.datetime:
    """Resolve a time of day into a full timestamp relative to a chat start.

    The result carries the date and ``tzinfo`` of ``chat_start`` and the
    wall-clock time of ``message_sent``. When that wall-clock time is earlier
    than the time of day at which the chat started, the message is taken to
    have been sent after midnight and the date is advanced by one day. Only a
    single day rollover is applied.

    Args:
        chat_start: Moment the chat began; supplies the date and timezone.
        message_sent: Time of day of the message. Any ``tzinfo`` it carries
            is ignored in favour of ``chat_start.tzinfo``.

    Returns:
        The combined datetime; naive or aware exactly as ``chat_start`` is.
    """
    # ``chat_start.time()`` is always naive, and ordering a naive time against
    # an aware one raises ``TypeError``, so work with wall-clock values only.
    wall_clock = message_sent.replace(tzinfo=None)
    sent = datetime.datetime.combine(
        chat_start.date(),
        wall_clock,
        chat_start.tzinfo,
    )
    if chat_start.time() > wall_clock:
        sent += datetime.timedelta(days=1)
    return sent
2024-10-26 11:47:39 +00:00
2024-10-25 23:12:30 +00:00
# Largest time gap allowed between two messages for `is_adjacent_messages` to
# still consider them adjacent.
MSG_ADJACENTCY_DIST = datetime.timedelta(minutes=2)
2024-10-26 13:17:46 +00:00
2024-10-26 12:53:31 +00:00
def is_useless_message(msg: Message) -> bool:
    """Return whether ``msg`` carries no content worth keeping.

    A message is useless when it has no sender, or when its sender ends with
    the ``<AUTO-REPLY>`` marker.
    """
    # Check for a missing sender first so that an empty (or absent) sender
    # never reaches the ``endswith`` call.
    return not msg.sender or msg.sender.endswith('<AUTO-REPLY>')
2024-10-26 11:47:39 +00:00
2024-10-26 13:17:46 +00:00
2024-10-26 12:53:31 +00:00
def filter_useless_messages(messages: Iterable[Message]) -> Iterator[Message]:
    """Lazily yield the messages that are not useless, preserving order.

    Messages rejected by ``is_useless_message`` are dropped. Their text is
    reported at debug level on the module logger rather than printed, so
    importing code keeps control over what reaches the console.
    """
    for msg in messages:
        if is_useless_message(msg):
            logger.debug('Dropping useless message: %s', msg.text)
            continue
        yield msg
2024-10-26 13:17:46 +00:00
2024-10-26 12:53:31 +00:00
def is_adjacent_messages(first: Message, second: Message) -> bool:
    """Return whether two messages are close enough to be merged.

    Two messages are adjacent when they share the same sender and their
    ``sent_at`` timestamps lie within ``MSG_ADJACENTCY_DIST`` of each other.
    """
    if first.sender != second.sender:
        return False
    # Use the absolute gap: with a plain difference, an out-of-order pair
    # yields a negative timedelta that is always below the threshold, no
    # matter how far apart the two messages really are.
    return abs(second.sent_at - first.sent_at) <= MSG_ADJACENTCY_DIST
2024-11-03 22:58:40 +00:00
# Characters that already terminate a message, so no '.' has to be appended
# before joining it with the next one.
PUNCTUATION = ('.', '?', '!', ',', ':', ';')


def merge_texts(text1: str, text2: str) -> str:
    """Join two message texts into one, separated by a newline.

    Trailing whitespace of ``text1`` is discarded. If what remains does not
    end in one of the ``PUNCTUATION`` characters, a ``.`` is appended before
    the newline. When ``text1`` is empty or whitespace-only there is nothing
    to terminate, so ``text2`` is returned unchanged.
    """
    head = text1.rstrip()
    if not head:
        # Avoid producing a stray leading '.' for an empty first text.
        return text2
    if not head.endswith(PUNCTUATION):
        head += '.'
    return head + '\n' + text2
2024-10-26 00:08:45 +00:00
2024-10-26 11:47:39 +00:00
2024-10-26 12:53:31 +00:00
def merge_adjacent_messages(messages: Iterable[Message]) -> list[Message]:
    """Collapse runs of adjacent messages into single messages.

    Each incoming message is compared with the last message collected so far
    using ``is_adjacent_messages``. If they are adjacent, the collected
    message is replaced by a copy whose text is extended with
    ``merge_texts``; otherwise the incoming message starts a new entry.

    Only ``text`` is replaced on a merge, so every other field of a merged
    message comes from the first message of its run.

    NOTE(review): as a consequence, adjacency is always measured against the
    first message of the current run, not the most recent one. Confirm that
    this is the intended behaviour for long bursts of messages.
    """
    merged: list[Message] = []
    for msg in messages:
        if merged and is_adjacent_messages(merged[-1], msg):
            previous = merged[-1]
            merged[-1] = dataclasses.replace(
                previous,
                text=merge_texts(previous.text, msg.text),
            )
        else:
            merged.append(msg)
    return merged