1
0
This commit is contained in:
Jon Michael Aanes 2024-10-26 13:47:39 +02:00
parent 241c4f92c5
commit 4afb0f1364
Signed by: Jmaa
SSH Key Fingerprint: SHA256:Ab0GfHGCblESJx7JRE4fj4bFy/KRpeLhi41y4pF3sNA
3 changed files with 43 additions and 24 deletions

View File

@ -1,15 +1,18 @@
import dataclasses import dataclasses
import re
import datetime import datetime
from pathlib import Path
import bs4
import logging import logging
import re
from pathlib import Path
import bs4
from ._version import __version__ from ._version import __version__
__all__ = ['__version__'] __all__ = ['__version__']
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@dataclasses.dataclass(frozen=True, order=True) @dataclasses.dataclass(frozen=True, order=True)
class Message: class Message:
sent_at: datetime.datetime sent_at: datetime.datetime
@ -21,12 +24,18 @@ class Message:
assert self.sender is not None assert self.sender is not None
assert self.text is not None assert self.text is not None
def datetime_sent(
    chat_start: datetime.datetime,
    message_sent: datetime.time,
) -> datetime.datetime:
    """Turn a message's time-of-day into a full datetime.

    The date and tzinfo are taken from ``chat_start`` and combined with
    ``message_sent``. When ``message_sent`` is earlier in the day than the
    time-of-day of ``chat_start``, the result is shifted back by one day.

    NOTE(review): the shift goes to the *previous* day; confirm that this is
    the intended direction for chats that cross midnight.
    """
    sent = datetime.datetime.combine(
        chat_start.date(),
        message_sent,
        chat_start.tzinfo,
    )
    if message_sent < chat_start.time():
        sent -= datetime.timedelta(days=1)
    return sent
def parse_timestamp(c) -> datetime.time: def parse_timestamp(c) -> datetime.time:
timestamp_obj = c timestamp_obj = c
if c.font is not None: if c.font is not None:
@ -34,9 +43,12 @@ def parse_timestamp(c) -> datetime.time:
m = re.match(r'\((\d+):(\d+):(\d+)\)', timestamp_obj.get_text()) m = re.match(r'\((\d+):(\d+):(\d+)\)', timestamp_obj.get_text())
return datetime.time(int(m.group(1)), int(m.group(2)), int(m.group(3))) return datetime.time(int(m.group(1)), int(m.group(2)), int(m.group(3)))
def parse_messages_in_chat_file(path: Path) -> list[Message]: def parse_messages_in_chat_file(path: Path) -> list[Message]:
logger.info('Parsing %s', path) logger.info('Parsing %s', path)
chat_start = datetime.datetime.fromisoformat(path.stem.removesuffix('CEST').removesuffix('CET')) chat_start = datetime.datetime.fromisoformat(
path.stem.removesuffix('CEST').removesuffix('CET'),
)
with open(path) as f: with open(path) as f:
soup = bs4.BeautifulSoup(f, 'lxml') soup = bs4.BeautifulSoup(f, 'lxml')
@ -58,7 +70,7 @@ def parse_messages_in_chat_file(path: Path) -> list[Message]:
loglines = soup.body.children loglines = soup.body.children
for c in loglines: for c in loglines:
if c.name in {'font','span'} and cur_sent_at is None: if c.name in {'font', 'span'} and cur_sent_at is None:
# Get timestamp # Get timestamp
cur_sent_at = datetime_sent(chat_start, parse_timestamp(c)) cur_sent_at = datetime_sent(chat_start, parse_timestamp(c))
@ -67,7 +79,7 @@ def parse_messages_in_chat_file(path: Path) -> list[Message]:
assert cur_sender is None assert cur_sender is None
cur_sender = c.b.get_text().strip().removesuffix(':') cur_sender = c.b.get_text().strip().removesuffix(':')
elif c.name in {None,'span','font'}: elif c.name in {None, 'span', 'font'}:
cur_text += c.get_text() cur_text += c.get_text()
elif c.name == 'a': elif c.name == 'a':
@ -86,15 +98,12 @@ def parse_messages_in_chat_file(path: Path) -> list[Message]:
elif c.name in {'h1', 'h3'}: elif c.name in {'h1', 'h3'}:
pass # Ignore log header pass # Ignore log header
else: else:
assert False, c assert False, c
return messages return messages
def parse_messages_in_chat_folder(chat_folder_path: Path) -> list[Message]: def parse_messages_in_chat_folder(chat_folder_path: Path) -> list[Message]:
messages = [] messages = []
for file_path in chat_folder_path.iterdir(): for file_path in chat_folder_path.iterdir():
@ -112,6 +121,7 @@ def format_message_as_citation(out, msg):
del line del line
out.append('\n') out.append('\n')
def format_message_as_table(out, msg): def format_message_as_table(out, msg):
out.append(f'| {msg.sent_at} | [[{msg.sender}]] | ') out.append(f'| {msg.sent_at} | [[{msg.sender}]] | ')
for line in msg.text.split('\n'): for line in msg.text.split('\n'):
@ -119,17 +129,14 @@ def format_message_as_table(out, msg):
del line del line
out.append('|\n') out.append('|\n')
def format_messages(messages: list[Message]) -> str: def format_messages(messages: list[Message]) -> str:
out = [ out = ['# Chat 2018' '\n\n']
'# Chat 2018'
'\n\n'
]
as_table = False as_table = False
for msg_idx, msg in enumerate(messages): for msg_idx, msg in enumerate(messages):
if msg_idx == 0 or messages[msg_idx-1].sent_at.date() != msg.sent_at.date(): if msg_idx == 0 or messages[msg_idx - 1].sent_at.date() != msg.sent_at.date():
out.append('---\n') out.append('---\n')
out.append(f'## [[{msg.sent_at.date()}]]\n\n') out.append(f'## [[{msg.sent_at.date()}]]\n\n')
if as_table: if as_table:
@ -144,21 +151,30 @@ def format_messages(messages: list[Message]) -> str:
return ''.join(out) return ''.join(out)
# Largest gap between two messages that still counts as "adjacent".
MSG_ADJACENTCY_DIST = datetime.timedelta(minutes=2)


def is_adjacent_messages(first, second):
    """Tell whether ``second`` directly continues ``first``.

    Two messages are adjacent when they have the same ``sender`` and
    ``second.sent_at`` is at most ``MSG_ADJACENTCY_DIST`` after
    ``first.sent_at``.
    """
    if first.sender != second.sender:
        return False
    gap = second.sent_at - first.sent_at
    return gap <= MSG_ADJACENTCY_DIST
def merge_texts(text1: str, text2: str) -> str:
    """Join the texts of two adjacent messages into one multi-line text.

    ``text1`` is terminated with ``'. '`` unless it already ends with one of
    the punctuation characters ``.``, ``?``, ``!``, ``,`` or ``:``, in which
    case only a space is appended. ``text2`` then follows on a new line.
    """
    # str.endswith needs a tuple to test several alternative suffixes; a
    # single string such as '.?!,:' would only match that exact ending.
    punctuated = text1.endswith(('.', '?', '!', ',', ':'))
    return text1 + (' ' if punctuated else '. ') + '\n' + text2
def merge_adjacent_messages(messages: list[Message]) -> list[Message]:
    """Collapse runs of adjacent messages into single messages.

    Walks ``messages`` in order. A message that ``is_adjacent_messages``
    reports as adjacent to the last merged message is folded into it: the
    texts are joined with ``merge_texts`` and every other field is kept from
    the earlier message. Any other message starts a new entry.
    """
    merged = []
    for current in messages:
        previous = merged[-1] if merged else None
        if previous is None or not is_adjacent_messages(previous, current):
            merged.append(current)
            continue
        combined_text = merge_texts(previous.text, current.text)
        merged[-1] = dataclasses.replace(previous, text=combined_text)
    return merged

View File

@ -2,14 +2,15 @@ import argparse
import logging import logging
from pathlib import Path from pathlib import Path
from . import (parse_messages_in_chat_folder, merge_adjacent_messages, from . import format_messages, merge_adjacent_messages, parse_messages_in_chat_folder
format_messages)
def parse_args():
    """Parse the command line.

    Returns the argparse namespace; its ``path`` attribute holds the single
    positional argument converted to a ``Path``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('path', type=Path)
    namespace = cli.parse_args()
    return namespace
def main(): def main():
logging.basicConfig() logging.basicConfig()
logging.getLogger().setLevel('INFO') logging.getLogger().setLevel('INFO')
@ -19,5 +20,6 @@ def main():
messages = merge_adjacent_messages(messages) messages = merge_adjacent_messages(messages)
print(format_messages(messages)) print(format_messages(messages))
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View File

@ -1,4 +1,5 @@
import libpurple_to_markdown import libpurple_to_markdown
def test_version():
    """The package must expose a non-None ``__version__``."""
    version = libpurple_to_markdown.__version__
    assert version is not None