1
0

Restructure by moving data structures and libpurple parsing into their own modules
All checks were successful
Run Python tests (through Pytest) / Test (push) Successful in 25s
Verify Python project can be installed, loaded and have version checked / Test (push) Successful in 22s

This commit is contained in:
Jon Michael Aanes 2024-10-31 18:58:55 +01:00
parent 68b2d9fff2
commit 91ca8d66d8
Signed by: Jmaa
SSH Key Fingerprint: SHA256:Ab0GfHGCblESJx7JRE4fj4bFy/KRpeLhi41y4pF3sNA
5 changed files with 116 additions and 102 deletions

View File

@ -30,29 +30,15 @@ import datetime
import logging import logging
import re import re
from collections.abc import Iterable, Iterator from collections.abc import Iterable, Iterator
from pathlib import Path
import bs4
from ._version import __version__ from ._version import __version__
from .data import Message
__all__ = ['__version__'] __all__ = ['__version__']
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@dataclasses.dataclass(frozen=True, order=True)
class Message:
    """A single chat message.

    Instances are immutable and compare field by field in declaration
    order, so sorting messages orders them by ``sent_at`` first.
    """

    sent_at: datetime.datetime  # Moment the message was sent.
    sender: str  # Name of whoever sent the message.
    text: str  # Body of the message.

    def __post_init__(self):
        # Every declared field is mandatory: None is rejected for each one.
        for field in dataclasses.fields(self):
            assert getattr(self, field.name) is not None
def datetime_sent( def datetime_sent(
chat_start: datetime.datetime, chat_start: datetime.datetime,
message_sent: datetime.time, message_sent: datetime.time,
@ -67,89 +53,6 @@ def datetime_sent(
return naive return naive
def parse_timestamp(c) -> datetime.time:
    """Extract the ``(HH:MM:SS)`` timestamp that starts a log line element.

    Args:
        c: Element whose ``get_text()`` begins with a parenthesised
            ``(hours:minutes:seconds)`` timestamp.

    Returns:
        The time of day given by the timestamp.

    Raises:
        ValueError: If the text does not start with such a timestamp, or if
            the numbers are out of range for a time of day.
    """
    # The text is read from the element itself, not from a nested <font>.
    text = c.get_text()
    m = re.match(r'\((\d+):(\d+):(\d+)\)', text)
    if m is None:
        # Fail with a descriptive error instead of an AttributeError on None.
        msg = f'Expected "(HH:MM:SS)" timestamp at start of {text!r}'
        raise ValueError(msg)
    hour, minute, second = (int(part) for part in m.groups())
    return datetime.time(hour, minute, second)
def parse_messages_in_chat_file(path: Path) -> list[Message]:
    """Parse one HTML chat log file into a list of messages.

    The start of the chat is taken from the file name: the stem, with a
    trailing ``CEST`` or ``CET`` removed, is parsed as an ISO date-time.
    Each log line only carries a time of day, which is combined with that
    chat start through ``datetime_sent``.

    Args:
        path: Log file to parse.

    Returns:
        The messages in document order, or an empty list when the parsed
        document has no content.

    Raises:
        AssertionError: On an element type that is not handled, or when a
            second sender shows up before the current line has ended.
    """
    logger.info('Parsing %s', path)
    chat_start = datetime.datetime.fromisoformat(
        path.stem.removesuffix('CEST').removesuffix('CET'),
    )

    with open(path) as f:
        soup = bs4.BeautifulSoup(f, 'lxml')
    if len(soup.contents) == 0:
        logger.warning('File is empty?')
        return []

    messages = []

    # State of the log line currently being assembled; reset at every <br>.
    cur_sent_at: datetime.datetime | None = None
    cur_sender: str | None = None
    cur_text: str = ''

    # The log lines are either wrapped in a single <p> or sit directly in
    # <body>.
    if soup.body.p:
        loglines = soup.body.p.children
    else:
        loglines = soup.body.children

    for c in loglines:
        # Order matters: the first <font>/<span> of a line is its header;
        # later ones fall through to the text branch below.
        if c.name in {'font', 'span'} and cur_sent_at is None:
            # Get timestamp
            cur_sent_at = datetime_sent(chat_start, parse_timestamp(c))
            # Get sender
            if c.b:
                assert cur_sender is None
                cur_sender = (
                    c.b.get_text()
                    .strip()
                    .removesuffix(':')
                    .removeprefix('***')
                    .removesuffix('[m]')
                )
        elif c.name in {None, 'span', 'font'}:
            # Plain text node (name is None) or inline formatting.
            cur_text += c.get_text()
        elif c.name == 'a':
            # Links are kept as their target, wrapped in angle brackets.
            cur_text += '<' + c['href'] + '>'
        elif c.name == 'br':
            # End of a log line: emit it only if a sender was found.
            if cur_sender:
                messages.append(Message(cur_sent_at, cur_sender, cur_text.strip()))
            cur_sent_at = None
            cur_sender = None
            cur_text = ''
        elif c.name == 'b':
            # Indicates system message. Ignore
            pass
        elif c.name in {'h1', 'h3'}:
            pass  # Ignore log header
        else:
            assert False, c

    return messages
def parse_messages_in_chat_folder(chat_folder_path: Path) -> list[Message]:
    """Parse every entry of a chat folder and return all messages sorted.

    Entries are visited in sorted path order and each one is handed to
    ``parse_messages_in_chat_file``; the combined result is then sorted
    using the messages' own ordering.
    """
    collected = [
        message
        for chat_file in sorted(chat_folder_path.iterdir())
        for message in parse_messages_in_chat_file(chat_file)
    ]
    return sorted(collected)
def format_message_as_citation(out: list[str], msg: Message) -> None: def format_message_as_citation(out: list[str], msg: Message) -> None:
out.append(f'{msg.sent_at.date()} {msg.sent_at.time()} [[{msg.sender}]]:') out.append(f'{msg.sent_at.date()} {msg.sent_at.time()} [[{msg.sender}]]:')
out.append('\n') out.append('\n')
@ -220,7 +123,7 @@ def merge_texts(text1: str, text2: str) -> str:
def merge_adjacent_messages(messages: Iterable[Message]) -> list[Message]: def merge_adjacent_messages(messages: Iterable[Message]) -> list[Message]:
out = [] out: list[Message] = []
for msg in messages: for msg in messages:
if out and is_adjacent_messages(out[-1], msg): if out and is_adjacent_messages(out[-1], msg):
out[-1] = dataclasses.replace( out[-1] = dataclasses.replace(

View File

@ -3,12 +3,12 @@ import logging
from pathlib import Path from pathlib import Path
from . import ( from . import (
Message,
filter_useless_messages, filter_useless_messages,
format_messages, format_messages,
libpurple,
merge_adjacent_messages, merge_adjacent_messages,
parse_messages_in_chat_folder,
) )
from .data import Message
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -37,7 +37,7 @@ def main():
server = args.path.parent.name server = args.path.parent.name
receipient = args.path.name receipient = args.path.name
all_messages = parse_messages_in_chat_folder(args.path) all_messages = libpurple.parse_messages_in_chat_folder(args.path)
all_messages = filter_useless_messages(all_messages) all_messages = filter_useless_messages(all_messages)
all_messages = merge_adjacent_messages(all_messages) all_messages = merge_adjacent_messages(all_messages)

View File

@ -0,0 +1,14 @@
import dataclasses
import datetime
@dataclasses.dataclass(frozen=True, order=True)
class Message:
    """A single chat message.

    Instances are immutable and compare field by field in declaration
    order, so sorting messages orders them by ``sent_at`` first.
    """

    sent_at: datetime.datetime  # Moment the message was sent.
    sender: str  # Name of whoever sent the message.
    text: str  # Body of the message.

    def __post_init__(self):
        # Every declared field is mandatory: None is rejected for each one.
        for field in dataclasses.fields(self):
            assert getattr(self, field.name) is not None

View File

@ -0,0 +1,92 @@
import datetime
import logging
import re
from pathlib import Path

import bs4

from .data import Message
logger = logging.getLogger(__name__)
def parse_timestamp(c) -> datetime.time:
    """Extract the ``(HH:MM:SS)`` timestamp that starts a log line element.

    Args:
        c: Element whose ``get_text()`` begins with a parenthesised
            ``(hours:minutes:seconds)`` timestamp.

    Returns:
        The time of day given by the timestamp.

    Raises:
        ValueError: If the text does not start with such a timestamp, or if
            the numbers are out of range for a time of day.
    """
    # The text is read from the element itself, not from a nested <font>.
    text = c.get_text()
    m = re.match(r'\((\d+):(\d+):(\d+)\)', text)
    if m is None:
        # Fail with a descriptive error instead of an AttributeError on None.
        msg = f'Expected "(HH:MM:SS)" timestamp at start of {text!r}'
        raise ValueError(msg)
    hour, minute, second = (int(part) for part in m.groups())
    return datetime.time(hour, minute, second)
def parse_messages_in_chat_file(path: Path) -> list[Message]:
    """Parse one HTML chat log file into a list of messages.

    The start of the chat is taken from the file name: the stem, with a
    trailing ``CEST`` or ``CET`` removed, is parsed as an ISO date-time.
    Each log line only carries a time of day, which is combined with that
    chat start through ``datetime_sent``.

    Args:
        path: Log file to parse.

    Returns:
        The messages in document order, or an empty list when the parsed
        document has no content.

    Raises:
        AssertionError: On an element type that is not handled, or when a
            second sender shows up before the current line has ended.
    """
    # ``datetime_sent`` is defined in the package ``__init__``, not in this
    # module.  It is imported at call time so that this module never takes
    # part in an import cycle with the package.
    from . import datetime_sent

    logger.info('Parsing %s', path)
    chat_start = datetime.datetime.fromisoformat(
        path.stem.removesuffix('CEST').removesuffix('CET'),
    )

    with open(path) as f:
        soup = bs4.BeautifulSoup(f, 'lxml')
    if not soup.contents:
        logger.warning('File is empty?')
        return []

    messages: list[Message] = []

    # State of the log line currently being assembled; reset at every <br>.
    cur_sent_at: datetime.datetime | None = None
    cur_sender: str | None = None
    cur_text: str = ''

    # The log lines are either wrapped in a single <p> or sit directly in
    # <body>.
    if soup.body.p:
        loglines = soup.body.p.children
    else:
        loglines = soup.body.children

    for c in loglines:
        # Order matters: the first <font>/<span> of a line is its header;
        # later ones fall through to the text branch below.
        if c.name in {'font', 'span'} and cur_sent_at is None:
            # Get timestamp
            cur_sent_at = datetime_sent(chat_start, parse_timestamp(c))
            # Get sender
            if c.b:
                assert cur_sender is None
                cur_sender = (
                    c.b.get_text()
                    .strip()
                    .removesuffix(':')
                    .removeprefix('***')
                    .removesuffix('[m]')
                )
        elif c.name in {None, 'span', 'font'}:
            # Plain text node (name is None) or inline formatting.
            cur_text += c.get_text()
        elif c.name == 'a':
            # Links are kept as their target, wrapped in angle brackets.
            cur_text += '<' + c['href'] + '>'
        elif c.name == 'br':
            # End of a log line: emit it only if a sender was found.
            if cur_sender:
                messages.append(Message(cur_sent_at, cur_sender, cur_text.strip()))
            cur_sent_at = None
            cur_sender = None
            cur_text = ''
        elif c.name == 'b':
            # Indicates system message. Ignore
            pass
        elif c.name in {'h1', 'h3'}:
            pass  # Ignore log header
        else:
            # Raised explicitly so the check is not stripped under ``-O``.
            raise AssertionError(c)

    return messages
def parse_messages_in_chat_folder(chat_folder_path: Path) -> list[Message]:
    """Parse every entry of a chat folder and return all messages sorted.

    Entries are visited in sorted path order and each one is handed to
    ``parse_messages_in_chat_file``; the combined result is then sorted
    using the messages' own ordering.
    """
    collected = [
        message
        for chat_file in sorted(chat_folder_path.iterdir())
        for message in parse_messages_in_chat_file(chat_file)
    ]
    return sorted(collected)

View File

@ -3,3 +3,8 @@ import libpurple_to_markdown
def test_version(): def test_version():
assert libpurple_to_markdown.__version__ is not None assert libpurple_to_markdown.__version__ is not None
def test_load():
    """Check that the ``data`` and ``libpurple`` submodules can be imported."""
    # Importing is the whole test; nothing from the modules is used afterwards.
    import libpurple_to_markdown.data
    import libpurple_to_markdown.libpurple  # noqa: F401