Restructure by moving data structures and libpurple parsing into their own modules
This commit is contained in:
parent
68b2d9fff2
commit
91ca8d66d8
|
@ -30,29 +30,15 @@ import datetime
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from collections.abc import Iterable, Iterator
|
from collections.abc import Iterable, Iterator
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import bs4
|
|
||||||
|
|
||||||
from ._version import __version__
|
from ._version import __version__
|
||||||
|
from .data import Message
|
||||||
|
|
||||||
__all__ = ['__version__']
|
__all__ = ['__version__']
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass(frozen=True, order=True)
|
|
||||||
class Message:
|
|
||||||
sent_at: datetime.datetime
|
|
||||||
sender: str
|
|
||||||
text: str
|
|
||||||
|
|
||||||
def __post_init__(self):
|
|
||||||
assert self.sent_at is not None
|
|
||||||
assert self.sender is not None
|
|
||||||
assert self.text is not None
|
|
||||||
|
|
||||||
|
|
||||||
def datetime_sent(
|
def datetime_sent(
|
||||||
chat_start: datetime.datetime,
|
chat_start: datetime.datetime,
|
||||||
message_sent: datetime.time,
|
message_sent: datetime.time,
|
||||||
|
@ -67,89 +53,6 @@ def datetime_sent(
|
||||||
return naive
|
return naive
|
||||||
|
|
||||||
|
|
||||||
def parse_timestamp(c) -> datetime.time:
|
|
||||||
timestamp_obj = c
|
|
||||||
if c.font is not None:
|
|
||||||
c = c.font
|
|
||||||
m = re.match(r'\((\d+):(\d+):(\d+)\)', timestamp_obj.get_text())
|
|
||||||
return datetime.time(int(m.group(1)), int(m.group(2)), int(m.group(3)))
|
|
||||||
|
|
||||||
|
|
||||||
def parse_messages_in_chat_file(path: Path) -> list[Message]:
|
|
||||||
logger.info('Parsing %s', path)
|
|
||||||
chat_start = datetime.datetime.fromisoformat(
|
|
||||||
path.stem.removesuffix('CEST').removesuffix('CET'),
|
|
||||||
)
|
|
||||||
|
|
||||||
with open(path) as f:
|
|
||||||
soup = bs4.BeautifulSoup(f, 'lxml')
|
|
||||||
|
|
||||||
if len(soup.contents) == 0:
|
|
||||||
logger.warning('File is empty?')
|
|
||||||
return []
|
|
||||||
|
|
||||||
messages = []
|
|
||||||
|
|
||||||
cur_sent_at: datetime.datetime | None = None
|
|
||||||
cur_sender: str | None = None
|
|
||||||
cur_text: str = ''
|
|
||||||
|
|
||||||
if soup.body.p:
|
|
||||||
loglines = soup.body.p.children
|
|
||||||
else:
|
|
||||||
loglines = soup.body.children
|
|
||||||
|
|
||||||
for c in loglines:
|
|
||||||
if c.name in {'font', 'span'} and cur_sent_at is None:
|
|
||||||
# Get timestamp
|
|
||||||
cur_sent_at = datetime_sent(chat_start, parse_timestamp(c))
|
|
||||||
|
|
||||||
# Get sender
|
|
||||||
if c.b:
|
|
||||||
assert cur_sender is None
|
|
||||||
cur_sender = (
|
|
||||||
c.b.get_text()
|
|
||||||
.strip()
|
|
||||||
.removesuffix(':')
|
|
||||||
.removeprefix('***')
|
|
||||||
.removesuffix('[m]')
|
|
||||||
)
|
|
||||||
|
|
||||||
elif c.name in {None, 'span', 'font'}:
|
|
||||||
cur_text += c.get_text()
|
|
||||||
|
|
||||||
elif c.name == 'a':
|
|
||||||
cur_text += '<' + c['href'] + '>'
|
|
||||||
|
|
||||||
elif c.name == 'br':
|
|
||||||
if cur_sender:
|
|
||||||
messages.append(Message(cur_sent_at, cur_sender, cur_text.strip()))
|
|
||||||
cur_sent_at = None
|
|
||||||
cur_sender = None
|
|
||||||
cur_text = ''
|
|
||||||
|
|
||||||
elif c.name == 'b':
|
|
||||||
# Indicates system message. Ignore
|
|
||||||
pass
|
|
||||||
|
|
||||||
elif c.name in {'h1', 'h3'}:
|
|
||||||
pass # Ignore log header
|
|
||||||
|
|
||||||
else:
|
|
||||||
assert False, c
|
|
||||||
|
|
||||||
return messages
|
|
||||||
|
|
||||||
|
|
||||||
def parse_messages_in_chat_folder(chat_folder_path: Path) -> list[Message]:
|
|
||||||
messages = []
|
|
||||||
for file_path in sorted(chat_folder_path.iterdir()):
|
|
||||||
messages.extend(parse_messages_in_chat_file(file_path))
|
|
||||||
|
|
||||||
messages.sort()
|
|
||||||
return messages
|
|
||||||
|
|
||||||
|
|
||||||
def format_message_as_citation(out: list[str], msg: Message) -> None:
|
def format_message_as_citation(out: list[str], msg: Message) -> None:
|
||||||
out.append(f'{msg.sent_at.date()} {msg.sent_at.time()} [[{msg.sender}]]:')
|
out.append(f'{msg.sent_at.date()} {msg.sent_at.time()} [[{msg.sender}]]:')
|
||||||
out.append('\n')
|
out.append('\n')
|
||||||
|
@ -220,7 +123,7 @@ def merge_texts(text1: str, text2: str) -> str:
|
||||||
|
|
||||||
|
|
||||||
def merge_adjacent_messages(messages: Iterable[Message]) -> list[Message]:
|
def merge_adjacent_messages(messages: Iterable[Message]) -> list[Message]:
|
||||||
out = []
|
out: list[Message] = []
|
||||||
for msg in messages:
|
for msg in messages:
|
||||||
if out and is_adjacent_messages(out[-1], msg):
|
if out and is_adjacent_messages(out[-1], msg):
|
||||||
out[-1] = dataclasses.replace(
|
out[-1] = dataclasses.replace(
|
||||||
|
|
|
@ -3,12 +3,12 @@ import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from . import (
|
from . import (
|
||||||
Message,
|
|
||||||
filter_useless_messages,
|
filter_useless_messages,
|
||||||
format_messages,
|
format_messages,
|
||||||
|
libpurple,
|
||||||
merge_adjacent_messages,
|
merge_adjacent_messages,
|
||||||
parse_messages_in_chat_folder,
|
|
||||||
)
|
)
|
||||||
|
from .data import Message
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -37,7 +37,7 @@ def main():
|
||||||
server = args.path.parent.name
|
server = args.path.parent.name
|
||||||
receipient = args.path.name
|
receipient = args.path.name
|
||||||
|
|
||||||
all_messages = parse_messages_in_chat_folder(args.path)
|
all_messages = libpurple.parse_messages_in_chat_folder(args.path)
|
||||||
all_messages = filter_useless_messages(all_messages)
|
all_messages = filter_useless_messages(all_messages)
|
||||||
all_messages = merge_adjacent_messages(all_messages)
|
all_messages = merge_adjacent_messages(all_messages)
|
||||||
|
|
||||||
|
|
14
libpurple_to_markdown/data.py
Normal file
14
libpurple_to_markdown/data.py
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
import dataclasses
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
|
||||||
|
@dataclasses.dataclass(frozen=True, order=True)
class Message:
    """A single chat message.

    Instances are immutable. They compare and sort field by field in
    declaration order, i.e. by ``(sent_at, sender, text)``.

    Raises:
        TypeError: If any field is ``None`` on construction.
    """

    sent_at: datetime.datetime  # When the message was sent.
    sender: str  # Name of the sender.
    text: str  # Message body.

    def __post_init__(self) -> None:
        # Explicit check instead of ``assert`` so the validation is not
        # stripped when Python runs with ``-O``.
        for field in dataclasses.fields(self):
            if getattr(self, field.name) is None:
                raise TypeError(f'Message.{field.name} must not be None')
|
92
libpurple_to_markdown/libpurple.py
Normal file
92
libpurple_to_markdown/libpurple.py
Normal file
|
@ -0,0 +1,92 @@
|
||||||
|
import datetime
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import bs4
|
||||||
|
|
||||||
|
from .data import Message
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_timestamp(c) -> datetime.time:
    """Extract the ``(H:M:S)`` timestamp at the start of a log element.

    Args:
        c: Element whose ``get_text()`` result starts with a timestamp of
            the form ``(H:M:S)``. Only ``get_text()`` is used; presumably a
            bs4 tag in practice.

    Returns:
        The timestamp as a naive ``datetime.time``.

    Raises:
        ValueError: If the text does not start with ``(H:M:S)``, or the
            numbers do not form a valid time of day.
    """
    # Imported locally so the function does not depend on a module-level
    # ``import re`` being present.
    import re

    text = c.get_text()
    m = re.match(r'\((\d+):(\d+):(\d+)\)', text)
    if m is None:
        # Fail with a descriptive error rather than ``None.group``.
        raise ValueError(f'No (H:M:S) timestamp at start of {text!r}')
    hour, minute, second = (int(group) for group in m.groups())
    return datetime.time(hour, minute, second)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_messages_in_chat_file(path: Path) -> list[Message]:
    """Parse one libpurple HTML chat log into a list of messages.

    The start of the chat is read from the file name: the stem, minus an
    optional ``CEST``/``CET`` suffix, must be an ISO timestamp. Each log line
    is expected to begin with a ``font``/``span`` element holding the
    timestamp and, in bold, the sender; the following nodes form the message
    text, and ``<br>`` ends the line.

    Args:
        path: Path of the log file to parse.

    Returns:
        The messages in file order. Lines for which no sender was found
        (e.g. system messages) are dropped. An empty list is returned when
        the parsed document has no content.

    Raises:
        ValueError: If the file name is not an ISO timestamp.
        AssertionError: If an element of an unexpected kind is encountered.
    """
    # ``datetime_sent`` lives in the package ``__init__``. It is imported
    # here, not at module level, so that the package can import this module
    # without creating an import cycle.
    from . import datetime_sent

    logger.info('Parsing %s', path)
    chat_start = datetime.datetime.fromisoformat(
        path.stem.removesuffix('CEST').removesuffix('CET'),
    )

    # Explicit encoding so parsing does not depend on the locale default.
    # NOTE(review): assumes the logs are UTF-8 (libpurple HTML logs declare
    # charset=UTF-8) -- confirm for legacy logs.
    with open(path, encoding='utf-8') as f:
        soup = bs4.BeautifulSoup(f, 'lxml')

    if not soup.contents:
        logger.warning('File is empty?')
        return []

    messages: list[Message] = []

    # State of the log line currently being assembled.
    cur_sent_at: datetime.datetime | None = None
    cur_sender: str | None = None
    cur_text: str = ''

    # The log lines are either wrapped in a single <p> or sit directly in
    # <body>.
    if soup.body.p:
        loglines = soup.body.p.children
    else:
        loglines = soup.body.children

    for c in loglines:
        if c.name in {'font', 'span'} and cur_sent_at is None:
            # First font/span of a line: carries the timestamp ...
            cur_sent_at = datetime_sent(chat_start, parse_timestamp(c))

            # ... and, when a bold child is present, the sender.
            if c.b:
                assert cur_sender is None
                cur_sender = (
                    c.b.get_text()
                    .strip()
                    .removesuffix(':')
                    .removeprefix('***')
                    .removesuffix('[m]')
                )

        elif c.name in {None, 'span', 'font'}:
            # Plain text node (name is None) or styled text.
            cur_text += c.get_text()

        elif c.name == 'a':
            # Render links as <url>.
            cur_text += '<' + c['href'] + '>'

        elif c.name == 'br':
            # End of the log line: emit it if it had a sender, then reset.
            if cur_sender:
                messages.append(
                    Message(cur_sent_at, cur_sender, cur_text.strip()),
                )
            cur_sent_at = None
            cur_sender = None
            cur_text = ''

        elif c.name == 'b':
            # Indicates system message. Ignore
            pass

        elif c.name in {'h1', 'h3'}:
            pass  # Ignore log header

        else:
            # Raised explicitly (not ``assert False``) so unknown elements
            # are still reported when Python runs with ``-O``.
            raise AssertionError(c)

    return messages
|
||||||
|
|
||||||
|
|
||||||
|
def parse_messages_in_chat_folder(chat_folder_path: Path) -> list[Message]:
    """Parse every entry of a chat folder and return all messages, sorted.

    Args:
        chat_folder_path: Directory whose entries are each passed to
            ``parse_messages_in_chat_file``, in sorted path order.

    Returns:
        All messages from all entries, sorted by ``Message`` ordering.
    """
    log_files = sorted(chat_folder_path.iterdir())
    collected = [
        message
        for log_file in log_files
        for message in parse_messages_in_chat_file(log_file)
    ]
    return sorted(collected)
|
|
@ -3,3 +3,8 @@ import libpurple_to_markdown
|
||||||
|
|
||||||
def test_version():
|
def test_version():
|
||||||
assert libpurple_to_markdown.__version__ is not None
|
assert libpurple_to_markdown.__version__ is not None
|
||||||
|
|
||||||
|
|
||||||
|
def test_load():
    """Smoke test: the ``data`` and ``libpurple`` submodules import cleanly."""
    from libpurple_to_markdown import data, libpurple  # noqa: F401
|
||||||
|
|
Loading…
Reference in New Issue
Block a user