1
0

Working on more robust methods

This commit is contained in:
Jon Michael Aanes 2024-10-26 01:30:51 +02:00
parent 0a870dfa11
commit 4e48e345ca
Signed by: Jmaa
SSH Key Fingerprint: SHA256:Ab0GfHGCblESJx7JRE4fj4bFy/KRpeLhi41y4pF3sNA
2 changed files with 34 additions and 12 deletions

View File

@ -24,13 +24,23 @@ def datetime_sent(chat_start: datetime.datetime, message_sent: datetime.time) ->
naive = naive - datetime.timedelta(days=1) naive = naive - datetime.timedelta(days=1)
return naive return naive
def parse_timestamp(c) -> datetime.time:
    """Extract the ``(HH:MM:SS)`` timestamp that starts a chat log line.

    Args:
        c: A parsed HTML element exposing a ``font`` attribute (the first
            nested ``<font>`` element, or ``None``) and a ``get_text()``
            method, e.g. a BeautifulSoup tag.

    Returns:
        The time of day encoded by the timestamp.

    Raises:
        ValueError: If neither the nested ``<font>`` nor the element itself
            starts with a ``(HH:MM:SS)`` timestamp, or if the matched numbers
            do not form a valid time of day.
    """
    # Prefer the nested <font> when there is one, since the outer element may
    # carry leading text or whitespace that would defeat the anchored match.
    # Fall back to the element itself so inputs whose timestamp sits directly
    # in the outer element keep working.
    candidates = [c] if c.font is None else [c.font, c]
    for candidate in candidates:
        m = re.match(r'\((\d+):(\d+):(\d+)\)', candidate.get_text())
        if m is not None:
            return datetime.time(int(m.group(1)), int(m.group(2)), int(m.group(3)))
    # Fail with a message that names the offending text instead of an
    # AttributeError on a None match object.
    raise ValueError(f'No (HH:MM:SS) timestamp found in {c.get_text()!r}')
def parse_messages_in_chat_file(path: Path) -> list[Message]: def parse_messages_in_chat_file(path: Path) -> list[Message]:
logger.info('Parsing %s', path) logger.info('Parsing %s', path)
chat_start = datetime.datetime.fromisoformat(path.stem.removesuffix('CEST')) chat_start = datetime.datetime.fromisoformat(path.stem.removesuffix('CEST').removesuffix('CET'))
with open(path) as f: with open(path) as f:
soup = bs4.BeautifulSoup(f) soup = bs4.BeautifulSoup(f, 'lxml')
if len(soup.contents) == 0:
logger.warning('File is empty?')
return []
messages = [] messages = []
@ -38,21 +48,26 @@ def parse_messages_in_chat_file(path: Path) -> list[Message]:
cur_sender: str = 'NOT DEFINED' cur_sender: str = 'NOT DEFINED'
cur_text: str = '' cur_text: str = ''
for c in soup.body.children: if soup.p:
if c.name == 'font': logger.warning('File indicates error message?')
return [] # TODO
else:
loglines = soup.body.children
for c in loglines:
if c.name in {'font','span'} and cur_sent_at is None:
# Get timestamp # Get timestamp
m = re.match(r'\((\d+):(\d+):(\d+)\)', c.font.get_text()) cur_sent_at = datetime_sent(chat_start, parse_timestamp(c))
time_sent = datetime.time(int(m.group(1)), int(m.group(2)), int(m.group(3)))
cur_sent_at = datetime_sent(chat_start, time_sent)
# Get sender # Get sender
if c.b:
cur_sender = c.b.get_text().strip().removesuffix(':') cur_sender = c.b.get_text().strip().removesuffix(':')
elif c.name is None: elif c.name in {None,'span','font'}:
cur_text = c.get_text() cur_text += c.get_text()
elif c.name == 'a': elif c.name == 'a':
cur_text = cur_text + '<' + c['href'] + '>' cur_text += cur_text + '<' + c['href'] + '>'
elif c.name == 'br': elif c.name == 'br':
messages.append(Message(cur_sent_at, cur_sender, cur_text.strip())) messages.append(Message(cur_sent_at, cur_sender, cur_text.strip()))
@ -60,7 +75,11 @@ def parse_messages_in_chat_file(path: Path) -> list[Message]:
cur_sender = 'NOT DEFINED' cur_sender = 'NOT DEFINED'
cur_text = '' cur_text = ''
elif c.name == 'h3': elif c.name == 'b':
# Indicates system message. Ignore
pass
elif c.name in {'h1', 'h3'}:
pass # Ignore log header pass # Ignore log header

View File

@ -1,4 +1,5 @@
import argparse import argparse
import logging
from pathlib import Path from pathlib import Path
from . import (parse_messages_in_chat_folder, merge_adjacent_messages, from . import (parse_messages_in_chat_folder, merge_adjacent_messages,
@ -10,6 +11,8 @@ def parse_args():
return parser.parse_args() return parser.parse_args()
def main(): def main():
logging.basicConfig()
logging.getLogger().setLevel('INFO')
args = parse_args() args = parse_args()
messages = parse_messages_in_chat_folder(args.path) messages = parse_messages_in_chat_folder(args.path)