Working on more robust methods
This commit is contained in:
parent
0a870dfa11
commit
4e48e345ca
|
@ -24,13 +24,23 @@ def datetime_sent(chat_start: datetime.datetime, message_sent: datetime.time) ->
|
||||||
naive = naive - datetime.timedelta(days=1)
|
naive = naive - datetime.timedelta(days=1)
|
||||||
return naive
|
return naive
|
||||||
|
|
||||||
|
def parse_timestamp(c) -> datetime.time:
    """Extract the ``(HH:MM:SS)`` timestamp that starts a log line element.

    The text of ``c`` itself is tried first.  When that text does not begin
    with a timestamp and ``c`` has a nested ``font`` child, the child's text
    is tried as well.

    Args:
        c: Element exposing ``get_text()`` and a ``font`` attribute
            (presumably a ``bs4`` tag -- confirm against the caller).

    Returns:
        The time of day encoded by the timestamp.

    Raises:
        ValueError: If no timestamp is found, or if its fields do not form
            a valid time of day.
    """
    pattern = r'\((\d+):(\d+):(\d+)\)'

    # The outer element comes first so that every input that parsed before
    # still parses the same way; the nested <font> is only a fallback.
    candidates = [c]
    if c.font is not None:
        candidates.append(c.font)

    for candidate in candidates:
        m = re.match(pattern, candidate.get_text())
        if m is not None:
            hour, minute, second = (int(part) for part in m.groups())
            return datetime.time(hour, minute, second)

    # Fail loudly with the offending text instead of an AttributeError on None.
    raise ValueError(f'No (HH:MM:SS) timestamp found in {c.get_text()!r}')
|
||||||
|
|
||||||
def parse_messages_in_chat_file(path: Path) -> list[Message]:
|
def parse_messages_in_chat_file(path: Path) -> list[Message]:
|
||||||
logger.info('Parsing %s', path)
|
logger.info('Parsing %s', path)
|
||||||
chat_start = datetime.datetime.fromisoformat(path.stem.removesuffix('CEST'))
|
chat_start = datetime.datetime.fromisoformat(path.stem.removesuffix('CEST').removesuffix('CET'))
|
||||||
|
|
||||||
with open(path) as f:
|
with open(path) as f:
|
||||||
soup = bs4.BeautifulSoup(f)
|
soup = bs4.BeautifulSoup(f, 'lxml')
|
||||||
|
|
||||||
|
if len(soup.contents) == 0:
|
||||||
|
logger.warning('File is empty?')
|
||||||
|
return []
|
||||||
|
|
||||||
messages = []
|
messages = []
|
||||||
|
|
||||||
|
@ -38,21 +48,26 @@ def parse_messages_in_chat_file(path: Path) -> list[Message]:
|
||||||
cur_sender: str = 'NOT DEFINED'
|
cur_sender: str = 'NOT DEFINED'
|
||||||
cur_text: str = ''
|
cur_text: str = ''
|
||||||
|
|
||||||
for c in soup.body.children:
|
if soup.p:
|
||||||
if c.name == 'font':
|
logger.warning('File indicates error message?')
|
||||||
|
return [] # TODO
|
||||||
|
else:
|
||||||
|
loglines = soup.body.children
|
||||||
|
|
||||||
|
for c in loglines:
|
||||||
|
if c.name in {'font','span'} and cur_sent_at is None:
|
||||||
# Get timestamp
|
# Get timestamp
|
||||||
m = re.match(r'\((\d+):(\d+):(\d+)\)', c.font.get_text())
|
cur_sent_at = datetime_sent(chat_start, parse_timestamp(c))
|
||||||
time_sent = datetime.time(int(m.group(1)), int(m.group(2)), int(m.group(3)))
|
|
||||||
cur_sent_at = datetime_sent(chat_start, time_sent)
|
|
||||||
|
|
||||||
# Get sender
|
# Get sender
|
||||||
|
if c.b:
|
||||||
cur_sender = c.b.get_text().strip().removesuffix(':')
|
cur_sender = c.b.get_text().strip().removesuffix(':')
|
||||||
|
|
||||||
elif c.name is None:
|
elif c.name in {None,'span','font'}:
|
||||||
cur_text = c.get_text()
|
cur_text += c.get_text()
|
||||||
|
|
||||||
elif c.name == 'a':
|
elif c.name == 'a':
|
||||||
cur_text = cur_text + '<' + c['href'] + '>'
|
cur_text += cur_text + '<' + c['href'] + '>'
|
||||||
|
|
||||||
elif c.name == 'br':
|
elif c.name == 'br':
|
||||||
messages.append(Message(cur_sent_at, cur_sender, cur_text.strip()))
|
messages.append(Message(cur_sent_at, cur_sender, cur_text.strip()))
|
||||||
|
@ -60,7 +75,11 @@ def parse_messages_in_chat_file(path: Path) -> list[Message]:
|
||||||
cur_sender = 'NOT DEFINED'
|
cur_sender = 'NOT DEFINED'
|
||||||
cur_text = ''
|
cur_text = ''
|
||||||
|
|
||||||
elif c.name == 'h3':
|
elif c.name == 'b':
|
||||||
|
# Indicates system message. Ignore
|
||||||
|
pass
|
||||||
|
|
||||||
|
elif c.name in {'h1', 'h3'}:
|
||||||
pass # Ignore log header
|
pass # Ignore log header
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import argparse
|
import argparse
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from . import (parse_messages_in_chat_folder, merge_adjacent_messages,
|
from . import (parse_messages_in_chat_folder, merge_adjacent_messages,
|
||||||
|
@ -10,6 +11,8 @@ def parse_args():
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
logging.basicConfig()
|
||||||
|
logging.getLogger().setLevel('INFO')
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|
||||||
messages = parse_messages_in_chat_folder(args.path)
|
messages = parse_messages_in_chat_folder(args.path)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user