From 8cec980d315fa510cf4f2335b679576a0baf0538 Mon Sep 17 00:00:00 2001 From: Jon Michael Aanes Date: Sun, 3 Nov 2024 17:14:43 +0100 Subject: [PATCH] Improved normalization --- libpurple_to_markdown/__main__.py | 10 ++++++++-- libpurple_to_markdown/synctech_sms.py | 26 +++++++++++++++++++++++--- test/test_synctech_sms.py | 4 ++++ 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/libpurple_to_markdown/__main__.py b/libpurple_to_markdown/__main__.py index a79fc01..da90b7a 100644 --- a/libpurple_to_markdown/__main__.py +++ b/libpurple_to_markdown/__main__.py @@ -46,7 +46,7 @@ MAX_AVERAGE_MESSAGES_PER_PERIOD = 120 def group_messages_by_period(messages: Iterable[Message]) -> dict[str, list[Message]]: possible_period_keys = [ - (lambda msg: 'Full History'), + (lambda msg: 'full'), year_period_key, year_quarter_period_key, year_and_month_period_key, @@ -108,6 +108,11 @@ def main(): for chat_id, messages_in_chat_original in messages_by_chat_id.items(): messages_in_chat = merge_adjacent_messages(messages_in_chat_original) + if len(messages_in_chat) <= 2: + logger.info( + ' "%s": Skipped due to too few messages', chat_id) + continue + messages_by_period = group_messages_by_period(messages_in_chat) logger.info( ' "%s": %d messages, %d periods (%d msg/period avg)', @@ -118,7 +123,8 @@ def main(): ) for period_key, messages in messages_by_period.items(): - output_file = args.output / chat_id / f'{period_key}.md' + file_escaped_chat_id = chat_id.replace(' ','-') + output_file = args.output / chat_id / f'{file_escaped_chat_id}-{period_key}.md' output_file.parent.mkdir(exist_ok=True) logger.info('Writing % 5d messages to %s', len(messages), output_file) with open(output_file, 'w') as f: diff --git a/libpurple_to_markdown/synctech_sms.py b/libpurple_to_markdown/synctech_sms.py index a1a9114..1b49ad5 100644 --- a/libpurple_to_markdown/synctech_sms.py +++ b/libpurple_to_markdown/synctech_sms.py @@ -17,7 +17,17 @@ from .data import MYSELF, Message logger = logging.getLogger(__name__) +def is_named_number(num: str) -> str: + try: + int(num.removeprefix('+').replace(' ', '')) + return False + except ValueError: + return True + def normalize_phone_number(num: str) -> str: + if is_named_number(num): + return num + num = num.replace(' ', '') if num.startswith('00'): num = '+' + num.removeprefix('00') @@ -33,14 +43,24 @@ def sms_soup_to_message(soup: bs4.BeautifulSoup) -> Message: sent_at = datetime.datetime.fromtimestamp(int(soup['date']) / 1000) phone_num = normalize_phone_number(soup['address']) - contact_name = soup.get('contact_name') or phone_num + + if is_named_number(phone_num): + contact_name = phone_num + phone_num = None + else: + contact_name = soup.get('contact_name') or phone_num + if contact_name == '(Unknown)': + contact_name = None + if soup['type'] == '2': sender = MYSELF else: - sender = contact_name + sender = contact_name or phone_num text = soup['body'] - chat_id = f'SMS {contact_name} {phone_num}' + + chat_id_parts = ['SMS', contact_name or phone_num] + chat_id = ' '.join(p for p in chat_id_parts if p) return Message(sent_at, sender, text, chat_id=chat_id) diff --git a/test/test_synctech_sms.py b/test/test_synctech_sms.py index c9349ad..fc98d02 100644 --- a/test/test_synctech_sms.py +++ b/test/test_synctech_sms.py @@ -1,5 +1,6 @@ from libpurple_to_markdown import synctech_sms + def test_normalize_phone_number(): assert synctech_sms.normalize_phone_number('+45 12 34 56 78') == '+4512345678' assert synctech_sms.normalize_phone_number('+4512345678') == '+4512345678' @@ -8,3 +9,6 @@ def test_normalize_phone_number(): assert synctech_sms.normalize_phone_number('12 34 56 78') == '+4512345678' assert synctech_sms.normalize_phone_number('441234567890') == '+441234567890' assert synctech_sms.normalize_phone_number('004712345678') == '+4712345678' + +def test_dont_normalize_weird(): + assert synctech_sms.normalize_phone_number('Midttrafik') == 'Midttrafik'