1
0

Improved normalization

This commit is contained in:
Jon Michael Aanes 2024-11-03 17:14:43 +01:00
parent 426b32d5cb
commit 8cec980d31
Signed by: Jmaa
SSH Key Fingerprint: SHA256:Ab0GfHGCblESJx7JRE4fj4bFy/KRpeLhi41y4pF3sNA
3 changed files with 35 additions and 5 deletions

View File

@ -46,7 +46,7 @@ MAX_AVERAGE_MESSAGES_PER_PERIOD = 120
def group_messages_by_period(messages: Iterable[Message]) -> dict[str, list[Message]]: def group_messages_by_period(messages: Iterable[Message]) -> dict[str, list[Message]]:
possible_period_keys = [ possible_period_keys = [
(lambda msg: 'Full History'), (lambda msg: 'full'),
year_period_key, year_period_key,
year_quarter_period_key, year_quarter_period_key,
year_and_month_period_key, year_and_month_period_key,
@ -108,6 +108,11 @@ def main():
for chat_id, messages_in_chat_original in messages_by_chat_id.items(): for chat_id, messages_in_chat_original in messages_by_chat_id.items():
messages_in_chat = merge_adjacent_messages(messages_in_chat_original) messages_in_chat = merge_adjacent_messages(messages_in_chat_original)
if len(messages_in_chat) <= 2:
logger.info(
' "%s": Skipped due to too few messages', chat_id)
continue
messages_by_period = group_messages_by_period(messages_in_chat) messages_by_period = group_messages_by_period(messages_in_chat)
logger.info( logger.info(
' "%s": %d messages, %d periods (%d msg/period avg)', ' "%s": %d messages, %d periods (%d msg/period avg)',
@ -118,7 +123,8 @@ def main():
) )
for period_key, messages in messages_by_period.items(): for period_key, messages in messages_by_period.items():
output_file = args.output / chat_id / f'{period_key}.md' file_escaped_chat_id = chat_id.replace(' ','-')
output_file = args.output / chat_id / f'{file_escaped_chat_id}-{period_key}.md'
output_file.parent.mkdir(exist_ok=True) output_file.parent.mkdir(exist_ok=True)
logger.info('Writing % 5d messages to %s', len(messages), output_file) logger.info('Writing % 5d messages to %s', len(messages), output_file)
with open(output_file, 'w') as f: with open(output_file, 'w') as f:

View File

@ -17,7 +17,17 @@ from .data import MYSELF, Message
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def is_named_number(num: str) -> bool:
    """Return whether *num* is a sender name rather than a numeric address.

    The value is treated as numeric when, after dropping a leading ``+``
    and removing every space, ``int()`` can parse what remains. Anything
    else (for example an alphanumeric sender such as ``'Midttrafik'``, or
    the empty string) is reported as a named number.

    Args:
        num: The raw address string to classify.

    Returns:
        ``True`` if the address cannot be parsed as an integer, ``False``
        if it can.
    """
    # Strip the formatting that is legal in a written phone number before
    # attempting the numeric parse.
    candidate = num.removeprefix('+').replace(' ', '')
    try:
        int(candidate)
    except ValueError:
        return True
    return False
def normalize_phone_number(num: str) -> str: def normalize_phone_number(num: str) -> str:
if is_named_number(num):
return num
num = num.replace(' ', '') num = num.replace(' ', '')
if num.startswith('00'): if num.startswith('00'):
num = '+' + num.removeprefix('00') num = '+' + num.removeprefix('00')
@ -33,14 +43,24 @@ def sms_soup_to_message(soup: bs4.BeautifulSoup) -> Message:
sent_at = datetime.datetime.fromtimestamp(int(soup['date']) / 1000) sent_at = datetime.datetime.fromtimestamp(int(soup['date']) / 1000)
phone_num = normalize_phone_number(soup['address']) phone_num = normalize_phone_number(soup['address'])
contact_name = soup.get('contact_name') or phone_num
if is_named_number(phone_num):
contact_name = phone_num
phone_num = None
else:
contact_name = soup.get('contact_name') or phone_num
if contact_name == '(Unknown)':
contact_name = None
if soup['type'] == '2': if soup['type'] == '2':
sender = MYSELF sender = MYSELF
else: else:
sender = contact_name sender = contact_name or phone_num
text = soup['body'] text = soup['body']
chat_id = f'SMS {contact_name} {phone_num}'
chat_id_parts = ['SMS', contact_name or phone_num]
chat_id = ' '.join(p for p in chat_id_parts if p)
return Message(sent_at, sender, text, chat_id=chat_id) return Message(sent_at, sender, text, chat_id=chat_id)

View File

@ -1,5 +1,6 @@
from libpurple_to_markdown import synctech_sms from libpurple_to_markdown import synctech_sms
def test_normalize_phone_number(): def test_normalize_phone_number():
assert synctech_sms.normalize_phone_number('+45 12 34 56 78') == '+4512345678' assert synctech_sms.normalize_phone_number('+45 12 34 56 78') == '+4512345678'
assert synctech_sms.normalize_phone_number('+4512345678') == '+4512345678' assert synctech_sms.normalize_phone_number('+4512345678') == '+4512345678'
@ -8,3 +9,6 @@ def test_normalize_phone_number():
assert synctech_sms.normalize_phone_number('12 34 56 78') == '+4512345678' assert synctech_sms.normalize_phone_number('12 34 56 78') == '+4512345678'
assert synctech_sms.normalize_phone_number('441234567890') == '+441234567890' assert synctech_sms.normalize_phone_number('441234567890') == '+441234567890'
assert synctech_sms.normalize_phone_number('004712345678') == '+4712345678' assert synctech_sms.normalize_phone_number('004712345678') == '+4712345678'
def test_dont_normalize_weird():
    """A non-numeric sender name is returned unchanged by normalization."""
    sender_name = 'Midttrafik'
    normalized = synctech_sms.normalize_phone_number(sender_name)
    assert normalized == 'Midttrafik'