Improved normalization
This commit is contained in:
parent
426b32d5cb
commit
8cec980d31
|
@ -46,7 +46,7 @@ MAX_AVERAGE_MESSAGES_PER_PERIOD = 120
|
||||||
|
|
||||||
def group_messages_by_period(messages: Iterable[Message]) -> dict[str, list[Message]]:
|
def group_messages_by_period(messages: Iterable[Message]) -> dict[str, list[Message]]:
|
||||||
possible_period_keys = [
|
possible_period_keys = [
|
||||||
(lambda msg: 'Full History'),
|
(lambda msg: 'full'),
|
||||||
year_period_key,
|
year_period_key,
|
||||||
year_quarter_period_key,
|
year_quarter_period_key,
|
||||||
year_and_month_period_key,
|
year_and_month_period_key,
|
||||||
|
@ -108,6 +108,11 @@ def main():
|
||||||
|
|
||||||
for chat_id, messages_in_chat_original in messages_by_chat_id.items():
|
for chat_id, messages_in_chat_original in messages_by_chat_id.items():
|
||||||
messages_in_chat = merge_adjacent_messages(messages_in_chat_original)
|
messages_in_chat = merge_adjacent_messages(messages_in_chat_original)
|
||||||
|
if len(messages_in_chat) <= 2:
|
||||||
|
logger.info(
|
||||||
|
' "%s": Skipped due to too few messages', chat_id)
|
||||||
|
continue
|
||||||
|
|
||||||
messages_by_period = group_messages_by_period(messages_in_chat)
|
messages_by_period = group_messages_by_period(messages_in_chat)
|
||||||
logger.info(
|
logger.info(
|
||||||
' "%s": %d messages, %d periods (%d msg/period avg)',
|
' "%s": %d messages, %d periods (%d msg/period avg)',
|
||||||
|
@ -118,7 +123,8 @@ def main():
|
||||||
)
|
)
|
||||||
|
|
||||||
for period_key, messages in messages_by_period.items():
|
for period_key, messages in messages_by_period.items():
|
||||||
output_file = args.output / chat_id / f'{period_key}.md'
|
file_escaped_chat_id = chat_id.replace(' ','-')
|
||||||
|
output_file = args.output / chat_id / f'{file_escaped_chat_id}-{period_key}.md'
|
||||||
output_file.parent.mkdir(exist_ok=True)
|
output_file.parent.mkdir(exist_ok=True)
|
||||||
logger.info('Writing % 5d messages to %s', len(messages), output_file)
|
logger.info('Writing % 5d messages to %s', len(messages), output_file)
|
||||||
with open(output_file, 'w') as f:
|
with open(output_file, 'w') as f:
|
||||||
|
|
|
@ -17,7 +17,17 @@ from .data import MYSELF, Message
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def is_named_number(num: str) -> bool:
    """Report whether *num* is a name rather than a numeric phone number.

    A single leading ``+`` and all spaces are ignored before the check, so
    values such as ``'+45 12 34 56 78'`` count as numeric. Anything that
    ``int()`` cannot parse afterwards (for example an alphanumeric sender
    such as ``'Midttrafik'``, or an empty string) is treated as a name.

    Args:
        num: The raw address string to classify.

    Returns:
        ``True`` if *num* does not parse as an integer, ``False`` if it does.
    """
    digits = num.removeprefix('+').replace(' ', '')
    try:
        # Only the parse attempt can raise; the result itself is not needed.
        int(digits)
    except ValueError:
        return True
    return False
|
||||||
|
|
||||||
def normalize_phone_number(num: str) -> str:
|
def normalize_phone_number(num: str) -> str:
|
||||||
|
if is_named_number(num):
|
||||||
|
return num
|
||||||
|
|
||||||
num = num.replace(' ', '')
|
num = num.replace(' ', '')
|
||||||
if num.startswith('00'):
|
if num.startswith('00'):
|
||||||
num = '+' + num.removeprefix('00')
|
num = '+' + num.removeprefix('00')
|
||||||
|
@ -33,14 +43,24 @@ def sms_soup_to_message(soup: bs4.BeautifulSoup) -> Message:
|
||||||
sent_at = datetime.datetime.fromtimestamp(int(soup['date']) / 1000)
|
sent_at = datetime.datetime.fromtimestamp(int(soup['date']) / 1000)
|
||||||
|
|
||||||
phone_num = normalize_phone_number(soup['address'])
|
phone_num = normalize_phone_number(soup['address'])
|
||||||
contact_name = soup.get('contact_name') or phone_num
|
|
||||||
|
if is_named_number(phone_num):
|
||||||
|
contact_name = phone_num
|
||||||
|
phone_num = None
|
||||||
|
else:
|
||||||
|
contact_name = soup.get('contact_name') or phone_num
|
||||||
|
if contact_name == '(Unknown)':
|
||||||
|
contact_name = None
|
||||||
|
|
||||||
if soup['type'] == '2':
|
if soup['type'] == '2':
|
||||||
sender = MYSELF
|
sender = MYSELF
|
||||||
else:
|
else:
|
||||||
sender = contact_name
|
sender = contact_name or phone_num
|
||||||
|
|
||||||
text = soup['body']
|
text = soup['body']
|
||||||
chat_id = f'SMS {contact_name} {phone_num}'
|
|
||||||
|
chat_id_parts = ['SMS', contact_name or phone_num]
|
||||||
|
chat_id = ' '.join(p for p in chat_id_parts if p)
|
||||||
return Message(sent_at, sender, text, chat_id=chat_id)
|
return Message(sent_at, sender, text, chat_id=chat_id)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
from libpurple_to_markdown import synctech_sms
|
from libpurple_to_markdown import synctech_sms
|
||||||
|
|
||||||
|
|
||||||
def test_normalize_phone_number():
|
def test_normalize_phone_number():
|
||||||
assert synctech_sms.normalize_phone_number('+45 12 34 56 78') == '+4512345678'
|
assert synctech_sms.normalize_phone_number('+45 12 34 56 78') == '+4512345678'
|
||||||
assert synctech_sms.normalize_phone_number('+4512345678') == '+4512345678'
|
assert synctech_sms.normalize_phone_number('+4512345678') == '+4512345678'
|
||||||
|
@ -8,3 +9,6 @@ def test_normalize_phone_number():
|
||||||
assert synctech_sms.normalize_phone_number('12 34 56 78') == '+4512345678'
|
assert synctech_sms.normalize_phone_number('12 34 56 78') == '+4512345678'
|
||||||
assert synctech_sms.normalize_phone_number('441234567890') == '+441234567890'
|
assert synctech_sms.normalize_phone_number('441234567890') == '+441234567890'
|
||||||
assert synctech_sms.normalize_phone_number('004712345678') == '+4712345678'
|
assert synctech_sms.normalize_phone_number('004712345678') == '+4712345678'
|
||||||
|
|
||||||
|
def test_dont_normalize_weird():
    """A non-numeric sender name must pass through normalization unchanged."""
    sender_name = 'Midttrafik'
    normalized = synctech_sms.normalize_phone_number(sender_name)
    assert normalized == 'Midttrafik'
|
||||||
|
|
Loading…
Reference in New Issue
Block a user