1
0

Normalize and deduplicate

This commit is contained in:
Jon Michael Aanes 2024-05-18 21:52:22 +02:00
parent 34ba384265
commit 62db705b3e
Signed by: Jmaa
SSH Key Fingerprint: SHA256:Ab0GfHGCblESJx7JRE4fj4bFy/KRpeLhi41y4pF3sNA
2 changed files with 36 additions and 1 deletions

View File

@@ -99,6 +99,12 @@ def deduplicate_dicts(
return dicts, fieldnames return dicts, fieldnames
def normalize_dict(d: dict) -> frozendict:
    """Return a frozendict of the parsed values of ``d``.

    Each value is converted with ``str`` and passed through ``to_value``.
    Entries for which ``to_value`` returns ``None`` are left out of the
    result; keys are kept unchanged.
    """
    normalized = {}
    for key, raw in d.items():
        # Parse once and reuse the result for both the filter and the
        # stored value, instead of calling to_value twice per entry.
        parsed = to_value(str(raw))
        if parsed is not None:
            normalized[key] = parsed
    return frozendict(normalized)
def extend_csv_file( def extend_csv_file(
filename: str, filename: str,
new_dicts: list[dict], new_dicts: list[dict],
@@ -123,7 +129,7 @@ def extend_csv_file(
logger.info('Creating file: %s', filename) logger.info('Creating file: %s', filename)
original_num_dicts = len(dicts) original_num_dicts = len(dicts)
dicts += [frozendict(d) for d in new_dicts] dicts += [normalize_dict(d) for d in new_dicts]
del new_dicts del new_dicts
dicts, fieldnames = deduplicate_dicts( dicts, fieldnames = deduplicate_dicts(

29
test/test_deduplicate.py Normal file
View File

@@ -0,0 +1,29 @@
from frozendict import frozendict
from personal_data.data import DeduplicateMode
from personal_data.main import deduplicate_dicts
# Shared fixture: four rows that are identical except for the value of 't'
# (300 through 303).
LIST = [frozendict({'a': 1, 'b': 2, 't': t}) for t in range(300, 304)]
def test_no_deduplicate():
    """DeduplicateMode.NONE with an empty third argument returns LIST as-is."""
    expected_fields = ['a', 'b', 't']
    rows, fieldnames = deduplicate_dicts(LIST, DeduplicateMode.NONE, [])
    assert fieldnames == expected_fields
    assert rows == LIST
def test_only_latest_no_fields():
    """ONLY_LATEST with an empty third argument still returns every row."""
    expected_fields = ['a', 'b', 't']
    rows, fieldnames = deduplicate_dicts(
        LIST,
        DeduplicateMode.ONLY_LATEST,
        [],
    )
    assert fieldnames == expected_fields
    assert rows == LIST
def test_only_latest():
    """ONLY_LATEST with ['t'] as third argument yields only the t=300 row."""
    expected_fields = ['a', 'b', 't']
    expected_rows = [frozendict({'a': 1, 'b': 2, 't': 300})]
    rows, fieldnames = deduplicate_dicts(
        LIST,
        DeduplicateMode.ONLY_LATEST,
        ['t'],
    )
    assert fieldnames == expected_fields
    assert rows == expected_rows