1
0
personal-data/personal_data/util.py

168 lines
4.5 KiB
Python
Raw Normal View History

2024-10-10 22:54:01 +00:00
import _csv
2024-08-25 18:08:41 +00:00
import csv
import datetime
import io
import logging
2024-08-25 18:50:03 +00:00
import urllib.parse
2024-10-25 20:09:27 +00:00
from collections.abc import Iterable, Mapping
2024-08-25 18:50:03 +00:00
from pathlib import Path
2024-10-13 13:20:30 +00:00
from typing import Any
2024-08-25 18:08:41 +00:00
from frozendict import frozendict
2024-10-13 13:20:30 +00:00
from . import csv_import, data
2024-08-25 18:08:41 +00:00
logger = logging.getLogger(__name__)


def csv_safe_value(v: Any) -> str:
    """Serialize *v* to a string suitable for a CSV cell.

    URLs are rendered with ``geturl()``; naive datetimes (no tzinfo) are
    rejected so only timezone-aware timestamps ever reach the CSV output.
    Everything else falls back to ``str()``.

    Raises:
        RuntimeError: if *v* is a naive ``datetime.datetime``.
    """
    if isinstance(v, urllib.parse.ParseResult):
        return v.geturl()
    # Reject naive datetimes early: writing them out would lose the
    # timezone information permanently.
    if isinstance(v, datetime.datetime) and v.tzinfo is None:
        raise RuntimeError(v)
    return str(v)
2024-08-25 18:08:41 +00:00
def equals_without_fields(
    a: Mapping[str, Any],
    b: Mapping[str, Any],
    fields: Iterable[str] = frozenset(),
) -> bool:
    """Return True if *a* and *b* are equal when *fields* are ignored.

    Args:
        a: First mapping to compare.
        b: Second mapping to compare.
        fields: Keys to exclude from the comparison.

    Returns:
        True when the two mappings agree on every key outside *fields*.
    """
    a = dict(a)
    b = dict(b)
    for f in fields:
        # pop with a default instead of `del`: an ignored field may be
        # absent from a row (normalize_dict drops None-valued keys), and
        # `del` would raise KeyError in that case.
        a.pop(f, None)
        b.pop(f, None)
    # Plain dict equality suffices; wrapping both sides in frozendict
    # compares the same key/value pairs.
    return a == b
def deduplicate_by_ignoring_certain_fields(
    dicts: list[frozendict[str, Any]],
    deduplicate_ignore_columns: Iterable[str],
) -> list[frozendict[str, Any]]:
    """Removes duplicates that occur when ignoring certain columns.

    For each pair of rows that compare equal once the ignored columns are
    stripped, only the earliest row is kept.

    Output order is stable.
    """
    # Pairwise O(n^2) scan: mark the later index of every matching pair.
    duplicate_indices: set[int] = set()
    for i, first in enumerate(dicts):
        for j in range(i + 1, len(dicts)):
            if equals_without_fields(first, dicts[j], deduplicate_ignore_columns):
                duplicate_indices.add(j)

    # Delete from the end so earlier indices stay valid.
    for idx in sorted(duplicate_indices, reverse=True):
        del dicts[idx]

    return dicts
def deduplicate_dicts(
    dicts: list[frozendict[str, Any]],
    deduplicate_mode: data.DeduplicateMode,
    deduplicate_ignore_columns: list[str],
) -> tuple[list[frozendict[str, Any]], list[str]]:
    """Deduplicate *dicts* according to *deduplicate_mode*.

    Args:
        dicts: Rows to deduplicate.
        deduplicate_mode: Strategy for removing duplicates.
        deduplicate_ignore_columns: Columns excluded from equality checks.

    Returns:
        A tuple of (sorted deduplicated rows, field names in first-seen
        order across all input rows).

    Raises:
        TypeError: if *deduplicate_ignore_columns* is not a list.
    """
    if not isinstance(deduplicate_ignore_columns, list):
        raise TypeError(deduplicate_ignore_columns)

    # Union of all keys, preserving first-appearance order.
    fieldnames = list(dict.fromkeys(k for d in dicts for k in d))

    if deduplicate_mode == data.DeduplicateMode.ONLY_LATEST:
        # Trim trailing rows that repeat their predecessor (ignoring the
        # configured columns).
        while len(dicts) > 1 and equals_without_fields(
            dicts[-1],
            dicts[-2],
            deduplicate_ignore_columns,
        ):
            del dicts[-1]
    elif deduplicate_mode == data.DeduplicateMode.BY_ALL_COLUMNS:
        dicts = deduplicate_by_ignoring_certain_fields(
            dicts,
            deduplicate_ignore_columns,
        )
    elif deduplicate_mode != data.DeduplicateMode.NONE:
        # Any other mode: collapse exact duplicates (order restored by the
        # sort below).
        dicts = list(set(dicts))

    dicts = sorted(dicts, key=lambda d: tuple(str(d.get(fn, '')) for fn in fieldnames))
    return dicts, fieldnames
2024-10-25 20:09:27 +00:00
def normalize_dict(d: dict[str, Any] | frozendict[str, Any]) -> frozendict[str, Any]:
    """Normalize a row by round-tripping every value through the CSV parser.

    Each value is stringified and re-parsed with
    ``csv_import.csv_str_to_value``; keys whose parsed value is ``None``
    are dropped entirely.

    Returns:
        An immutable ``frozendict`` of the normalized key/value pairs.
    """
    return frozendict(
        {
            # Bind the parsed value once (the original computed it twice:
            # once for the filter and once for the stored value).
            k: value
            for k, v in d.items()
            if (value := csv_import.csv_str_to_value(str(v))) is not None
        },
    )
def extend_csv_file(
    csv_file: Path,
    new_dicts: list[dict[str, Any] | frozendict[str, Any]],
    deduplicate_mode: data.DeduplicateMode,
    deduplicate_ignore_columns: list[str],
) -> dict:
    """Merge *new_dicts* into *csv_file*, deduplicate, and rewrite the file.

    Existing rows are loaded (a missing or unparsable file starts empty),
    normalized together with the new rows, deduplicated, and written back
    atomically from an in-memory buffer.

    Args:
        csv_file: Path of the CSV file to extend.
        new_dicts: Rows to append before deduplication.
        deduplicate_mode: Strategy passed to ``deduplicate_dicts``.
        deduplicate_ignore_columns: Columns excluded from equality checks.

    Returns:
        Summary dict with keys ``extended``, ``input_lines``,
        ``output_lines`` and ``dicts``.

    Raises:
        TypeError: if *deduplicate_ignore_columns* is not a list.
    """
    if not isinstance(deduplicate_ignore_columns, list):
        raise TypeError(deduplicate_ignore_columns)

    try:
        original_dicts = csv_import.load_csv_file(csv_file)
    except (FileNotFoundError, _csv.Error):
        logger.info('Creating file: %s', csv_file)
        original_dicts = []

    original_num_dicts = len(original_dicts)

    dicts = [normalize_dict(d) for d in original_dicts]
    dicts.extend(normalize_dict(d) for d in new_dicts)
    del new_dicts

    dicts, fieldnames = deduplicate_dicts(
        dicts,
        deduplicate_mode,
        deduplicate_ignore_columns,
    )

    # Render the full CSV in memory first so a failure mid-serialization
    # cannot leave a truncated file on disk.
    buffer = io.StringIO()
    writer = csv.DictWriter(
        buffer,
        fieldnames=fieldnames,
        dialect=csv_import.CSV_DIALECT,
    )
    writer.writeheader()
    for row in dicts:
        writer.writerow({k: csv_safe_value(v) for k, v in row.items()})
    output_csv = buffer.getvalue()

    csv_file.parent.mkdir(parents=True, exist_ok=True)
    with open(csv_file, 'w') as csvfile:
        csvfile.write(output_csv)

    logger.info(
        'Extended CSV "%s" from %d to %d lines',
        csv_file,
        original_num_dicts,
        len(dicts),
    )
    return {
        'extended': original_num_dicts != len(dicts),
        'input_lines': original_num_dicts,
        'output_lines': len(dicts),
        'dicts': dicts,
    }