1
0

Improved normalization to prevent weird transformations
All checks were successful
Run Python tests (through Pytest) / Test (push) Successful in 34s
Verify Python project can be installed, loaded and have version checked / Test (push) Successful in 30s

This commit is contained in:
Jon Michael Aanes 2025-02-01 20:11:50 +01:00
parent 9d528d4cfd
commit d2916cbc28
3 changed files with 19 additions and 21 deletions

View File

@@ -17,6 +17,18 @@ csv.register_dialect(CSV_DIALECT, lineterminator='\n', skipinitialspace=True)
T = typing.TypeVar('T') T = typing.TypeVar('T')
def csv_safe_value(v: Any) -> str:
if isinstance(v, urllib.parse.ParseResult):
return v.geturl()
if isinstance(v, datetime.datetime):
if v.tzinfo is None or v.tzinfo != datetime.UTC:
msg = f'Timezone must be UTC: {v}'
raise ValueError(msg)
if isinstance(v, urllib.parse.ParseResult):
return v.geturl()
return str(v)
def try_value(fn: Callable[[str], T], s: str) -> T | None: def try_value(fn: Callable[[str], T], s: str) -> T | None:
try: try:
return fn(s) return fn(s)

View File

@@ -32,9 +32,10 @@ class MyAnimeList(Scraper):
for data_item in data_items: for data_item in data_items:
print(data_item) print(data_item)
yield { yield {
'series.name': data_item.get('anime_title_eng') or data_item.get('anime_title'), 'series.name_eng': data_item.get('anime_title_eng') or data_item.get('anime_title'),
'series.myanimelist_url': urllib.parse.urljoin(url, data_item['anime_url']), 'series.name': data_item.get('anime_title') or data_item.get('anime_title_eng'),
'series.icon': urllib.parse.urljoin(url, data_item['anime_image_path']), 'series.myanimelist_url': urllib.parse.urlparse(urllib.parse.urljoin(url, data_item['anime_url'])),
'series.icon': urllib.parse.urlparse(urllib.parse.urljoin(url, data_item['anime_image_path'])),
'me.score': data_item.get('score'), 'me.score': data_item.get('score'),
} }

View File

@@ -15,16 +15,6 @@ from . import csv_import, data
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def csv_safe_value(v: Any) -> str:
if isinstance(v, urllib.parse.ParseResult):
return v.geturl()
if isinstance(v, datetime.datetime):
if v.tzinfo is None or v.tzinfo != datetime.UTC:
msg = f'Timezone must be UTC: {v}'
raise ValueError(msg)
return str(v)
def equals_without_fields( def equals_without_fields(
a: Mapping[str, Any], a: Mapping[str, Any],
b: Mapping[str, Any], b: Mapping[str, Any],
@@ -99,13 +89,8 @@ def deduplicate_dicts(
def normalize_dict(d: dict[str, Any] | frozendict[str, Any]) -> frozendict[str, Any]:
    """Normalize *d* by round-tripping every value through the CSV codec.

    Each value is serialized with `csv_import.csv_safe_value` and parsed back
    with `csv_import.csv_str_to_value`, so the result holds only values in
    their canonical CSV representation (e.g. URLs as `ParseResult`, datetimes
    as aware UTC).  Keys whose round-trip yields `None` are dropped.

    The round-trip is computed once per entry; the earlier implementation
    called `csv_str_to_value(str(v))` twice per entry (once to filter, once to
    store) and bypassed `csv_safe_value` entirely.
    """
    safe_values = [
        (k, csv_import.csv_str_to_value(csv_import.csv_safe_value(v)))
        for k, v in d.items()
    ]
    return frozendict({k: v for k, v in safe_values if v is not None})
def extend_csv_file( def extend_csv_file(
@@ -145,7 +130,7 @@ def extend_csv_file(
) )
writer.writeheader() writer.writeheader()
for d in dicts: for d in dicts:
writable_d = {k: csv_safe_value(v) for k, v in d.items()} writable_d = {k: csv_import.csv_safe_value(v) for k, v in d.items()}
writer.writerow(writable_d) writer.writerow(writable_d)
del d, writable_d del d, writable_d
output_csv = csvfile_in_memory.getvalue() output_csv = csvfile_in_memory.getvalue()