1
0

More robust url parsing
Some checks failed
Verify Python project can be installed, loaded and have version checked / Test (push) Waiting to run
Run Python tests (through Pytest) / Test (push) Has been cancelled

This commit is contained in:
Jon Michael Aanes 2025-05-26 18:44:25 +02:00
parent 8ff6c84ae0
commit 4b6c998de5
2 changed files with 12 additions and 6 deletions

View File

@@ -1,17 +1,23 @@
import json
import urllib
import urllib.parse
import schemeld
def determine_concepts_internal(json, context, outputs):
def parse_url(url: str | urllib.parse.ParseResult) -> urllib.parse.ParseResult:
if isinstance(url, urllib.parse.ParseResult):
return url
return urllib.parse.urlparse(url)
def determine_concepts_internal(json: dict | list, context, outputs: list[schemeld.Concept]) -> None:
if isinstance(json, list):
for m in json:
determine_concepts_internal(m, context, outputs)
return
assert isinstance(json, dict), type(json)
context = urllib.parse.urlparse(json.get('@context', context))
context = parse_url(json.get('@context', context))
assert context.netloc == 'schema.org'
if '@graph' in json:
@@ -20,13 +26,13 @@ def determine_concepts_internal(json, context, outputs):
outputs.append(schemeld.Concept(context, json))
def determine_concepts(json):
def determine_concepts(json: dict | list) -> list[schemeld.Concept]:
    """Gather the concepts contained in ``json`` into a new list.

    The traversal is delegated to ``determine_concepts_internal``, started
    with an empty-string context; that helper appends into the fresh list
    created here, which is then returned.
    """
    found = []
    determine_concepts_internal(json, context='', outputs=found)
    return found
def determine_concepts_in_soup(soup):
def determine_concepts_in_soup(soup) -> list[schemeld.Concept]:
# TODO: Check type
ld_json_elements = soup.find_all('script', type='application/ld+json')
concepts = []

View File

@@ -5,7 +5,7 @@ from typing import Any
STRICT_VALIDATION = True
Key = int | str | urllib.parse.ParseResult
Context = str # TODO
Context = urllib.parse.ParseResult # TODO
def canonical_keys(base_key: Key, context: Context | None) -> list[Any]: