More robust url parsing
This commit is contained in:
parent
8ff6c84ae0
commit
4b6c998de5
|
@ -1,17 +1,23 @@
|
|||
import json
|
||||
import urllib
|
||||
import urllib.parse
|
||||
|
||||
import schemeld
|
||||
|
||||
|
||||
def determine_concepts_internal(json, context, outputs):
|
||||
def parse_url(url: str | urllib.parse.ParseResult) -> urllib.parse.ParseResult:
|
||||
if isinstance(url, urllib.parse.ParseResult):
|
||||
return url
|
||||
return urllib.parse.urlparse(url)
|
||||
|
||||
|
||||
def determine_concepts_internal(json: dict | list, context, outputs: list[schemeld.Concept]) -> None:
|
||||
if isinstance(json, list):
|
||||
for m in json:
|
||||
determine_concepts_internal(m, context, outputs)
|
||||
return
|
||||
|
||||
assert isinstance(json, dict), type(json)
|
||||
context = urllib.parse.urlparse(json.get('@context', context))
|
||||
context = parse_url(json.get('@context', context))
|
||||
assert context.netloc == 'schema.org'
|
||||
|
||||
if '@graph' in json:
|
||||
|
@ -20,13 +26,13 @@ def determine_concepts_internal(json, context, outputs):
|
|||
outputs.append(schemeld.Concept(context, json))
|
||||
|
||||
|
||||
def determine_concepts(json):
|
||||
def determine_concepts(json: dict | list) -> list[schemeld.Concept]:
    """Collect the concepts found in *json* into a new list.

    Args:
        json: A dict or a list, handed unchanged to
            ``determine_concepts_internal``.

    Returns:
        A fresh list that ``determine_concepts_internal`` was given as its
        output accumulator.
    """
    concepts: list[schemeld.Concept] = []
    # The traversal starts with an empty context string; the helper fills
    # ``concepts`` in place and returns nothing.
    determine_concepts_internal(json, '', concepts)
    return concepts
|
||||
|
||||
|
||||
def determine_concepts_in_soup(soup):
|
||||
def determine_concepts_in_soup(soup) -> list[schemeld.Concept]:
|
||||
# TODO: Check type
|
||||
ld_json_elements = soup.find_all('script', type='application/ld+json')
|
||||
concepts = []
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import Any
|
|||
STRICT_VALIDATION = True
|
||||
|
||||
Key = int | str | urllib.parse.ParseResult
|
||||
Context = str # TODO
|
||||
Context = urllib.parse.ParseResult # TODO
|
||||
|
||||
|
||||
def canonical_keys(base_key: Key, context: Context | None) -> list[Any]:
|
||||
|
|
Loading…
Reference in New Issue
Block a user