1
0
datagraph/parse.py

36 lines
974 B
Python
Raw Normal View History

import schemeld
2023-03-07 22:14:09 +00:00
import urllib
2023-09-17 10:09:17 +00:00
import json
def determine_concepts_internal(json, context, outputs):
    """Recursively collect concepts from a decoded JSON-LD value.

    Lists are walked element by element.  Dicts are treated as JSON-LD
    nodes: a node carrying ``@graph`` is descended into, any other node is
    wrapped in ``schemeld.Concept`` and appended to ``outputs``.

    Args:
        json: Decoded JSON-LD data; must be a ``list`` or a ``dict``.
            (The name shadows the ``json`` module inside this function.)
        context: Context inherited from the enclosing node.  Either a URL
            string (``''`` at the top level) or the already-parsed
            ``urllib.parse.ParseResult`` handed down by a parent call.
        outputs: List that found concepts are appended to, in place.

    Raises:
        AssertionError: If ``json`` is neither a list nor a dict, or if the
            effective context does not point at ``schema.org``.
    """
    # `import urllib` alone does not guarantee the `parse` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.parse

    if isinstance(json, list):
        for m in json:
            determine_concepts_internal(m, context, outputs)
        return

    assert isinstance(json, dict), type(json)

    # A node's own '@context' overrides the inherited one.  The inherited
    # value may already be a ParseResult (passed down from a parent node);
    # re-parsing that would raise, so only parse plain strings.
    context = json.get('@context', context)
    if isinstance(context, str):
        context = urllib.parse.urlparse(context)
    assert getattr(context, 'netloc', None) == 'schema.org', context

    if '@graph' in json:
        # Children of '@graph' inherit this node's parsed context.
        determine_concepts_internal(json['@graph'], context, outputs)
    else:
        outputs.append(schemeld.Concept(context, json))
def determine_concepts(json):
    """Return the list of concepts found in decoded JSON-LD data.

    Starts the recursive walk with an empty context string and a fresh
    result list, then hands that list back to the caller.
    """
    collected = []
    determine_concepts_internal(json, context='', outputs=collected)
    return collected
2023-09-17 10:09:17 +00:00
def determine_concepts_in_soup(soup):
    """Extract concepts from every JSON-LD ``<script>`` element in ``soup``.

    Args:
        soup: Object exposing ``find_all(name, type=...)`` whose results
            have a ``.string`` attribute (presumably a BeautifulSoup
            document -- TODO confirm / check type).

    Returns:
        A list with the concepts of all ``application/ld+json`` scripts,
        in document order.

    Raises:
        json.JSONDecodeError: If a non-empty script body is not valid JSON.
    """
    # TODO: Check type
    ld_json_elements = soup.find_all('script', type="application/ld+json")
    concepts = []
    for e in ld_json_elements:
        payload = e.string
        # `.string` is None for an empty (or multi-child) tag; there is
        # nothing to decode then, and json.loads(None) would raise TypeError.
        if payload is None or not payload.strip():
            continue
        json_data = json.loads(payload)
        concepts.extend(determine_concepts(json_data))
    return concepts