37 lines
975 B
Python
37 lines
975 B
Python
import json
|
|
import urllib
|
|
|
|
import schemeld
|
|
|
|
|
|
def determine_concepts_internal(json, context, outputs):
    """Recursively collect concepts from a decoded JSON-LD value.

    Lists are walked element by element. A dict either carries a
    ``'@graph'`` key, in which case its value is walked with the dict's
    context, or it is wrapped as ``schemeld.Concept(context, json)`` and
    appended to ``outputs``.

    Args:
        json: Decoded JSON value; must be a ``list`` or a ``dict``.
            (The name shadows the ``json`` module inside this function.)
        context: Context inherited from the enclosing node. Either a URL
            string, or the already-parsed result handed down by a parent
            call. A dict's own ``'@context'`` entry takes precedence.
        outputs: List that receives the created ``schemeld.Concept``
            objects; mutated in place.

    Raises:
        AssertionError: If a non-list value is not a dict, or if the
            effective context's network location is not ``schema.org``.
    """
    # Imported here because the file's plain ``import urllib`` does not
    # guarantee that the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import ParseResult, urlparse

    if isinstance(json, list):
        for m in json:
            determine_concepts_internal(m, context, outputs)
        return

    assert isinstance(json, dict), type(json)

    context = json.get('@context', context)
    # A context inherited from a parent call is already parsed; feeding a
    # ParseResult back into urlparse() raises AttributeError, which broke
    # '@graph' members that do not repeat '@context' themselves.
    if not isinstance(context, ParseResult):
        context = urlparse(context)
    assert context.netloc == 'schema.org'

    if '@graph' in json:
        determine_concepts_internal(json['@graph'], context, outputs)
    else:
        outputs.append(schemeld.Concept(context, json))
|
|
|
|
|
|
def determine_concepts(json):
    """Return all concepts found in a decoded JSON value.

    Entry point for ``determine_concepts_internal``: the walk starts with
    an empty string as the inherited context and a fresh result list.

    Args:
        json: Decoded JSON value (a list or a dict) to search.

    Returns:
        A new list holding whatever ``determine_concepts_internal``
        appended while walking ``json``.
    """
    found = []
    determine_concepts_internal(json, '', found)
    return found
|
|
|
|
|
|
def determine_concepts_in_soup(soup):
    """Extract concepts from every ``application/ld+json`` script in ``soup``.

    Args:
        soup: Object exposing ``find_all(name, type=...)`` whose results
            have a ``.string`` attribute -- presumably a BeautifulSoup
            document; TODO confirm against callers.

    Returns:
        A list with the concepts of all matching script elements, in the
        order the elements are returned by ``find_all``.

    Raises:
        json.JSONDecodeError: If a non-empty script body is not valid JSON.
    """
    # TODO: Check type
    ld_json_elements = soup.find_all('script', type='application/ld+json')
    concepts = []
    for e in ld_json_elements:
        raw = e.string
        # An element with no text payload (``.string`` is None) or only
        # whitespace carries no data; json.loads() would raise on it and
        # abort extraction for the whole document, so skip it instead.
        if raw is None or not raw.strip():
            continue
        json_data = json.loads(raw)
        concepts.extend(determine_concepts(json_data))
    return concepts
|