1
0
datagraph/datagraph/parse.py
2024-07-08 18:54:14 +02:00

37 lines
975 B
Python

import json
import urllib
import urllib.parse

import schemeld
def determine_concepts_internal(json, context, outputs):
    """Recursively collect schema.org concepts from decoded JSON-LD data.

    Args:
        json: Decoded JSON-LD value: either a dict (a single node or a
            ``@graph`` container) or a list of such values.
        context: Context inherited from the enclosing node. Either a URL
            string or the already-parsed URL handed down by a parent call.
            It is used only when the node has no ``@context`` of its own.
        outputs: List that receives one ``schemeld.Concept`` per node that
            has no ``@graph`` key. Mutated in place.

    Raises:
        AssertionError: If a node is neither a list nor a dict, or if its
            effective context does not resolve to the ``schema.org`` host.
    """
    if isinstance(json, list):
        for member in json:
            determine_concepts_internal(member, context, outputs)
        return

    assert isinstance(json, dict), type(json)

    # Members of a '@graph' may omit '@context' and inherit the parent's
    # context, which has already been parsed. Only raw strings are parsed
    # here; feeding a parsed result back into urlparse() raises.
    context = json.get('@context', context)
    if isinstance(context, str):
        context = urllib.parse.urlparse(context)

    # getattr() makes unsupported context shapes (dict, list, None) fail this
    # check rather than raise an unrelated AttributeError.
    assert getattr(context, 'netloc', None) == 'schema.org', context

    if '@graph' in json:
        determine_concepts_internal(json['@graph'], context, outputs)
    else:
        outputs.append(schemeld.Concept(context, json))
def determine_concepts(json):
    """Return the list of concepts found in decoded JSON-LD data.

    Starts the recursive walk with an empty inherited context, so every
    node must obtain its context from an ``@context`` key on itself or on
    an enclosing node.
    """
    collected = []
    determine_concepts_internal(json, context='', outputs=collected)
    return collected
def determine_concepts_in_soup(soup):
    """Collect concepts from every JSON-LD ``<script>`` element in *soup*.

    Args:
        soup: Parsed document exposing ``find_all``; presumably a
            BeautifulSoup object (TODO: confirm against callers).

    Returns:
        A list holding the concepts of all ``application/ld+json`` script
        elements, in the order ``find_all`` yields them.

    Raises:
        json.JSONDecodeError: If a non-empty script payload is not valid
            JSON.
    """
    # TODO: Check type
    ld_json_elements = soup.find_all('script', type='application/ld+json')
    concepts = []
    for element in ld_json_elements:
        payload = element.string
        # `.string` can be None (e.g. an empty element); json.loads(None)
        # would raise TypeError. A blank payload carries no concepts, so
        # skip it.
        if payload is None or not payload.strip():
            continue
        concepts.extend(determine_concepts(json.loads(payload)))
    return concepts