import json
import urllib.parse

import schemeld


def determine_concepts_internal(json, context, outputs):
    """Recursively collect concepts from decoded JSON-LD data.

    Lists are walked element by element. A dict containing an ``@graph`` key
    is descended into; any other dict is wrapped in a ``schemeld.Concept``
    and appended to ``outputs``.

    Args:
        json: Decoded JSON-LD value: a dict, or a (possibly nested) list of
            dicts. The name shadows the ``json`` module inside this function;
            it is kept so existing keyword callers continue to work.
        context: Context inherited from the enclosing node, used when a dict
            has no ``@context`` key of its own. Either a URL string or an
            already-parsed ``urllib.parse.ParseResult``.
        outputs: List that receives the created concepts; mutated in place.

    Raises:
        AssertionError: If a non-list value is not a dict, or if the host of
            the effective context is not ``schema.org``.
    """
    if isinstance(json, list):
        for member in json:
            determine_concepts_internal(member, context, outputs)
        return

    assert isinstance(json, dict), type(json)

    effective_context = json.get('@context', context)
    # When recursing into '@graph' the inherited context is already a
    # ParseResult; urlparse() only accepts str/bytes, so parse strings only.
    if isinstance(effective_context, urllib.parse.ParseResult):
        context = effective_context
    else:
        context = urllib.parse.urlparse(effective_context)
    assert context.netloc == 'schema.org', context

    if '@graph' in json:
        determine_concepts_internal(json['@graph'], context, outputs)
    else:
        outputs.append(schemeld.Concept(context, json))


def determine_concepts(json):
    """Return a new list with every concept found in decoded JSON-LD data.

    Args:
        json: Decoded JSON-LD value (dict or list of dicts). No context is
            inherited from outside, so each top-level dict must supply its
            own ``@context``.

    Returns:
        List of ``schemeld.Concept`` objects, in traversal order.

    Raises:
        AssertionError: Propagated from ``determine_concepts_internal``.
    """
    concepts = []
    determine_concepts_internal(json, '', concepts)
    return concepts


def determine_concepts_in_soup(soup):
    """Extract concepts from every ``application/ld+json`` script in ``soup``.

    Args:
        soup: Object exposing ``find_all``; presumably a BeautifulSoup
            document -- TODO confirm against callers.

    Returns:
        List of ``schemeld.Concept`` objects gathered from all matching
        script elements, in document order.

    Raises:
        json.JSONDecodeError: If a non-empty script body is not valid JSON.
        AssertionError: Propagated from ``determine_concepts``.
    """
    # TODO: Check type
    concepts = []
    for element in soup.find_all('script', type="application/ld+json"):
        text = element.string
        # An element without a single text child yields None; skip it (and
        # blank bodies) instead of letting json.loads abort the whole page.
        if text is None or not text.strip():
            continue
        concepts.extend(determine_concepts(json.loads(text)))
    return concepts