diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..25b8796
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+
+/__pycache__/
+
diff --git a/schemeld.py b/schemeld.py
index d935248..d5bc4ff 100644
--- a/schemeld.py
+++ b/schemeld.py
@@ -1,5 +1,14 @@
+import logging
 import urllib.parse
+import wikidata.entity
+import datetime
+from dataclasses import dataclass
+from enforce_typing import enforce_types
+from typing import List, Set, Optional, Union
+from enum import Enum
+
+STRICT_VALIDATION = True
 
 
 class Concept(object):
@@ -10,10 +19,27 @@ class Concept(object):
     def get(self, key, *args, **kwargs):
         return self.data.get(self.canonical_key(key, *args, **kwargs))
 
+    def keys(self):
+        return self.data.keys()
+
+    def setdefault(self, key, value):
+        return self.data.setdefault(self.canonical_key(key), value)
+
+    def to_dict(self):
+        return {k:v for k,v in self.data.items()}
+
     def __getitem__(self, key):
         return self.data[self.canonical_key(key)]
 
     def __setitem__(self, key, value):
+        if STRICT_VALIDATION:
+            if not isinstance(key, str) or key != '@id':
+                assert isinstance(value, list), value
+                for v in value:
+                    assert isinstance(v, dict), value
+                    assert 'value' in v, value
+                    for subk in v:
+                        assert not isinstance(v[subk], list), value
         self.data[self.canonical_key(key)] = value
 
     def __contains__(self, key):
@@ -23,13 +49,24 @@ class Concept(object):
         del self.data[self.canonical_key(key)]
 
     def canonical_key(self, key):
+        if isinstance(key, urllib.parse.ParseResult):
+            return key
         if not isinstance(key, str):
             return key
         elif key.startswith('@'):
             return key
-        elif key.startswith(self.context):
+        if self.context is None:
             return key
-        return self.context + key
+        return self.context._replace(path = key)
+
+    def __repr__(self):
+        if id := self.data.get('@id'):
+            return 'Concept {{ @id = {} }}'.format(id)
+
+        return 'Concept '+str(self.data)
+
+    def __str__(self):
+        return repr(self)
 
 def determine_concepts_internal(json, context, outputs):
     if isinstance(json, list):
@@ -37,9 +74,10 @@
             determine_concepts_internal(m, context, outputs)
         return
 
-    assert isinstance(json, dict)
-    context = json.get('@context', context)
-    assert urllib.parse.urlparse(context).netloc == 'schema.org'
+    assert isinstance(json, dict), type(json)
+    context = json.get('@context', context)
+    if isinstance(context, str):
+        context = urllib.parse.urlparse(context)
+    assert context.netloc == 'schema.org'
+
     if '@graph' in json:
         determine_concepts_internal(json['@graph'], context, outputs)
     else:
@@ -49,3 +87,113 @@ def determine_concepts(json):
     concepts = []
     determine_concepts_internal(json, '', concepts)
     return concepts
+
+REFERENCE_PROPERTIES = {'P813', 'P854', 'P248', 'P143'}
+
+def fmt_value(c, prefer_reference = False):
+    if isinstance(c, str):
+        return '"{}"'.format(c) # TODO: Escape
+    elif isinstance(c, Concept):
+        if '@id' in c:
+            return fmt_value(c['@id'], prefer_reference)
+        else:
+            logging.error('Could not determine useful id for %s', c)
+            return ''
+    elif isinstance(c, wikidata.entity.Entity):
+        s = c.id
+        if isinstance(s, int):
+            s = 'P{}'.format(s)
+        if s in REFERENCE_PROPERTIES:
+            s = s.replace('P', 'S', 1)
+        return s
+    elif isinstance(c, urllib.parse.ParseResult):
+        return c.geturl() if prefer_reference else fmt_value(c.geturl(), prefer_reference)
+    elif isinstance(c, datetime.datetime):
+        return '+{}Z/11'.format(c.isoformat())
+    elif isinstance(c, datetime.date):
+        return '+{}T00:00:00Z/11'.format(c.isoformat())
+
+    return str(c)
+
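+# Editor's note (illustrative sketch, not part of the original change): the
+# QuickStatements value syntax fmt_value is expected to emit, assuming naive
+# datetimes and a wikidata.client.Client() named `client`:
+#   fmt_value('Douglas Adams')             -> '"Douglas Adams"'
+#   fmt_value(client.get('Q42'))           -> 'Q42'
+#   fmt_value(datetime.date(1952, 3, 11))  -> '+1952-03-11T00:00:00Z/11'
+#   fmt_value(urllib.parse.urlparse('https://schema.org/name'), prefer_reference = True)
+#                                          -> 'https://schema.org/name'
+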
+def fmt_predicate(pred, object):
+    if isinstance(pred, urllib.parse.ParseResult) and pred.netloc == 'schema.org':
+        lang = object.get('__language') or 'en'
+        if pred.path == '/name':
+            return 'L'+lang
+        elif pred.path == '/alternateName':
+            return 'A'+lang
+        elif pred.path == '/description':
+            return 'D'+lang
+        elif pred.path == '/sameAs':
+            return 'S{}wiki'.format(lang)
+        else:
+            assert False, pred
+    return fmt_value(pred, prefer_reference = True)
+
+def to_quickstatements_v1_item(subject, lines, skip_impossible = True, skip_already_synchronized = True):
+    #assert '@id' not in subject, 'TODO: Linked subjects'
+    subject_id = fmt_value(subject, True) if '@id' in subject else 'LAST'
+
+    if subject_id == 'LAST':
+        lines.append(['CREATE'])
+
+    def fmt_key_value_pair(v, line):
+        if isinstance(v, list):
+            for e in v:
+                fmt_key_value_pair(e, line)
+            return
+        elif isinstance(v, dict) and 'value' in v:
+            line.append(fmt_value(v['value']))
+            for sub_k, sub_v in v.items():
+                if sub_k is None or sub_v is None:
+                    continue
+                if not isinstance(sub_k, str):
+                    line.append(fmt_predicate(sub_k, sub_v))
+                    line.append(fmt_value(sub_v))
+        else:
+            line.append(fmt_value(v))
+
+    for predicate, pred_objects in subject.data.items():
+        if isinstance(predicate, str) and (predicate == '@id' or predicate.startswith('__')):
+            continue
+
+        assert isinstance(pred_objects, list)
+        for pred_object in pred_objects:
+            if pred_object.get('__synchronized_with_wikidata', False) and skip_already_synchronized:
+                continue
+            predicate_str = fmt_predicate(predicate, pred_object)
+            line = [subject_id, predicate_str]
+            fmt_key_value_pair(pred_object, line)
+
+            if skip_impossible and predicate_str.startswith('"'):
+                logging.warning('Bad line: %s (Lines must not start with ")', predicate_str)
+                continue
+            if '' in line and skip_impossible:
+                logging.warning('Bad line: %s (Lines must not contain empty names)', line)
+                continue
+            assert 'None' not in line, line
+            lines.append(line)
+
+def to_quickstatements_v1(concepts):
+    if isinstance(concepts, Concept):
+        concepts = [concepts]
+
+    lines = []
+
+    for concept in concepts:
+        to_quickstatements_v1_item(concept, lines)
+
+    logging.info("Produced %s statements for %s concepts", len(lines), len(concepts))
+    commands = '\n'.join(['\t'.join(l) for l in lines])
+
+    assert '\tNone\t' not in commands, 'TODO'
+    return commands
+
+def commands_to_quickstatements_v1_url(commands):
+    url = commands.replace('\t', '|').replace('\n', '||')
+    url = urllib.parse.quote(url, safe = '')
+    return 'https://quickstatements.toolforge.org/#/v1=' + url
+
+def to_quickstatements_v1_url(concepts):
+    return commands_to_quickstatements_v1_url(to_quickstatements_v1(concepts))
+
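+# Usage sketch (editor's illustration, not part of the original change): given a
+# populated Concept whose predicates are schema.org ParseResults or
+# wikidata.entity.Entity objects and whose objects are lists of {'value': ...}
+# dicts, the export chain is roughly:
+#
+#   commands = to_quickstatements_v1(concept)   # tab/newline separated commands
+#   url = to_quickstatements_v1_url(concept)    # https://quickstatements.toolforge.org/#/v1=...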
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..c35d427
--- /dev/null
+++ b/test.py
@@ -0,0 +1,15 @@
+
+import schemeld
+import wikidata.client
+import wikidata_ext
+
+if __name__ == '__main__':
+
+    client = wikidata.client.Client()
+
+    EQV_PROPERTY = client.get('P1628')
+    schema_root = "https://schema.org/"
+    schema_prop = "image"
+
+    triples = wikidata_ext.get_triples(client, predicate = EQV_PROPERTY, object = "{}{}".format(schema_root, schema_prop))
+
diff --git a/wikidata.py b/wikidata.py
deleted file mode 100644
index b01d4a9..0000000
--- a/wikidata.py
+++ /dev/null
@@ -1,26 +0,0 @@
-
-def get_triples(client, subject = None, predicate = None, object = None):
-    time.sleep(1)
-    params = {
-        'subject': fmt_triple_value(subject),
-        'predicate': fmt_triple_value(predicate),
-        'object': fmt_triple_value(object),
-        'page': 1,
-    }
-    headers = {'accept': 'application/ld+json'}
-    result = requests.get('https://query.wikidata.org/bigdata/ldf',
-            params = params,
-            headers = headers,
-            )
-
-    triples = []
-    if result.status_code != 200:
-        logging.error('Got %s error message: %s', result.status_code, repr((subject, predicate, object)))
-        return []
-    for item in result.json()['@graph']:
-        if item['@id'].startswith('wd:') and predicate.id in item:
-            s = item['@id'][3:]
-            triples.append((client.get(s, load = False), predicate, object))
-
-    return triples
-
diff --git a/wikidata_ext.py b/wikidata_ext.py
new file mode 100644
index 0000000..9e80502
--- /dev/null
+++ b/wikidata_ext.py
@@ -0,0 +1,114 @@
+
+import ratelimit
+import urllib.parse
+import wikidata.entity
+import requests
+import json
+import logging
+
+def concept_uri(obj):
+    assert isinstance(obj, wikidata.entity.Entity), obj
+    if obj.id.startswith('P'):
+        return urllib.parse.urlparse('http://www.wikidata.org/prop/direct/{}'.format(obj.id))
+    elif obj.id.startswith('Q'):
+        return urllib.parse.urlparse('http://www.wikidata.org/entity/{}'.format(obj.id))
+    else:
+        assert False, "TODO: " + obj.id
+
+def fmt_triple_value(obj, prefer_obj = False):
+    if obj is None:
+        return ''
+    if isinstance(obj, str):
+        return '"{}"'.format(obj)
+    elif isinstance(obj, urllib.parse.ParseResult):
+        return obj.geturl() if prefer_obj else fmt_triple_value(obj.geturl())
+    elif isinstance(obj, wikidata.entity.Entity):
+        uri = concept_uri(obj)
+        return fmt_triple_value(uri, prefer_obj)
+    else:
+        assert False, type(obj)
+
+@ratelimit.sleep_and_retry
+@ratelimit.limits(calls=10, period=60)
+def fetch_by_url(url, headers):
+    logging.debug('Fetching: %s', url)
+    result = requests.get(url, headers = headers)
+    if result.status_code != 200:
+        logging.error('Got HTTP %s error when fetching %s', result.status_code, url)
+        return None
+    return result.text
+
+ITEMS_PER_PAGE = "http://www.w3.org/ns/hydra/core#itemsPerPage"
+TOTAL_ITEMS = "http://www.w3.org/ns/hydra/core#totalItems"
+
+def fmt_params(subject, predicate, object):
+    derp = [x for x in [subject, predicate, object] if x]
+    assert len(derp) >= 1, str(derp)
+    params = {
+        'subject': fmt_triple_value(subject, prefer_obj = True),
+        'predicate': fmt_triple_value(predicate, prefer_obj = True),
+        'object': fmt_triple_value(object),
+        'page': 1,
+    }
+    return params
+
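+# Editor's note (illustrative): with subject=None, predicate=<Entity P1628> and
+# object='https://schema.org/image', fmt_params yields roughly
+#   {'subject': '', 'predicate': 'http://www.wikidata.org/prop/direct/P1628',
+#    'object': '"https://schema.org/image"', 'page': 1}
+# i.e. entities become full URIs, plain strings become quoted literals, and absent
+# terms become empty strings for the Linked Data Fragments endpoint.
+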
+def get_triples_count(subject = None, predicate = None, object = None):
+    '''
+    Fetches the first page in order to determine the number of items.
+    '''
+    params = fmt_params(subject, predicate, object)
+    url = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = params).prepare().url
+    result = fetch_by_url(url, headers = {'accept': 'application/ld+json'})
+    json_data = json.loads(result)
+    for item in json_data['@graph']:
+        if TOTAL_ITEMS in item:
+            return {
+                'items_per_page': item[ITEMS_PER_PAGE],
+                'items_total': item[TOTAL_ITEMS],
+                'num_pages': int((item[TOTAL_ITEMS] - 1) / item[ITEMS_PER_PAGE] + 1),
+            }
+    assert False
+
+def get_triples_internal(subject, predicate, object):
+    params = fmt_params(subject, predicate, object)
+    pagination_data = get_triples_count(subject, predicate, object)
+    for current_page in range(1, pagination_data['num_pages']+1):
+        params['page'] = current_page
+        url = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = params).prepare().url
+        result = fetch_by_url(url, headers = {'accept': 'application/ld+json'})
+        json_data = json.loads(result)
+
+        for item in json_data['@graph']:
+            if item['@id'].startswith('_:b'):
+                continue
+            if item['@id'].startswith('https://query.wikidata.org/bigdata/ldf'):
+                continue
+            if item['@id'].startswith('http://www.wikidata.org/.well-known/'):
+                continue
+            yield item
+
+        # Bookkeeping
+        del url, result, json_data
+
+SCHEMA_ABOUT = urllib.parse.urlparse('http://schema.org/about')
+
+def get_wikidata_concept_for_wikipedia_page(client, wikipage):
+    triples = get_triples_internal(wikipage, SCHEMA_ABOUT, None)
+    triples = list(triples)
+    for item in triples:
+        s = item['about'][3:]
+        return client.get(s, load = False)
+
+def get_triples(client, subject = None, predicate = None, object = None):
+    triples = []
+    iterator = get_triples_internal(subject, predicate, object)
+    for item in iterator:
+        is_looking_for = item['@id'].startswith('wd:') and predicate.id in item
+        if is_looking_for:
+            s = subject
+            if s is None:
+                s = client.get(item['@id'][3:], load = False)
+            o = object or item[predicate.id]
+            yield (s, predicate, o)
+        del item, is_looking_for
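+
+# Usage sketch (editor's illustration; mirrors test.py, so P1628 and the schema.org
+# URL are only examples):
+#
+#   import wikidata.client
+#   client = wikidata.client.Client()
+#   eqv = client.get('P1628')
+#   for s, p, o in get_triples(client, predicate = eqv, object = 'https://schema.org/image'):
+#       print(s, p, o)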