Significant additions
This commit is contained in:
parent
94fa9f10b9
commit
d39fbbfed3
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
|
||||
/__pycache__/
|
||||
|
158
schemeld.py
158
schemeld.py
|
@ -1,5 +1,14 @@
|
|||
|
||||
import logging
|
||||
import urllib.parse
|
||||
import wikidata.entity
|
||||
import datetime
|
||||
from dataclasses import dataclass
|
||||
from enforce_typing import enforce_types
|
||||
from typing import List, Set, Optional, Union
|
||||
from enum import Enum
|
||||
|
||||
STRICT_VALIDATION = True
|
||||
|
||||
class Concept(object):
|
||||
|
||||
|
@ -10,10 +19,27 @@ class Concept(object):
|
|||
def get(self, key, *args, **kwargs):
|
||||
return self.data.get(self.canonical_key(key, *args, **kwargs))
|
||||
|
||||
def keys(self):
|
||||
return self.data.keys()
|
||||
|
||||
def setdefault(self, key, value):
|
||||
return self.data.setdefault(self.canonical_key(key), value)
|
||||
|
||||
def to_dict(self):
|
||||
return {k:v for k,v in self.data.items()}
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.data[self.canonical_key(key)]
|
||||
|
||||
def __setitem__(self, key, value):
    """Store ``value`` under the canonical form of ``key``.

    While ``STRICT_VALIDATION`` is enabled, every key other than the string
    ``'@id'`` must map to a list of dicts; each dict needs a ``'value'``
    entry and may not contain lists itself. Violations raise AssertionError
    carrying the offending value.
    """
    is_id_key = isinstance(key, str) and key == '@id'
    if STRICT_VALIDATION and not is_id_key:
        assert isinstance(value, list), value
        for entry in value:
            assert isinstance(entry, dict), value
            assert 'value' in entry, value
            for field in entry:
                assert not isinstance(entry[field], list), value
    canonical = self.canonical_key(key)
    self.data[canonical] = value
|
||||
|
||||
def __contains__(self, key):
|
||||
|
@ -23,13 +49,24 @@ class Concept(object):
|
|||
del self.data[self.canonical_key(key)]
|
||||
|
||||
def canonical_key(self, key):
|
||||
if isinstance(key, urllib.parse.ParseResult):
|
||||
return key
|
||||
if not isinstance(key, str):
|
||||
return key
|
||||
elif key.startswith('@'):
|
||||
return key
|
||||
elif key.startswith(self.context):
|
||||
if self.context is None:
|
||||
return key
|
||||
return self.context + key
|
||||
return self.context._replace(path = key)
|
||||
|
||||
def __repr__(self):
|
||||
if id := self.data.get('@id'):
|
||||
return 'Concept {{ @id = {} }}'.format(id)
|
||||
|
||||
return 'Concept '+str(self.data)
|
||||
|
||||
def __str__(self):
|
||||
return repr(self)
|
||||
|
||||
def determine_concepts_internal(json, context, outputs):
|
||||
if isinstance(json, list):
|
||||
|
@ -37,9 +74,10 @@ def determine_concepts_internal(json, context, outputs):
|
|||
determine_concepts_internal(m, context, outputs)
|
||||
return
|
||||
|
||||
assert isinstance(json, dict)
|
||||
context = json.get('@context', context)
|
||||
assert urllib.parse.urlparse(context).netloc == 'schema.org'
|
||||
assert isinstance(json, dict), type(json)
|
||||
context = urllib.parse.urlparse(json.get('@context', context))
|
||||
assert context.netloc == 'schema.org'
|
||||
|
||||
if '@graph' in json:
|
||||
determine_concepts_internal(json['@graph'], context, outputs)
|
||||
else:
|
||||
|
@ -49,3 +87,113 @@ def determine_concepts(json):
|
|||
concepts = []
|
||||
determine_concepts_internal(json, '', concepts)
|
||||
return concepts
|
||||
|
||||
# Property ids that fmt_value() emits with an 'S' prefix instead of 'P'.
REFERENCE_PROPERTIES = {'P143', 'P248', 'P813', 'P854'}
|
||||
|
||||
def fmt_value(c, prefer_reference = False):
    """Render ``c`` as a single field of a QuickStatements line.

    * ``str``: wrapped in double quotes (no escaping is performed).
    * ``Concept``: its ``@id`` formatted recursively; an empty string
      (plus an error log entry) when it has no ``@id``.
    * wikidata entity: its id; integer ids get a ``P`` prefix, and ids
      listed in ``REFERENCE_PROPERTIES`` have their first ``P`` replaced
      by ``S``.
    * parsed URL: the bare URL when ``prefer_reference`` is true,
      otherwise the URL as a quoted string.
    * ``datetime`` / ``date``: ``+<isoformat>`` with a ``/11`` suffix.
    * anything else: ``str(c)``.
    """
    if isinstance(c, str):
        return f'"{c}"'  # TODO: Escape

    if isinstance(c, Concept):
        if '@id' not in c:
            logging.error('Could not determine useful id for %s', c)
            return ''
        return fmt_value(c['@id'], prefer_reference)

    if isinstance(c, wikidata.entity.Entity):
        entity_id = c.id
        if isinstance(entity_id, int):
            entity_id = 'P{}'.format(entity_id)
        if entity_id in REFERENCE_PROPERTIES:
            entity_id = entity_id.replace('P', 'S', 1)
        return entity_id

    if isinstance(c, urllib.parse.ParseResult):
        url = c.geturl()
        if prefer_reference:
            return url
        return fmt_value(url, prefer_reference)

    # datetime is a subclass of date, so it has to be tested first.
    if isinstance(c, datetime.datetime):
        return '+{}/11'.format(c.isoformat())
    if isinstance(c, datetime.date):
        return '+{}T00:00:00Z/11'.format(c.isoformat())

    return str(c)
|
||||
|
||||
def fmt_predicate(pred, object):
    """Render the predicate column of a QuickStatements line.

    For schema.org URLs the result depends on the path and on the language
    stored under ``'__language'`` in ``object`` (``'en'`` when missing or
    falsy):

    * ``/name``          -> ``L<lang>``
    * ``/alternateName`` -> ``A<lang>``
    * ``/description``   -> ``D<lang>``
    * ``/sameAs``        -> ``S<lang>wiki``

    Any other schema.org path trips an assertion. Predicates that are not
    schema.org URLs are formatted by ``fmt_value`` in reference form.
    """
    if isinstance(pred, urllib.parse.ParseResult) and pred.netloc == 'schema.org':
        lang = object.get('__language') or 'en'
        path = pred.path
        prefixes = {'/name': 'L', '/alternateName': 'A', '/description': 'D'}
        if path in prefixes:
            return prefixes[path] + lang
        if path == '/sameAs':
            return 'S{}wiki'.format(lang)
        assert False, pred
    return fmt_value(pred, prefer_reference = True)
|
||||
|
||||
def to_quickstatements_v1_item(subject, lines, skip_impossible = True, skip_already_syncronized = True):
    """Append the QuickStatements lines describing ``subject`` to ``lines``.

    Each appended line is a list of string fields. A subject without an
    ``@id`` is addressed as ``LAST`` and is preceded by a ``['CREATE']``
    line.

    Parameters:
        subject: concept whose ``data`` mapping is iterated.
        lines: output list, extended in place.
        skip_impossible: when true, lines whose predicate starts with a
            double quote or that contain an empty field are logged and
            dropped.
        skip_already_syncronized: when true, objects flagged with
            ``__synchronized_with_wikidata`` are skipped.

    NOTE(review): the statement nesting below was reconstructed from a
    rendering that lost its indentation; confirm against the repository.
    """
    #assert '@id' not in subject, 'TODO: Linked subjects'
    subject_id = fmt_value(subject, True) if '@id' in subject else 'LAST'

    if subject_id == 'LAST':
        lines.append(['CREATE'])

    def fmt_key_value_pair(v, line):
        # Appends the formatted fields for ``v`` to ``line``; lists are
        # flattened recursively.
        if isinstance(v, list):
            for e in v:
                fmt_key_value_pair(e, line)
            return
        elif isinstance(v, dict) and 'value' in v:
            # Main value first, then one predicate/value pair for every
            # entry whose key is not a string.
            line.append(fmt_value(v['value']))
            for sub_k, sub_v in v.items():
                if sub_k is None or sub_v is None:
                    continue
                if not isinstance(sub_k, str):
                    line.append(fmt_predicate(sub_k, sub_v))
                    line.append(fmt_value(sub_v))
        else:
            line.append(fmt_value(v))

    for predicate, pred_objects in subject.data.items():
        # '@id' and '__'-prefixed string keys are not emitted as statements.
        if isinstance(predicate, str) and (predicate == '@id' or predicate.startswith('__')):
            continue

        assert isinstance(pred_objects, list)
        for pred_object in pred_objects:
            if pred_object.get('__synchronized_with_wikidata', False) and skip_already_syncronized:
                continue
            predicate_str = fmt_predicate(predicate, pred_object)
            line = [subject_id, predicate_str]
            fmt_key_value_pair(pred_object, line)

            # fmt_value() quotes plain strings, so a leading '"' means the
            # predicate was formatted as a string literal.
            if skip_impossible and predicate_str.startswith('"'):
                logging.warning('Bad line: %s (Lines must not start with ")', predicate_str)
                continue
            if '' in line and skip_impossible:
                logging.warning('Bad line: %s (Lines must not contain empty names)', line)
                continue
            assert 'None' not in line, line
            lines.append(line)
|
||||
|
||||
def to_quickstatements_v1(concepts):
    """Serialise one concept or a collection of concepts to QuickStatements
    v1 text: fields joined by tabs, statements joined by newlines.

    Raises AssertionError if a ``None`` field (surrounded by tabs) ends up
    in the output.
    """
    if isinstance(concepts, Concept):
        concepts = [concepts]

    statements = []
    for item in concepts:
        to_quickstatements_v1_item(item, statements)

    logging.info("Produced %s statements for %s concepts", len(statements), len(concepts))

    rendered = ['\t'.join(fields) for fields in statements]
    commands = '\n'.join(rendered)
    assert '\tNone\t' not in commands, 'TODO'
    return commands
|
||||
|
||||
def commands_to_quickstatements_v1_url(commands):
    """Build a QuickStatements URL carrying ``commands``.

    Tabs become ``|`` and newlines become ``||``; the result is then fully
    percent-encoded and appended to the tool's ``#/v1=`` address.
    """
    separators = str.maketrans({'\t': '|', '\n': '||'})
    payload = urllib.parse.quote(commands.translate(separators), safe = '')
    return 'https://quickstatements.toolforge.org/#/v1=' + payload
|
||||
|
||||
def to_quickstatements_v1_url(concepts):
    """Serialise ``concepts`` to QuickStatements v1 commands and wrap them
    in a QuickStatements URL."""
    commands = to_quickstatements_v1(concepts)
    return commands_to_quickstatements_v1_url(commands)
|
||||
|
||||
|
|
15
test.py
Normal file
15
test.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
|
||||
import schemeld
|
||||
import wikidata
|
||||
import wikidata_ext
|
||||
|
||||
if __name__ == '__main__':
    # Load the submodule explicitly so that ``wikidata.client`` is available
    # regardless of what the package's __init__ imports.
    import wikidata.client

    client = wikidata.client.Client()

    EQV_PROPERTY = client.get('P1628')
    schema_root = "https://schema.org/"
    schema_prop = "image"

    # The object has to be passed by keyword: a positional argument after
    # ``predicate = ...`` is a SyntaxError.
    triples = wikidata_ext.get_triples(
        client,
        predicate = EQV_PROPERTY,
        object = "{}{}".format(schema_root, schema_prop),
    )
|
||||
|
26
wikidata.py
26
wikidata.py
|
@ -1,26 +0,0 @@
|
|||
|
||||
def get_triples(client, subject = None, predicate = None, object = None):
    """Query the first page of the Wikidata LDF endpoint for matching triples.

    Sleeps one second before the request. Returns a list of
    ``(entity, predicate, object)`` tuples for every graph item whose
    ``@id`` starts with ``wd:`` and that contains ``predicate.id``; an
    empty list (plus an error log entry) on a non-200 response.
    """
    time.sleep(1)
    query = {
        'subject': fmt_triple_value(subject),
        'predicate': fmt_triple_value(predicate),
        'object': fmt_triple_value(object),
        'page': 1,
    }
    response = requests.get(
        'https://query.wikidata.org/bigdata/ldf',
        params = query,
        headers = {'accept': 'application/ld+json'},
    )

    if response.status_code != 200:
        logging.error('Got %s error message: %s', response.status_code, repr((subject, predicate, object)))
        return []

    # The first three characters of the id ('wd:') are dropped before the
    # entity lookup.
    return [
        (client.get(item['@id'][3:], load = False), predicate, object)
        for item in response.json()['@graph']
        if item['@id'].startswith('wd:') and predicate.id in item
    ]
|
||||
|
114
wikidata_ext.py
Normal file
114
wikidata_ext.py
Normal file
|
@ -0,0 +1,114 @@
|
|||
|
||||
import ratelimit
|
||||
import urllib.parse
|
||||
import wikidata.entity
|
||||
import requests
|
||||
import json
|
||||
import logging
|
||||
|
||||
def concept_uri(obj):
    """Return the parsed concept URI of a wikidata entity.

    Ids starting with ``P`` map to ``http://www.wikidata.org/prop/direct/<id>``
    and ids starting with ``Q`` map to ``http://www.wikidata.org/entity/<id>``.

    Raises:
        AssertionError: if ``obj`` is not a wikidata entity, or its id has
            any other prefix.
    """
    assert isinstance(obj, wikidata.entity.Entity), obj
    if obj.id.startswith('P'):
        return urllib.parse.urlparse('http://www.wikidata.org/prop/direct/{}'.format(obj.id))
    elif obj.id.startswith('Q'):
        return urllib.parse.urlparse('http://www.wikidata.org/entity/{}'.format(obj.id))
    else:
        # Raised explicitly so the failure names the offending id.
        raise AssertionError("TODO: " + obj.id)
|
||||
|
||||
def fmt_triple_value(obj, prefer_obj = False):
    """Format one component of a triple query.

    * ``None``: empty string.
    * ``str``: wrapped in double quotes.
    * parsed URL: the bare URL when ``prefer_obj`` is true, otherwise the
      URL as a quoted string.
    * wikidata entity: its concept URI, formatted like a parsed URL.

    Any other type trips an assertion.
    """
    if obj is None:
        return ''
    if isinstance(obj, str):
        return '"{}"'.format(obj)
    if isinstance(obj, urllib.parse.ParseResult):
        url = obj.geturl()
        if prefer_obj:
            return url
        return fmt_triple_value(url)
    if isinstance(obj, wikidata.entity.Entity):
        return fmt_triple_value(concept_uri(obj), prefer_obj)
    assert False, type(obj)
|
||||
|
||||
@ratelimit.sleep_and_retry
@ratelimit.limits(calls=10, period=60)
def fetch_by_url(url, headers):
    """GET ``url`` with ``headers`` and return the response body as text.

    The call is wrapped in the ``ratelimit`` decorators configured with
    ``calls=10, period=60``.

    Returns:
        The body text on HTTP 200; ``None`` (after logging an error) for
        any other status code.
    """
    logging.debug('Fetching: %s', url)
    result = requests.get(url, headers = headers)
    if result.status_code != 200:
        # Only the URL is known in this scope, so that is what gets logged.
        logging.error('Got %s error message: %s', result.status_code, url)
        return None
    # Callers pass the return value straight to json.loads(), which needs
    # the body text rather than the response object.
    return result.text
|
||||
|
||||
ITEMS_PER_PAGE = "http://www.w3.org/ns/hydra/core#itemsPerPage"
|
||||
TOTAL_ITEMS = "http://www.w3.org/ns/hydra/core#totalItems"
|
||||
|
||||
def fmt_params(subject, predicate, object):
    """Build the query parameters for a request for page 1 of a triple
    pattern.

    At least one of the three components must be truthy, otherwise an
    AssertionError is raised. Subject and predicate are formatted in their
    bare form, the object in its default form.
    """
    provided = [part for part in (subject, predicate, object) if part]
    assert len(provided) >= 1, str(provided)
    return {
        'subject': fmt_triple_value(subject, prefer_obj = True),
        'predicate': fmt_triple_value(predicate, prefer_obj = True),
        'object': fmt_triple_value(object),
        'page': 1,
    }
|
||||
|
||||
def get_triples_count(subject = None, predicate = None, object = None):
    '''
    Fetch the first result page and read the pagination figures from it.

    Returns a dict with 'items_per_page', 'items_total' and 'num_pages',
    taken from the first graph item carrying the total-items property.
    Trips an assertion when no such item exists.
    '''
    query = fmt_params(subject, predicate, object)
    prepared = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = query).prepare()
    body = fetch_by_url(prepared.url, headers = {'accept': 'application/ld+json'})
    graph = json.loads(body)['@graph']

    for entry in graph:
        if TOTAL_ITEMS not in entry:
            continue
        total = entry[TOTAL_ITEMS]
        per_page = entry[ITEMS_PER_PAGE]
        return {
            'items_per_page': per_page,
            'items_total': total,
            # Number of pages needed to hold ``total`` items.
            'num_pages': int((total - 1) / per_page + 1),
        }
    assert False
|
||||
|
||||
def get_triples_internal(subject, predicate, object):
    """Yield the graph items of every result page for a triple pattern.

    The page count comes from ``get_triples_count``; pages are then fetched
    one by one. Items whose ``@id`` starts with ``_:b``, with the LDF
    endpoint URL or with the wikidata ``.well-known`` prefix are skipped.
    """
    params = fmt_params(subject, predicate, object)
    num_pages = get_triples_count(subject, predicate, object)['num_pages']
    skipped_prefixes = (
        '_:b',
        'https://query.wikidata.org/bigdata/ldf',
        'http://www.wikidata.org/.well-known/',
    )

    for page_number in range(1, num_pages + 1):
        params['page'] = page_number
        page_url = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = params).prepare().url
        body = fetch_by_url(page_url, headers = {'accept': 'application/ld+json'})
        page = json.loads(body)

        for item in page['@graph']:
            if item['@id'].startswith(skipped_prefixes):
                continue
            yield item

        # Bookkeeping
        del page_url, body, page
|
||||
|
||||
SCHEMA_ABOUT = urllib.parse.urlparse('http://schema.org/about')
|
||||
|
||||
def get_wikidata_concept_for_wikipedia_page(client, wikipage):
    """Return the entity that ``wikipage`` is ``schema:about``, if any.

    Only the first matching item is used, so the result generator is not
    exhausted and no further result pages are requested for it.

    Returns:
        ``client.get(<id>, load = False)`` for the first match, or ``None``
        when there is no match.
    """
    matches = get_triples_internal(wikipage, SCHEMA_ABOUT, None)
    first = next(iter(matches), None)
    if first is None:
        return None
    # The first three characters of the 'about' value are dropped before
    # the lookup (presumably a 'wd:' prefix - TODO confirm).
    return client.get(first['about'][3:], load = False)
|
||||
|
||||
def get_triples(client, subject = None, predicate = None, object = None):
    """Lazily yield ``(subject, predicate, object)`` tuples matching a
    triple pattern.

    Only graph items whose ``@id`` starts with ``wd:`` and that contain
    ``predicate.id`` are reported, so ``predicate`` has to be an object
    with an ``id`` attribute. A missing subject is resolved through
    ``client.get``; a falsy object is read from the item.
    """
    for item in get_triples_internal(subject, predicate, object):
        item_id = item['@id']
        if not (item_id.startswith('wd:') and predicate.id in item):
            continue
        s = subject
        if s is None:
            # Drop the 'wd:' prefix to obtain the bare entity id.
            s = client.get(item_id[3:], load = False)
        o = object or item[predicate.id]
        yield (s, predicate, o)
|
||||
|
Loading…
Reference in New Issue
Block a user