1
0

Significant additions

This commit is contained in:
Jon Michael Aanes 2023-03-06 23:41:49 +01:00
parent 94fa9f10b9
commit d39fbbfed3
5 changed files with 285 additions and 31 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
/__pycache__/

View File

@ -1,5 +1,14 @@
import logging
import urllib.parse
import wikidata.entity
import datetime
from dataclasses import dataclass
from enforce_typing import enforce_types
from typing import List, Set, Optional, Union
from enum import Enum
# When true, Concept.__setitem__ asserts the shape of every value that is
# assigned under a key other than '@id'.
STRICT_VALIDATION = True
class Concept(object):
@ -10,10 +19,27 @@ class Concept(object):
def get(self, key, *args, **kwargs):
return self.data.get(self.canonical_key(key, *args, **kwargs))
def keys(self):
return self.data.keys()
def setdefault(self, key, value):
return self.data.setdefault(self.canonical_key(key), value)
def to_dict(self):
return {k:v for k,v in self.data.items()}
def __getitem__(self, key):
return self.data[self.canonical_key(key)]
def __setitem__(self, key, value):
    """Store `value` under the canonical form of `key`.

    While `STRICT_VALIDATION` is on, every key other than '@id' must be
    given a list of dicts; each dict must contain a 'value' entry and none
    of its entries may itself be a list. Violations raise `AssertionError`
    carrying the offending `value`.
    """
    needs_check = STRICT_VALIDATION and (not isinstance(key, str) or key != '@id')
    if needs_check:
        assert isinstance(value, list), value
        for entry in value:
            assert isinstance(entry, dict), value
            assert 'value' in entry, value
            for field_name in entry:
                assert not isinstance(entry[field_name], list), value
    canonical = self.canonical_key(key)
    self.data[canonical] = value
def __contains__(self, key):
@ -23,13 +49,24 @@ class Concept(object):
del self.data[self.canonical_key(key)]
def canonical_key(self, key):
if isinstance(key, urllib.parse.ParseResult):
return key
if not isinstance(key, str):
return key
elif key.startswith('@'):
return key
elif key.startswith(self.context):
if self.context is None:
return key
return self.context + key
return self.context._replace(path = key)
def __repr__(self):
if id := self.data.get('@id'):
return 'Concept {{ @id = {} }}'.format(id)
return 'Concept '+str(self.data)
def __str__(self):
return repr(self)
def determine_concepts_internal(json, context, outputs):
if isinstance(json, list):
@ -37,9 +74,10 @@ def determine_concepts_internal(json, context, outputs):
determine_concepts_internal(m, context, outputs)
return
assert isinstance(json, dict)
context = json.get('@context', context)
assert urllib.parse.urlparse(context).netloc == 'schema.org'
assert isinstance(json, dict), type(json)
context = urllib.parse.urlparse(json.get('@context', context))
assert context.netloc == 'schema.org'
if '@graph' in json:
determine_concepts_internal(json['@graph'], context, outputs)
else:
@ -49,3 +87,113 @@ def determine_concepts(json):
concepts = []
determine_concepts_internal(json, '', concepts)
return concepts
# Property ids that fmt_value emits with an 'S' prefix in place of the leading
# 'P'. ('P813' used to be listed twice in this set literal.)
REFERENCE_PROPERTIES = {'P813', 'P854', 'P248', 'P143'}
def fmt_value(c, prefer_reference = False):
    """Render `c` as a single text token for a QuickStatements v1 line.

    * `str`: wrapped in double quotes (no escaping is performed).
    * `Concept`: its '@id' is formatted recursively; without an '@id' an
      error is logged and '' is returned.
    * `wikidata.entity.Entity`: its id; integer ids get a 'P' prefix, and
      ids listed in `REFERENCE_PROPERTIES` have their first 'P' replaced
      by 'S'.
    * `urllib.parse.ParseResult`: the bare URL when `prefer_reference` is
      true, otherwise the URL formatted as a string.
    * `datetime.datetime` / `datetime.date`: '+<isoformat>' followed by
      '/11' (dates additionally get 'T00:00:00Z').
    * anything else: `str(c)`.
    """
    if isinstance(c, str):
        return '"{}"'.format(c) # TODO: Escape
    if isinstance(c, Concept):
        if '@id' not in c:
            logging.error('Could not determine useful id for %s', c)
            return ''
        return fmt_value(c['@id'], prefer_reference)
    if isinstance(c, wikidata.entity.Entity):
        entity_id = c.id
        if isinstance(entity_id, int):
            entity_id = 'P{}'.format(entity_id)
        if entity_id in REFERENCE_PROPERTIES:
            entity_id = entity_id.replace('P', 'S', 1)
        return entity_id
    if isinstance(c, urllib.parse.ParseResult):
        url = c.geturl()
        if prefer_reference:
            return url
        return fmt_value(url, prefer_reference)
    # datetime.datetime is a subclass of datetime.date, so it is tested first.
    if isinstance(c, datetime.datetime):
        return '+{}/11'.format(c.isoformat())
    if isinstance(c, datetime.date):
        return '+{}T00:00:00Z/11'.format(c.isoformat())
    return str(c)
def fmt_predicate(pred, object):
    """Render the predicate column for a statement about `object`.

    For a parsed schema.org URL the result is a language-tagged code built
    from `object['__language']` (falling back to 'en'): 'L<lang>' for
    /name, 'A<lang>' for /alternateName, 'D<lang>' for /description and
    'S<lang>wiki' for /sameAs. Any other schema.org path trips an
    assertion. Every other predicate is delegated to `fmt_value` with
    `prefer_reference = True`.
    """
    is_schema_org = isinstance(pred, urllib.parse.ParseResult) and pred.netloc == 'schema.org'
    if is_schema_org:
        lang = object.get('__language') or 'en'
        path = pred.path
        prefixes = {'/name': 'L', '/alternateName': 'A', '/description': 'D'}
        if path in prefixes:
            return prefixes[path]+lang
        if path == '/sameAs':
            return 'S{}wiki'.format(lang)
        assert False, pred
    return fmt_value(pred, prefer_reference = True)
def to_quickstatements_v1_item(subject, lines, skip_impossible = True, skip_already_syncronized = True):
    """Append the QuickStatements v1 lines describing `subject` to `lines`.

    Each appended line is a list of column strings. A subject without an
    '@id' is addressed as 'LAST' and preceded by a ['CREATE'] line. Keys
    '@id' and string keys starting with '__' are not emitted.

    subject: object supporting `'@id' in subject` and exposing `.data`,
        a mapping from predicate to a list of statement dicts.
    lines: list that is extended in place; nothing is returned.
    skip_impossible: when true, lines whose predicate column starts with
        '"' or that contain an empty column are logged and dropped.
    skip_already_syncronized: when true, statement dicts with a truthy
        '__synchronized_with_wikidata' entry are skipped.
    """
    if '@id' in subject:
        subject_id = fmt_value(subject, True)
    else:
        subject_id = 'LAST'
    if subject_id == 'LAST':
        lines.append(['CREATE'])

    def append_columns(obj, columns):
        # Flatten lists; expand {'value': ...} dicts into the value followed
        # by one (predicate, value) column pair per non-string key.
        if isinstance(obj, list):
            for element in obj:
                append_columns(element, columns)
        elif isinstance(obj, dict) and 'value' in obj:
            columns.append(fmt_value(obj['value']))
            for qualifier, qualifier_value in obj.items():
                if qualifier is None or qualifier_value is None:
                    continue
                if isinstance(qualifier, str):
                    continue
                columns.append(fmt_predicate(qualifier, qualifier_value))
                columns.append(fmt_value(qualifier_value))
        else:
            columns.append(fmt_value(obj))

    for predicate, pred_objects in subject.data.items():
        is_internal = isinstance(predicate, str) and (predicate == '@id' or predicate.startswith('__'))
        if is_internal:
            continue
        assert isinstance(pred_objects, list)

        for pred_object in pred_objects:
            already_synced = pred_object.get('__synchronized_with_wikidata', False)
            if already_synced and skip_already_syncronized:
                continue

            predicate_str = fmt_predicate(predicate, pred_object)
            line = [subject_id, predicate_str]
            append_columns(pred_object, line)

            if skip_impossible and predicate_str.startswith('"'):
                logging.warning('Bad line: %s (Lines must not start with ")', predicate_str)
                continue
            if '' in line and skip_impossible:
                logging.warning('Bad line: %s (Lines must not contain empty names)', line)
                continue
            assert 'None' not in line, line
            lines.append(line)
def to_quickstatements_v1(concepts):
    """Return the QuickStatements v1 command text for `concepts`.

    A single `Concept` is treated as a one-element list. Columns of a line
    are joined with tabs and lines with newlines. The number of produced
    statements is logged at info level.
    """
    concept_list = [concepts] if isinstance(concepts, Concept) else concepts

    statements = []
    for concept in concept_list:
        to_quickstatements_v1_item(concept, statements)

    logging.info("Produced %s statements for %s concepts", len(statements), len(concept_list))
    commands = '\n'.join('\t'.join(columns) for columns in statements)

    assert '\tNone\t' not in commands, 'TODO'
    return commands
def commands_to_quickstatements_v1_url(commands):
    """Turn tab/newline separated command text into a QuickStatements URL.

    Tabs become '|' and newlines become '||'; the result is percent-encoded
    with no safe characters and appended to the '#/v1=' fragment prefix.
    """
    rows = commands.split('\n')
    flattened = '||'.join('|'.join(row.split('\t')) for row in rows)
    encoded = urllib.parse.quote(flattened, safe = '')
    return 'https://quickstatements.toolforge.org/#/v1=' + encoded
def to_quickstatements_v1_url(concepts):
    """Build the QuickStatements v1 URL for `concepts` in one step."""
    commands = to_quickstatements_v1(concepts)
    return commands_to_quickstatements_v1_url(commands)

15
test.py Normal file
View File

@ -0,0 +1,15 @@
import schemeld
import wikidata
import wikidata_ext
if __name__ == '__main__':
    # Import the submodule explicitly: a bare `import wikidata` is not
    # guaranteed to make `wikidata.client` available.
    import wikidata.client

    client = wikidata.client.Client()
    EQV_PROPERTY = client.get('P1628')
    schema_root = "https://schema.org/"
    schema_prop = "image"
    # The URL was previously passed positionally after a keyword argument,
    # which is a SyntaxError; it is the object of the queried triples.
    triples = wikidata_ext.get_triples(
        client,
        predicate = EQV_PROPERTY,
        object = "{}{}".format(schema_root, schema_prop),
    )

View File

@ -1,26 +0,0 @@
def get_triples(client, subject = None, predicate = None, object = None):
    """Query the Wikidata LDF endpoint once and return matching triples.

    Sleeps one second before the request. Only page 1 is fetched. On a
    non-200 response an error is logged and an empty list is returned.
    Otherwise every '@graph' item whose '@id' starts with 'wd:' and that
    contains the key `predicate.id` yields a
    `(client.get(<id without 'wd:'>, load = False), predicate, object)`
    tuple.
    """
    time.sleep(1)
    query = {
        'subject': fmt_triple_value(subject),
        'predicate': fmt_triple_value(predicate),
        'object': fmt_triple_value(object),
        'page': 1,
    }
    response = requests.get(
        'https://query.wikidata.org/bigdata/ldf',
        params = query,
        headers = {'accept': 'application/ld+json'},
    )
    if response.status_code != 200:
        logging.error('Got %s error message: %s', response.status_code, repr((subject, predicate, object)))
        return []
    return [
        (client.get(item['@id'][3:], load = False), predicate, object)
        for item in response.json()['@graph']
        if item['@id'].startswith('wd:') and predicate.id in item
    ]

114
wikidata_ext.py Normal file
View File

@ -0,0 +1,114 @@
import ratelimit
import urllib.parse
import wikidata.entity
import requests
import json
import logging
def concept_uri(obj):
    """Return the parsed Wikidata URI for the entity `obj`.

    Ids starting with 'P' map to 'http://www.wikidata.org/prop/direct/<id>'
    and ids starting with 'Q' to 'http://www.wikidata.org/entity/<id>'.
    Any other id, or a non-Entity argument, trips an assertion.
    """
    assert isinstance(obj, wikidata.entity.Entity), obj
    if obj.id.startswith('P'):
        return urllib.parse.urlparse('http://www.wikidata.org/prop/direct/{}'.format(obj.id))
    elif obj.id.startswith('Q'):
        return urllib.parse.urlparse('http://www.wikidata.org/entity/{}'.format(obj.id))
    else:
        # The message used to reference the misspelt name `ojb`, turning
        # the intended AssertionError into a NameError.
        assert False, "TODO: " + obj.id
def fmt_triple_value(obj, prefer_obj = False):
    """Render `obj` as a query-parameter value for a triple pattern.

    `None` becomes '', strings are wrapped in double quotes, parsed URLs
    become the bare URL when `prefer_obj` is true and the quoted URL
    otherwise, and Wikidata entities are first converted with
    `concept_uri`. Any other type trips an assertion.
    """
    if obj is None:
        return ''
    if isinstance(obj, str):
        return '"{}"'.format(obj)
    if isinstance(obj, urllib.parse.ParseResult):
        url = obj.geturl()
        if prefer_obj:
            return url
        return fmt_triple_value(url)
    if isinstance(obj, wikidata.entity.Entity):
        return fmt_triple_value(concept_uri(obj), prefer_obj)
    assert False, type(obj)
@ratelimit.sleep_and_retry
@ratelimit.limits(calls=10, period=60)
def fetch_by_url(url, headers):
    """GET `url` with `headers` and return the response body as text.

    Returns `None`, after logging an error, when the status code is not
    200. The call is wrapped by the `ratelimit` decorators above
    (calls=10, period=60).

    The body text is returned because callers in this module pass the
    result straight to `json.loads`. The previous code returned the
    undefined name `request` and logged the undefined names `subject` and
    `predicate`, so both paths raised `NameError`.
    """
    logging.debug('Fetching: %s', url)
    result = requests.get(url, headers = headers)
    if result.status_code != 200:
        logging.error('Got %s error message: %s', result.status_code, url)
        return None
    return result.text
# Keys that get_triples_count reads from the items of a response's '@graph'
# list in order to work out pagination.
ITEMS_PER_PAGE = "http://www.w3.org/ns/hydra/core#itemsPerPage"
TOTAL_ITEMS = "http://www.w3.org/ns/hydra/core#totalItems"
def fmt_params(subject, predicate, object):
    """Build the query parameters for a triple-pattern request.

    At least one of the three arguments must be truthy (asserted). Subject
    and predicate are formatted with `prefer_obj = True`; the object uses
    the default formatting. The page number always starts at 1.
    """
    provided = list(filter(None, (subject, predicate, object)))
    assert len(provided) >= 1, str(provided)
    return {
        'subject': fmt_triple_value(subject, prefer_obj = True),
        'predicate': fmt_triple_value(predicate, prefer_obj = True),
        'object': fmt_triple_value(object),
        'page': 1,
    }
def get_triples_count(subject = None, predicate = None, object = None):
    '''
    Fetches the first page of a triple-pattern query and reports pagination.

    Returns a dict with 'items_per_page', 'items_total' and 'num_pages',
    taken from the first '@graph' item that contains the TOTAL_ITEMS key.
    Trips an assertion when no such item exists.
    '''
    request = requests.Request(
        url = 'https://query.wikidata.org/bigdata/ldf',
        params = fmt_params(subject, predicate, object),
    )
    body = fetch_by_url(request.prepare().url, headers = {'accept': 'application/ld+json'})
    for entry in json.loads(body)['@graph']:
        if TOTAL_ITEMS not in entry:
            continue
        per_page = entry[ITEMS_PER_PAGE]
        total = entry[TOTAL_ITEMS]
        return {
            'items_per_page': per_page,
            'items_total': total,
            # Ceiling of total / per_page.
            'num_pages': int((total - 1) / per_page + 1),
        }
    assert False
def get_triples_internal(subject, predicate, object):
    """Yield the '@graph' items of every result page for a triple pattern.

    The page count comes from `get_triples_count`. Items whose '@id'
    starts with '_:b', with the LDF endpoint URL, or with the Wikidata
    '.well-known' URL are skipped.
    """
    skipped_prefixes = (
        '_:b',
        'https://query.wikidata.org/bigdata/ldf',
        'http://www.wikidata.org/.well-known/',
    )
    params = fmt_params(subject, predicate, object)
    last_page = get_triples_count(subject, predicate, object)['num_pages']
    for page_number in range(1, last_page + 1):
        params['page'] = page_number
        page_url = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = params).prepare().url
        body = fetch_by_url(page_url, headers = {'accept': 'application/ld+json'})
        page = json.loads(body)
        for item in page['@graph']:
            if not item['@id'].startswith(skipped_prefixes):
                yield item
        # Drop the references to this page's data before fetching the next one.
        del page_url, body, page
# Parsed predicate URL used to look up what a Wikipedia page is about.
SCHEMA_ABOUT = urllib.parse.urlparse('http://schema.org/about')

def get_wikidata_concept_for_wikipedia_page(client, wikipage):
    """Return the entity that `wikipage` is 'about', or None if nothing matched.

    All result pages are collected first; the first item's 'about' value,
    minus its first three characters, is then passed to
    `client.get(..., load = False)`.
    """
    matches = list(get_triples_internal(wikipage, SCHEMA_ABOUT, None))
    if not matches:
        return None
    entity_id = matches[0]['about'][3:]
    return client.get(entity_id, load = False)
def get_triples(client, subject = None, predicate = None, object = None):
    """Lazily yield `(subject, predicate, object)` tuples matching a pattern.

    Only items whose '@id' starts with 'wd:' and that contain the key
    `predicate.id` are considered. A missing subject is resolved with
    `client.get(<id without 'wd:'>, load = False)`; a falsy object is
    replaced by the item's value under `predicate.id`.
    """
    for item in get_triples_internal(subject, predicate, object):
        is_looking_for = item['@id'].startswith('wd:') and predicate.id in item
        if is_looking_for:
            if subject is None:
                s = client.get(item['@id'][3:], load = False)
            else:
                s = subject
            o = object or item[predicate.id]
            yield (s, predicate, o)
        del item, is_looking_for