1
0
datagraph/wikidata_ext.py

127 lines
4.6 KiB
Python
Raw Normal View History

2023-03-06 22:41:49 +00:00
import ratelimit
import urllib.parse
import wikidata.entity
import requests
import json
import logging
2023-12-03 22:20:29 +00:00
# HTTP session shared by this module. fetch_by_url asserts that it is not None
# and calls its .get(url, headers=...) method, so it has to be assigned by the
# caller before any fetch is attempted.
REQUEST_SESSION = None # TODO?
2023-08-29 07:20:14 +00:00
2023-03-06 22:41:49 +00:00
def concept_uri(obj):
    '''
    Returns the concept URI of a wikidata entity as a parsed URL.

    Ids starting with 'P' map to http://www.wikidata.org/prop/direct/<id>,
    ids starting with 'Q' map to http://www.wikidata.org/entity/<id>.

    Args:
        obj: A wikidata.entity.Entity.

    Returns:
        The urllib.parse.ParseResult of the entity's URI.

    Raises:
        AssertionError: If obj is not an Entity, or if its id starts with
            neither 'P' nor 'Q'.
    '''
    assert isinstance(obj, wikidata.entity.Entity), obj
    if obj.id.startswith('P'):
        return urllib.parse.urlparse('http://www.wikidata.org/prop/direct/{}'.format(obj.id))
    if obj.id.startswith('Q'):
        return urllib.parse.urlparse('http://www.wikidata.org/entity/{}'.format(obj.id))
    # Unsupported kind of id. Raised explicitly (rather than `assert False`)
    # so the failure is still reported when running under `python -O`.
    raise AssertionError("TODO: " + obj.id)
def fmt_triple_value(obj, prefer_obj = False):
    '''
    Renders one component of a triple pattern as text.

    None becomes the empty string and a plain string is wrapped in double
    quotes. A parsed URL is rendered as its bare URL when prefer_obj is true
    and as the quoted URL otherwise. A wikidata entity is always rendered as
    its bare concept URI. Any other type trips an assertion.
    '''
    if obj is None:
        return ''
    if isinstance(obj, str):
        return '"{}"'.format(obj)
    if isinstance(obj, urllib.parse.ParseResult):
        address = obj.geturl()
        if prefer_obj:
            return address
        # Re-enter with the plain string so it gets quoted.
        return fmt_triple_value(address)
    if isinstance(obj, wikidata.entity.Entity):
        return fmt_triple_value(concept_uri(obj), True)
    assert False, type(obj)
@ratelimit.sleep_and_retry
def fetch_by_url(url, headers):
    '''
    Performs a GET of url, with the given headers, through REQUEST_SESSION.

    Returns the response object when its status code is 200. For any other
    status the code and response text are logged at error level and None is
    returned. Asserts that REQUEST_SESSION has been set.
    '''
    logging.debug('Fetching: %s', url)
    assert REQUEST_SESSION is not None, 'REQUEST_SESSION must be set, before calling fetch_by_url'
    reply = REQUEST_SESSION.get(url, headers = headers)
    if reply.status_code == 200:
        return reply
    logging.error('Got %s error message: %s', reply.status_code, reply.text)
    return None
2023-03-06 22:41:49 +00:00
# Keys that get_triples_count looks up in the '@graph' items of a response to
# read the pagination metadata: the page size and the total number of items.
ITEMS_PER_PAGE = "http://www.w3.org/ns/hydra/core#itemsPerPage"
TOTAL_ITEMS = "http://www.w3.org/ns/hydra/core#totalItems"
def fmt_params(subject, predicate, object):
    '''
    Builds the query parameters for a triple pattern, positioned on page 1.

    Each of subject, predicate and object is rendered with fmt_triple_value
    (prefer_obj = True). At least one of them has to be truthy, otherwise an
    assertion fails.
    '''
    supplied = [part for part in (subject, predicate, object) if part]
    assert supplied, str(supplied)
    rendered = {
        name: fmt_triple_value(value, prefer_obj = True)
        for name, value in (
            ('subject', subject),
            ('predicate', predicate),
            ('object', object),
        )
    }
    rendered['page'] = 1
    return rendered
def get_triples_count(subject = None, predicate = None, object = None):
    '''
    Fetches the first page for a triple pattern to learn how many items exist.

    Returns a dict with the keys 'items_per_page', 'items_total' and
    'num_pages'. All three are 0 when the page could not be fetched. An
    assertion fails when the page contains no item carrying TOTAL_ITEMS.
    '''
    query = fmt_params(subject, predicate, object)
    prepared = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = query).prepare()
    first_page = fetch_by_url(prepared.url, headers = {'accept': 'application/ld+json'})
    if first_page is None:
        return dict(items_per_page = 0, items_total = 0, num_pages = 0)

    for node in first_page.json()['@graph']:
        if TOTAL_ITEMS not in node:
            continue
        per_page = node[ITEMS_PER_PAGE]
        total = node[TOTAL_ITEMS]
        return {
            'items_per_page': per_page,
            'items_total': total,
            # Number of pages needed to hold `total` items.
            'num_pages': int((total - 1) / per_page + 1),
        }
    assert False
def get_triples_internal(subject, predicate, object):
    '''
    Yields the raw '@graph' items matching a triple pattern, page by page.

    The number of pages is taken from get_triples_count; each page is then
    fetched with fetch_by_url. Items whose '@id' starts with one of the
    skipped prefixes below are not yielded.

    A page that cannot be fetched (fetch_by_url returned None) is skipped
    with a warning instead of failing on the missing response.
    '''
    # '@id' prefixes of graph nodes that are filtered out of the results
    # (these look like blank nodes and the endpoint's own metadata -- not
    # confirmed from this file).
    skipped_prefixes = (
        '_:b',
        'https://query.wikidata.org/bigdata/ldf',
        'http://www.wikidata.org/.well-known/',
    )
    params = fmt_params(subject, predicate, object)
    pagination_data = get_triples_count(subject, predicate, object)
    for current_page in range(1, pagination_data['num_pages'] + 1):
        params['page'] = current_page
        url = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = params).prepare().url
        response = fetch_by_url(url, headers = {'accept': 'application/ld+json'})
        if response is None:
            # fetch_by_url already logged the HTTP error; carry on with the
            # remaining pages, mirroring get_triples_count's best-effort
            # handling of a failed request.
            logging.warning('Skipping page %s, could not fetch: %s', current_page, url)
            continue
        for item in response.json()['@graph']:
            # str.startswith accepts a tuple: true if any prefix matches.
            if item['@id'].startswith(skipped_prefixes):
                continue
            yield item
2023-03-06 22:41:49 +00:00
# Parsed URL that get_wikidata_concept_for_wikipedia_page passes as the
# predicate of its triple query.
SCHEMA_ABOUT = urllib.parse.urlparse('http://schema.org/about')
def get_wikidata_concept_for_wikipedia_page(client, wikipage):
    '''
    Looks up the concept that a wikipedia page is "about".

    Queries for triples (wikipage, SCHEMA_ABOUT, anything) and resolves the
    first result through client.get(..., load = False).

    Args:
        client: Object providing get(id, load = False).
        wikipage: Subject of the query; passed on to get_triples_internal.

    Returns:
        Whatever client.get returns for the first result, or None when the
        query produced no results.
    '''
    # Only the first match is used, so take a single item from the generator
    # instead of materialising (and fetching) every result page.
    first = next(iter(get_triples_internal(wikipage, SCHEMA_ABOUT, None)), None)
    if first is None:
        return None
    # [3:] drops a three character prefix -- presumably 'wd:', as on the
    # '@id' values handled in get_triples; verify against the response format.
    return client.get(first['about'][3:], load = False)
def get_triples(client, subject = None, predicate = None, object = None):
    '''
    Yields (subject, predicate, object) tuples matching a triple pattern.

    Only items whose '@id' starts with 'wd:' and which contain the key
    predicate.id are considered. When subject is None it is resolved from the
    item's '@id' (without the 'wd:' prefix) via client.get(..., load = False).
    When object is falsy, the item's value under predicate.id is used.

    NOTE(review): predicate defaults to None, but predicate.id is read as
    soon as a 'wd:' item is seen, so in practice a predicate with an `id`
    attribute has to be supplied.
    '''
    for item in get_triples_internal(subject, predicate, object):
        if not (item['@id'].startswith('wd:') and predicate.id in item):
            continue
        s = subject
        if s is None:
            s = client.get(item['@id'][3:], load = False)
        o = object or item[predicate.id]
        yield (s, predicate, o)
2023-08-29 07:20:14 +00:00
def get_backlinks(client, predicate, object):
    '''
    Yields the subject of every triple found for (anything, predicate, object).
    '''
    matches = get_triples(client, subject = None, predicate = predicate, object = object)
    yield from (source for source, _pred, _obj in matches)