2023-03-06 22:41:49 +00:00
|
|
|
|
|
|
|
import ratelimit
|
|
|
|
import urllib.parse
|
|
|
|
import wikidata.entity
|
|
|
|
import requests
|
|
|
|
import json
|
|
|
|
import logging
|
|
|
|
|
2023-08-29 07:20:14 +00:00
|
|
|
# Module-level HTTP session; fetch_by_url issues every GET through it.
REQUEST_SESSION = requests.Session()
|
|
|
|
|
2023-03-06 22:41:49 +00:00
|
|
|
def concept_uri(obj):
    '''
    Build the Wikidata concept URI for an entity.

    Ids starting with ``P`` are placed under
    ``http://www.wikidata.org/prop/direct/`` and ids starting with ``Q``
    under ``http://www.wikidata.org/entity/``.

    Args:
        obj: A ``wikidata.entity.Entity``; only its ``id`` is read.

    Returns:
        The URI as a ``urllib.parse.ParseResult``.

    Raises:
        AssertionError: If ``obj`` is not an ``Entity``, or if its id
            starts with neither ``P`` nor ``Q``.
    '''
    assert isinstance(obj, wikidata.entity.Entity), obj

    if obj.id.startswith('P'):
        return urllib.parse.urlparse('http://www.wikidata.org/prop/direct/{}'.format(obj.id))
    if obj.id.startswith('Q'):
        return urllib.parse.urlparse('http://www.wikidata.org/entity/{}'.format(obj.id))

    # Raised explicitly (rather than via `assert False`) so unsupported ids
    # are still rejected when Python runs with assertions disabled.
    raise AssertionError("TODO: " + obj.id)
|
|
|
|
|
|
|
|
def fmt_triple_value(obj, prefer_obj = False):
    '''
    Render one triple component as a string.

    Args:
        obj: ``None``, a ``str``, a ``urllib.parse.ParseResult`` or a
            ``wikidata.entity.Entity``.
        prefer_obj: Only consulted for ``ParseResult`` input. When true the
            bare URL is returned; otherwise the URL is rendered like any
            other string (wrapped in double quotes).

    Returns:
        ``''`` for ``None``; the text wrapped in double quotes for a
        ``str``; the bare or quoted URL for a ``ParseResult``; the bare
        concept URI for an ``Entity``.

    Raises:
        AssertionError: For any other type of ``obj``.
    '''
    if obj is None:
        return ''

    if isinstance(obj, str):
        return f'"{obj}"'

    if isinstance(obj, urllib.parse.ParseResult):
        url = obj.geturl()
        if prefer_obj:
            return url
        return fmt_triple_value(url)

    if isinstance(obj, wikidata.entity.Entity):
        # Entities are always rendered as a bare URI, whatever prefer_obj is.
        return fmt_triple_value(concept_uri(obj), True)

    assert False, type(obj)
|
|
|
|
|
|
|
|
# NOTE(review): no `ratelimit.limits` decorator is visible on this function;
# confirm whether `sleep_and_retry` alone provides the intended throttling.
@ratelimit.sleep_and_retry
def fetch_by_url(url, headers):
    '''
    GET ``url`` through the shared ``REQUEST_SESSION``.

    Args:
        url: Address to fetch.
        headers: Header mapping forwarded to the session's ``get`` call.

    Returns:
        The response object when the status code is 200. For any other
        status the code and body text are logged at error level and
        ``None`` is returned.
    '''
    logging.debug('Fetching: %s', url)
    response = REQUEST_SESSION.get(url, headers = headers)

    status = response.status_code
    if status == 200:
        return response

    logging.error('Got %s error message: %s', status, response.text)
    return None
|
2023-03-06 22:41:49 +00:00
|
|
|
|
|
|
|
# Key holding the page size; get_triples_count reads it from the '@graph'
# entry that also carries TOTAL_ITEMS.
ITEMS_PER_PAGE = "http://www.w3.org/ns/hydra/core#itemsPerPage"
|
|
|
|
# Key holding the total item count; its presence is how get_triples_count
# recognises the pagination entry in a response's '@graph' list.
TOTAL_ITEMS = "http://www.w3.org/ns/hydra/core#totalItems"
|
|
|
|
|
|
|
|
def fmt_params(subject, predicate, object):
    '''
    Build the query parameters for a triple-pattern request.

    Each component is rendered with ``fmt_triple_value(..., prefer_obj=True)``
    and the ``page`` parameter starts at 1.

    Args:
        subject, predicate, object: Components of the pattern; at least one
            of them must be truthy.

    Returns:
        A dict with the keys ``subject``, ``predicate``, ``object`` and
        ``page``.

    Raises:
        AssertionError: If all three components are falsy.
    '''
    components = {
        'subject': subject,
        'predicate': predicate,
        'object': object,
    }

    provided = [value for value in components.values() if value]
    assert provided, str(provided)

    params = {
        name: fmt_triple_value(value, prefer_obj = True)
        for name, value in components.items()
    }
    params['page'] = 1
    return params
|
|
|
|
|
|
|
|
def get_triples_count(subject = None, predicate = None, object = None):
    '''
    Fetch the first result page to learn how many items match the pattern.

    Returns:
        A dict with ``items_per_page``, ``items_total`` and ``num_pages``.
        All three are 0 when the request did not yield a response.

    Raises:
        AssertionError: If no entry in the response's ``@graph`` list
            carries the total-items key.
    '''
    request = requests.Request(
        url = 'https://query.wikidata.org/bigdata/ldf',
        params = fmt_params(subject, predicate, object),
    )
    response = fetch_by_url(request.prepare().url, headers = {'accept': 'application/ld+json'})

    if response is None:
        return {'items_per_page': 0, 'items_total': 0, 'num_pages': 0}

    for entry in response.json()['@graph']:
        if TOTAL_ITEMS not in entry:
            continue
        per_page = entry[ITEMS_PER_PAGE]
        total = entry[TOTAL_ITEMS]
        return {
            'items_per_page': per_page,
            'items_total': total,
            # Number of pages needed to hold `total` items.
            'num_pages': int((total - 1) / per_page + 1),
        }

    assert False
|
|
|
|
|
|
|
|
def get_triples_internal(subject, predicate, object):
    '''
    Yield the raw ``@graph`` items of every result page for a pattern.

    The page count comes from ``get_triples_count``; each page is then
    requested in turn (so page 1 is requested a second time here). Items
    whose ``@id`` marks them as blank nodes, as descriptions of the query
    endpoint itself, or as ``.well-known`` resources are filtered out.

    A page for which ``fetch_by_url`` returns ``None`` is skipped instead
    of aborting the whole iteration, matching how ``get_triples_count``
    tolerates a failed request.

    Args:
        subject, predicate, object: Pattern components, as accepted by
            ``fmt_params``.

    Yields:
        The remaining ``@graph`` items (dicts decoded from the JSON-LD
        response), in page order.
    '''
    skipped_prefixes = (
        '_:b',
        'https://query.wikidata.org/bigdata/ldf',
        'http://www.wikidata.org/.well-known/',
    )

    params = fmt_params(subject, predicate, object)
    pagination_data = get_triples_count(subject, predicate, object)

    for current_page in range(1, pagination_data['num_pages'] + 1):
        params['page'] = current_page
        url = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = params).prepare().url
        response = fetch_by_url(url, headers = {'accept': 'application/ld+json'})
        if response is None:
            # fetch_by_url has already logged the HTTP status and body.
            logging.warning('Skipping page %s, no usable response for: %s', current_page, url)
            continue
        json_data = response.json()

        for item in json_data['@graph']:
            if item['@id'].startswith(skipped_prefixes):
                continue
            yield item

        # Bookkeeping: drop the references to this page before the next fetch.
        del url, response, json_data
|
2023-03-06 22:41:49 +00:00
|
|
|
|
|
|
|
# Parsed 'http://schema.org/about' URI; passed as the predicate when looking
# up the concept for a Wikipedia page.
SCHEMA_ABOUT = urllib.parse.urlparse('http://schema.org/about')
|
|
|
|
|
|
|
|
def get_wikidata_concept_for_wikipedia_page(client, wikipage):
    '''
    Look up the entity that a Wikipedia page is "about".

    All ``(wikipage, schema:about, ?)`` items are fetched first; only the
    first one is used.

    Args:
        client: Object whose ``get(id, load=False)`` produces the entity.
        wikipage: Subject of the lookup, in any form ``fmt_triple_value``
            accepts.

    Returns:
        Whatever ``client.get`` returns for the first match, or ``None``
        when nothing matched.
    '''
    matches = list(get_triples_internal(wikipage, SCHEMA_ABOUT, None))
    if not matches:
        return None

    # Drops the first three characters of the 'about' value -- presumably a
    # 'wd:' prefix; TODO confirm against a real response.
    concept_id = matches[0]['about'][3:]
    return client.get(concept_id, load = False)
|
|
|
|
|
|
|
|
def get_triples(client, subject = None, predicate = None, object = None):
    '''
    Yield ``(subject, predicate, object)`` tuples matching a pattern.

    Only items whose ``@id`` starts with ``wd:`` and which contain a key
    equal to ``predicate.id`` are reported.

    Args:
        client: Object whose ``get(id, load=False)`` produces an entity;
            used to build the subject when ``subject`` is ``None``.
        subject: Fixed subject, or ``None`` to derive it from each item's
            ``@id`` (with the leading ``wd:`` removed).
        predicate: Object with an ``id`` attribute. Although it defaults to
            ``None``, ``predicate.id`` is read for every ``wd:`` item, so
            ``None`` fails with ``AttributeError`` on the first such item.
        object: Fixed object; when falsy, the item's value stored under
            ``predicate.id`` is used instead.

    Yields:
        ``(subject, predicate, object)`` tuples, one per matching item.
    '''
    for item in get_triples_internal(subject, predicate, object):
        # `and` short-circuits: predicate.id is only read for 'wd:' items.
        if not (item['@id'].startswith('wd:') and predicate.id in item):
            continue

        s = subject
        if s is None:
            s = client.get(item['@id'][3:], load = False)

        yield (s, predicate, object or item[predicate.id])
|
|
|
|
|
2023-08-29 07:20:14 +00:00
|
|
|
def get_backlinks(client, predicate, object):
    '''
    Yield the subject of every triple matching ``(?, predicate, object)``.

    Args:
        client: Forwarded to ``get_triples``.
        predicate: Predicate of the pattern.
        object: Object of the pattern.

    Yields:
        The first element of each tuple produced by ``get_triples``.
    '''
    matches = get_triples(client, subject = None, predicate = predicate, object = object)
    yield from (subject for subject, _, _ in matches)
|
|
|
|
|