import ratelimit
import urllib.parse
import wikidata.entity
import requests
import logging

REQUEST_SESSION = None  # TODO?


def concept_uri(obj):
    '''Return the Wikidata URI (as a ParseResult) for an entity or property.'''
    assert isinstance(obj, wikidata.entity.Entity), obj
    if obj.id.startswith('P'):
        return urllib.parse.urlparse('http://www.wikidata.org/prop/direct/{}'.format(obj.id))
    elif obj.id.startswith('Q'):
        return urllib.parse.urlparse('http://www.wikidata.org/entity/{}'.format(obj.id))
    else:
        assert False, "TODO: " + obj.id


def fmt_triple_value(obj, prefer_obj=False):
    '''Format a subject/predicate/object for the LDF query string:
    strings become quoted literals, URIs and entities stay as URIs.'''
    if obj is None:
        return ''
    if isinstance(obj, str):
        return '"{}"'.format(obj)
    elif isinstance(obj, urllib.parse.ParseResult):
        return obj.geturl() if prefer_obj else fmt_triple_value(obj.geturl())
    elif isinstance(obj, wikidata.entity.Entity):
        uri = concept_uri(obj)
        return fmt_triple_value(uri, True)
    else:
        assert False, type(obj)


@ratelimit.sleep_and_retry
def fetch_by_url(url, headers):
    logging.debug('Fetching: %s', url)
    assert REQUEST_SESSION is not None, 'REQUEST_SESSION must be set before calling fetch_by_url'
    response = REQUEST_SESSION.get(url, headers=headers)
    if response.status_code != 200:
        logging.error('Got %s error message: %s', response.status_code, response.text)
        return None
    return response


ITEMS_PER_PAGE = "http://www.w3.org/ns/hydra/core#itemsPerPage"
TOTAL_ITEMS = "http://www.w3.org/ns/hydra/core#totalItems"


def fmt_params(subject, predicate, object):
    provided = [x for x in [subject, predicate, object] if x]
    assert len(provided) >= 1, str(provided)
    params = {
        'subject': fmt_triple_value(subject, prefer_obj=True),
        'predicate': fmt_triple_value(predicate, prefer_obj=True),
        'object': fmt_triple_value(object, prefer_obj=True),
        'page': 1,
    }
    return params


def get_triples_count(subject=None, predicate=None, object=None):
    '''Fetch the first page in order to determine the number of items and pages.'''
    params = fmt_params(subject, predicate, object)
    url = requests.Request(url='https://query.wikidata.org/bigdata/ldf', params=params).prepare().url
    response = fetch_by_url(url, headers={'accept': 'application/ld+json'})
    if response is None:
        return {
            'items_per_page': 0,
            'items_total': 0,
            'num_pages': 0,
        }
    json_data = response.json()
    for item in json_data['@graph']:
        if TOTAL_ITEMS in item:
            return {
                'items_per_page': item[ITEMS_PER_PAGE],
                'items_total': item[TOTAL_ITEMS],
                'num_pages': int((item[TOTAL_ITEMS] - 1) / item[ITEMS_PER_PAGE] + 1),
            }
    assert False


def get_triples_internal(subject, predicate, object):
    '''Yield every matching triple, following the LDF endpoint's pagination.'''
    params = fmt_params(subject, predicate, object)
    pagination_data = get_triples_count(subject, predicate, object)
    for current_page in range(1, pagination_data['num_pages'] + 1):
        params['page'] = current_page
        url = requests.Request(url='https://query.wikidata.org/bigdata/ldf', params=params).prepare().url
        response = fetch_by_url(url, headers={'accept': 'application/ld+json'})
        json_data = response.json()
        for item in json_data['@graph']:
            # Skip blank nodes and the endpoint's own pagination/metadata nodes.
            if item['@id'].startswith('_:b'):
                continue
            if item['@id'].startswith('https://query.wikidata.org/bigdata/ldf'):
                continue
            if item['@id'].startswith('http://www.wikidata.org/.well-known/'):
                continue
            yield item
        # Bookkeeping
        del url, response, json_data


SCHEMA_ABOUT = urllib.parse.urlparse('http://schema.org/about')


def get_wikidata_concept_for_wikipedia_page(client, wikipage):
    '''Return the Wikidata entity that a Wikipedia page is schema:about, or None.'''
    triples = get_triples_internal(wikipage, SCHEMA_ABOUT, None)
    for item in triples:
        # Strip the 'wd:' prefix to get the bare Q-id; return the first match.
        s = item['about'][3:]
        return client.get(s, load=False)


def get_triples(client, subject=None, predicate=None, object=None):
    '''Yield (subject, predicate, object) tuples matching the given pattern.'''
    iterator = get_triples_internal(subject, predicate, object)
    for item in iterator:
        is_looking_for = item['@id'].startswith('wd:') and predicate.id in item
        if is_looking_for:
            s = subject
            if s is None:
                # Strip the 'wd:' prefix to get the bare Q-id.
                s = client.get(item['@id'][3:], load=False)
            o = object or item[predicate.id]
            yield (s, predicate, o)


def get_backlinks(client, predicate, object):
    '''Yield every subject that links to `object` through `predicate`.'''
    for subject, _, _ in get_triples(client, subject=None, predicate=predicate, object=object):
        yield subject
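

# Example wiring (a minimal sketch, not part of the module proper): it assumes
# network access to the Wikidata LDF endpoint, and the IDs P19 ("place of
# birth") and Q1748 ("Copenhagen") are only illustrative placeholders. The one
# hard requirement it demonstrates is that REQUEST_SESSION must be assigned
# before any fetch_by_url call.
if __name__ == '__main__':
    import wikidata.client

    logging.basicConfig(level=logging.INFO)
    REQUEST_SESSION = requests.Session()
    client = wikidata.client.Client()

    place_of_birth = client.get('P19', load=False)
    copenhagen = client.get('Q1748', load=False)

    # Every item that links to Q1748 through P19, i.e. people born in Copenhagen.
    for person in get_backlinks(client, place_of_birth, copenhagen):
        print(person)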