# Reviewer notes for this patch against wikidata_ext.py. These lines precede
# the "diff --git" header and are commentary only, not patch content.
#
# NOTE(review): the patch text arrived with its line breaks collapsed. The
# one-record-per-line layout, the indentation and the placement of blank
# context lines below are reconstructed from the hunk headers' line counts --
# verify them against the target file before applying.
#
# What the hunks change:
#   * Adds a module-level REQUEST_SESSION = requests.Session() and makes
#     fetch_by_url issue its GET through it instead of calling requests.get().
#   * fetch_by_url: logs the fetch at WARNING instead of DEBUG, puts the URL
#     in the error message instead of repr((subject, predicate, object)), and
#     returns `response` where the removed line returned `request`.
#   * fmt_triple_value: the Entity branch now always passes True as the
#     second argument instead of forwarding prefer_obj.
#   * get_triples_count / get_triples_internal: decode the body with
#     response.json() instead of json.loads(result); the trailing `del` is
#     renamed to match.
#   * Adds get_backlinks(), a generator that yields the first element of each
#     tuple produced by get_triples() for the given predicate and object.
#
# NOTE(review): fetch_by_url still returns None on a non-200 status, and both
# visible call sites invoke response.json() on the very next line with no
# None check -- that path looks like it would raise AttributeError. Confirm
# whether that is intended.
diff --git a/wikidata_ext.py b/wikidata_ext.py
index 9e80502..1d9f4aa 100644
--- a/wikidata_ext.py
+++ b/wikidata_ext.py
@@ -6,6 +6,8 @@ import requests
 import json
 import logging
 
+REQUEST_SESSION = requests.Session()
+
 def concept_uri(obj):
     assert isinstance(obj, wikidata.entity.Entity), obj
     if obj.id.startswith('P'):
@@ -24,19 +26,19 @@ def fmt_triple_value(obj, prefer_obj = False):
         return obj.geturl() if prefer_obj else fmt_triple_value(obj.geturl())
     elif isinstance(obj, wikidata.entity.Entity):
         uri = concept_uri(obj)
-        return fmt_triple_value(uri, prefer_obj)
+        return fmt_triple_value(uri, True)
     else:
         assert False, type(obj)
 
 @ratelimit.sleep_and_retry
 @ratelimit.limits(calls=10, period=60)
 def fetch_by_url(url, headers):
-    logging.debug('Fetching: %s', url)
-    result = requests.get(url, headers = headers)
-    if result.status_code != 200:
-        logging.error('Got %s error message: %s', result.status_code, repr((subject, predicate, object)))
+    logging.warning('Fetching: %s', url)
+    response = REQUEST_SESSION.get(url, headers = headers)
+    if response.status_code != 200:
+        logging.error('Got %s error message: %s', response.status_code, url)
         return None
-    return request
+    return response
 
 ITEMS_PER_PAGE = "http://www.w3.org/ns/hydra/core#itemsPerPage"
 TOTAL_ITEMS = "http://www.w3.org/ns/hydra/core#totalItems"
@@ -58,8 +60,8 @@ def get_triples_count(subject = None, predicate = None, object = None):
     '''
     params = fmt_params(subject, predicate, object)
     url = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = params).prepare().url
-    result = fetch_by_url(url, headers = {'accept': 'application/ld+json'})
-    json_data = json.loads(result)
+    response = fetch_by_url(url, headers = {'accept': 'application/ld+json'})
+    json_data = response.json()
     for item in json_data['@graph']:
         if TOTAL_ITEMS in item:
             return {
@@ -75,8 +77,8 @@ def get_triples_internal(subject, predicate, object):
     for current_page in range(1, pagination_data['num_pages']+1):
         params['page'] = current_page
         url = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = params).prepare().url
-        result = fetch_by_url(url, headers = {'accept': 'application/ld+json'})
-        json_data = json.loads(result)
+        response = fetch_by_url(url, headers = {'accept': 'application/ld+json'})
+        json_data = response.json()
 
         for item in json_data['@graph']:
             if item['@id'].startswith('_:b'):
@@ -88,7 +90,7 @@ def get_triples_internal(subject, predicate, object):
             yield item
 
         # Bookkeeping
-        del url, result, json_data
+        del url, response, json_data
 
 
 SCHEMA_ABOUT = urllib.parse.urlparse('http://schema.org/about')
@@ -112,3 +114,7 @@ def get_triples(client, subject = None, predicate = None, object = None):
             yield (s, predicate, o)
         del item, is_looking_for
 
+def get_backlinks(client, predicate, object):
+    for subject, _, _ in get_triples(client, subject = None, predicate = predicate, object = object):
+        yield subject
+