1
0

Lots of small fixes

This commit is contained in:
Jon Michael Aanes 2023-08-29 09:20:14 +02:00
parent 84f0abfc92
commit c45bbc6157

View File

@ -6,6 +6,8 @@ import requests
import json import json
import logging import logging
# Module-level requests session; fetch_by_url issues its GET requests through it.
REQUEST_SESSION = requests.Session()
def concept_uri(obj): def concept_uri(obj):
assert isinstance(obj, wikidata.entity.Entity), obj assert isinstance(obj, wikidata.entity.Entity), obj
if obj.id.startswith('P'): if obj.id.startswith('P'):
@ -24,19 +26,19 @@ def fmt_triple_value(obj, prefer_obj = False):
return obj.geturl() if prefer_obj else fmt_triple_value(obj.geturl()) return obj.geturl() if prefer_obj else fmt_triple_value(obj.geturl())
elif isinstance(obj, wikidata.entity.Entity): elif isinstance(obj, wikidata.entity.Entity):
uri = concept_uri(obj) uri = concept_uri(obj)
return fmt_triple_value(uri, prefer_obj) return fmt_triple_value(uri, True)
else: else:
assert False, type(obj) assert False, type(obj)
@ratelimit.sleep_and_retry
@ratelimit.limits(calls=10, period=60)
def fetch_by_url(url, headers):
    """Fetch ``url`` with a GET request through the shared session.

    The ``ratelimit`` decorators restrict this function to 10 calls per
    period of 60 (seconds, per the ratelimit library's convention).

    Args:
        url: Fully prepared URL to request.
        headers: HTTP headers passed along with the request.

    Returns:
        The response object when the status code is 200; otherwise ``None``
        after logging the status code and URL. Callers must be prepared for
        the ``None`` case before using the result.
    """
    logging.warning('Fetching: %s', url)
    response = REQUEST_SESSION.get(url, headers = headers)
    if response.status_code != 200:
        # Report the failing URL; only ``url`` and ``headers`` are in scope here.
        logging.error('Got %s error message: %s', response.status_code, url)
        return None
    return response
# Hydra vocabulary keys looked up in the JSON-LD '@graph' items of a response.
ITEMS_PER_PAGE = "http://www.w3.org/ns/hydra/core#itemsPerPage"
TOTAL_ITEMS = "http://www.w3.org/ns/hydra/core#totalItems"
@ -58,8 +60,8 @@ def get_triples_count(subject = None, predicate = None, object = None):
''' '''
params = fmt_params(subject, predicate, object) params = fmt_params(subject, predicate, object)
url = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = params).prepare().url url = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = params).prepare().url
result = fetch_by_url(url, headers = {'accept': 'application/ld+json'}) response = fetch_by_url(url, headers = {'accept': 'application/ld+json'})
json_data = json.loads(result) json_data = response.json()
for item in json_data['@graph']: for item in json_data['@graph']:
if TOTAL_ITEMS in item: if TOTAL_ITEMS in item:
return { return {
@ -75,8 +77,8 @@ def get_triples_internal(subject, predicate, object):
for current_page in range(1, pagination_data['num_pages']+1): for current_page in range(1, pagination_data['num_pages']+1):
params['page'] = current_page params['page'] = current_page
url = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = params).prepare().url url = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = params).prepare().url
result = fetch_by_url(url, headers = {'accept': 'application/ld+json'}) response = fetch_by_url(url, headers = {'accept': 'application/ld+json'})
json_data = json.loads(result) json_data = response.json()
for item in json_data['@graph']: for item in json_data['@graph']:
if item['@id'].startswith('_:b'): if item['@id'].startswith('_:b'):
@ -88,7 +90,7 @@ def get_triples_internal(subject, predicate, object):
yield item yield item
# Bookkeeping # Bookkeeping
del url, result, json_data del url, response, json_data
# Pre-parsed form of the schema.org "about" predicate URL.
SCHEMA_ABOUT = urllib.parse.urlparse('http://schema.org/about')
@ -112,3 +114,7 @@ def get_triples(client, subject = None, predicate = None, object = None):
yield (s, predicate, o) yield (s, predicate, o)
del item, is_looking_for del item, is_looking_for
def get_backlinks(client, predicate, object):
    """Yield the subject of every triple matching ``predicate`` and ``object``.

    Thin wrapper around ``get_triples`` that leaves the subject unconstrained
    and discards the predicate and object of each returned triple.

    Args:
        client: Passed through unchanged to ``get_triples``.
        predicate: Predicate the triples must have.
        object: Object the triples must have.

    Yields:
        The first element (subject) of each triple produced by ``get_triples``.
    """
    for subject, _, _ in get_triples(client, subject = None, predicate = predicate, object = object):
        yield subject