datagraph/wikidata_ext.py


import ratelimit
import urllib.parse
import wikidata.entity
import requests
import json
import logging

# HTTP session used by fetch_by_url(), which asserts it is not None and calls
# `.get(url, headers=...)` on it. No default is created here, so it has to be
# assigned before the first fetch. Presumably a requests.Session -- confirm.
REQUEST_SESSION = None # TODO?

def concept_uri(obj):
    '''
    Builds the parsed concept URI for a Wikidata entity.

    Ids starting with 'P' map to http://www.wikidata.org/prop/direct/<id>,
    ids starting with 'Q' map to http://www.wikidata.org/entity/<id>.

    Returns a urllib.parse.ParseResult.
    Raises AssertionError if obj is not a wikidata.entity.Entity, or if its id
    starts with neither 'P' nor 'Q'.
    '''
    assert isinstance(obj, wikidata.entity.Entity), obj
    if obj.id.startswith('P'):
        return urllib.parse.urlparse('http://www.wikidata.org/prop/direct/{}'.format(obj.id))
    elif obj.id.startswith('Q'):
        return urllib.parse.urlparse('http://www.wikidata.org/entity/{}'.format(obj.id))
    else:
        # The message used to reference the misspelled name `ojb`, which raised
        # NameError instead of the intended assertion. Raising explicitly also
        # keeps the failure when assertions are disabled with -O.
        raise AssertionError("TODO: " + obj.id)

def fmt_triple_value(obj, prefer_obj = False):
    '''
    Renders one term of a triple pattern as a string.

    - None becomes the empty string.
    - A str is wrapped in double quotes.
    - A urllib.parse.ParseResult becomes its URL, bare when prefer_obj is
      true and wrapped in double quotes otherwise.
    - A wikidata.entity.Entity becomes the bare URL from concept_uri().

    Any other type fails an assertion.
    '''
    if obj is None:
        return ''

    if isinstance(obj, str):
        return '"{}"'.format(obj)

    if isinstance(obj, urllib.parse.ParseResult):
        url = obj.geturl()
        if prefer_obj:
            return url
        # Not preferring the object form: render the URL like a plain string.
        return fmt_triple_value(url)

    if isinstance(obj, wikidata.entity.Entity):
        # Entities are always rendered as their bare concept URI.
        return fmt_triple_value(concept_uri(obj), True)

    assert False, type(obj)

@ratelimit.sleep_and_retry
def fetch_by_url(url, headers):
    '''
    Performs a GET request for url through the module-level REQUEST_SESSION.

    Returns the response object when the status code is 200. For any other
    status the status code and body text are logged as an error and None is
    returned.

    Fails an assertion if REQUEST_SESSION has not been set.

    NOTE(review): only `ratelimit.sleep_and_retry` is applied here; no
    `ratelimit.limits` decorator is visible on this function -- confirm that
    requests are actually being throttled.
    '''
    logging.debug('Fetching: %s', url)
    assert REQUEST_SESSION is not None, 'REQUEST_SESSION must be set, before calling fetch_by_url'
    reply = REQUEST_SESSION.get(url, headers = headers)
    if reply.status_code == 200:
        return reply
    logging.error('Got %s error message: %s', reply.status_code, reply.text)
    return None

# Hydra core vocabulary IRIs. get_triples_count() looks these keys up on the
# nodes of a response's '@graph' list to read the page size and the total
# number of items.
ITEMS_PER_PAGE = "http://www.w3.org/ns/hydra/core#itemsPerPage"
TOTAL_ITEMS = "http://www.w3.org/ns/hydra/core#totalItems"

def fmt_params(subject, predicate, object):
    '''
    Builds the query-parameter dict for a triple pattern request.

    Each of subject, predicate and object is rendered with
    fmt_triple_value(..., prefer_obj = True); 'page' starts at 1.

    Fails an assertion unless at least one of the three terms is truthy.
    '''
    given = [term for term in (subject, predicate, object) if term]
    assert given, str(given)

    terms = (('subject', subject), ('predicate', predicate), ('object', object))
    params = {name: fmt_triple_value(term, prefer_obj = True) for name, term in terms}
    params['page'] = 1
    return params

def get_triples_count(subject = None, predicate = None, object = None):
    '''
    Requests page 1 of a triple pattern and reads its pagination metadata.

    Returns a dict with 'items_per_page', 'items_total' and 'num_pages'.
    All three are 0 when fetch_by_url() returns None (failed request).

    Fails an assertion if no node in the response's '@graph' list carries
    the TOTAL_ITEMS key.
    '''
    query = fmt_params(subject, predicate, object)
    prepared = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = query).prepare()
    response = fetch_by_url(prepared.url, headers = {'accept': 'application/ld+json'})
    if response is None:
        return {'items_per_page': 0, 'items_total': 0, 'num_pages': 0}

    for node in response.json()['@graph']:
        if TOTAL_ITEMS not in node:
            continue
        per_page = node[ITEMS_PER_PAGE]
        total = node[TOTAL_ITEMS]
        # Number of pages needed to hold `total` items (0 when total is 0).
        pages = int((total - 1) / per_page + 1)
        return {
            'items_per_page': per_page,
            'items_total': total,
            'num_pages': pages,
        }
    assert False

def get_triples_internal(subject, predicate, object):
    '''
    Yields the result nodes of every page for a triple pattern.

    The page count comes from get_triples_count(). Each page is then requested
    from the LDF endpoint and the entries of its '@graph' list are yielded,
    except those whose '@id' starts with '_:b', with the LDF endpoint URL, or
    with 'http://www.wikidata.org/.well-known/'.

    A page that fetch_by_url() could not retrieve (it returns None after
    logging the HTTP error) is skipped rather than crashing on `None.json()`,
    in line with how get_triples_count() tolerates a failed fetch.

    Note that page 1 is requested twice: once by get_triples_count() and once
    by the loop below.
    '''
    # '@id' prefixes of nodes that are filtered out of the results.
    skipped_prefixes = (
        '_:b',
        'https://query.wikidata.org/bigdata/ldf',
        'http://www.wikidata.org/.well-known/',
    )

    params = fmt_params(subject, predicate, object)
    pagination_data = get_triples_count(subject, predicate, object)
    for current_page in range(1, pagination_data['num_pages']+1):
        params['page'] = current_page
        url = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = params).prepare().url
        response = fetch_by_url(url, headers = {'accept': 'application/ld+json'})
        if response is None:
            # fetch_by_url already logged the status code and body.
            logging.warning('Skipping page %s: fetch failed', current_page)
            continue
        json_data = response.json()

        for item in json_data['@graph']:
            if item['@id'].startswith(skipped_prefixes):
                continue
            yield item

        # Bookkeeping: drop this page's payload before requesting the next one.
        del url, response, json_data

# Parsed URL of the schema.org `about` property; used as the predicate by
# get_wikidata_concept_for_wikipedia_page().
SCHEMA_ABOUT = urllib.parse.urlparse('http://schema.org/about')

def get_wikidata_concept_for_wikipedia_page(client, wikipage):
    '''
    Looks up the entity that a Wikipedia page is `schema:about`.

    Queries the triples (wikipage, SCHEMA_ABOUT, ?) and returns
    client.get(<id>, load = False) for the first result node, where <id> is
    that node's 'about' value with its first three characters removed
    (presumably a 'wd:' prefix -- confirm against the endpoint's JSON-LD).

    Returns None when the query yields no result nodes.
    '''
    # Only the first match is used, so consume the generator lazily instead of
    # materialising (and fetching) every result page up front.
    for item in get_triples_internal(wikipage, SCHEMA_ABOUT, None):
        s = item['about'][3:]
        return client.get(s, load = False)
    return None

def get_triples(client, subject = None, predicate = None, object = None):
    '''
    Generates (subject, predicate, object) tuples for a triple pattern.

    Only result nodes whose '@id' starts with 'wd:' and that contain the key
    predicate.id are used. When subject is None it is resolved per node via
    client.get(<'@id' without the 'wd:' prefix>, load = False). When object is
    falsy, the node's value under predicate.id is used instead.

    NOTE(review): predicate defaults to None, but predicate.id is read for
    every 'wd:' node, so a None predicate raises AttributeError there.
    '''
    for node in get_triples_internal(subject, predicate, object):
        node_id = node['@id']
        if not (node_id.startswith('wd:') and predicate.id in node):
            continue

        if subject is None:
            found_subject = client.get(node_id[3:], load = False)
        else:
            found_subject = subject
        found_object = object or node[predicate.id]
        yield (found_subject, predicate, found_object)

def get_backlinks(client, predicate, object):
    '''
    Generates the subject of every triple matching (?, predicate, object),
    as produced by get_triples().
    '''
    matches = get_triples(client, subject = None, predicate = predicate, object = object)
    for triple in matches:
        backlink, _predicate, _object = triple
        yield backlink
Significant additions 2023-03-06 22:41:49 +00:00
			`import ratelimit`
			`import urllib.parse`
			`import wikidata.entity`
			`import requests`
			`import json`
			`import logging`

No default Session 2023-12-03 22:20:29 +00:00			`REQUEST_SESSION = None # TODO?`
Lots of small fixes 2023-08-29 07:20:14 +00:00
Significant additions 2023-03-06 22:41:49 +00:00			`def concept_uri(obj):`
			`assert isinstance(obj, wikidata.entity.Entity), obj`
			`if obj.id.startswith('P'):`
			`return urllib.parse.urlparse('http://www.wikidata.org/prop/direct/{}'.format(obj.id))`
			`elif obj.id.startswith('Q'):`
			`return urllib.parse.urlparse('http://www.wikidata.org/entity/{}'.format(obj.id))`
			`else:`
			`assert False, "TODO: " + ojb.id`

			`def fmt_triple_value(obj, prefer_obj = False):`
			`if obj is None:`
			`return ''`
			`if isinstance(obj, str):`
			`return '"{}"'.format(obj)`
			`elif isinstance(obj, urllib.parse.ParseResult):`
			`return obj.geturl() if prefer_obj else fmt_triple_value(obj.geturl())`
			`elif isinstance(obj, wikidata.entity.Entity):`
			`uri = concept_uri(obj)`
Lots of small fixes 2023-08-29 07:20:14 +00:00			`return fmt_triple_value(uri, True)`
Significant additions 2023-03-06 22:41:49 +00:00			`else:`
			`assert False, type(obj)`

			`@ratelimit.sleep_and_retry`
			`def fetch_by_url(url, headers):`
Canonical key system 2023-09-17 10:09:17 +00:00			`logging.debug('Fetching: %s', url)`
No default Session 2023-12-03 22:20:29 +00:00			`assert REQUEST_SESSION is not None, 'REQUEST_SESSION must be set, before calling fetch_by_url'`
Lots of small fixes 2023-08-29 07:20:14 +00:00			`response = REQUEST_SESSION.get(url, headers = headers)`
			`if response.status_code != 200:`
Canonical key system 2023-09-17 10:09:17 +00:00			`logging.error('Got %s error message: %s', response.status_code, response.text)`
Significant additions 2023-03-06 22:41:49 +00:00			`return None`
Lots of small fixes 2023-08-29 07:20:14 +00:00			`return response`
Significant additions 2023-03-06 22:41:49 +00:00
			`ITEMS_PER_PAGE = "http://www.w3.org/ns/hydra/core#itemsPerPage"`
			`TOTAL_ITEMS = "http://www.w3.org/ns/hydra/core#totalItems"`

			`def fmt_params(subject, predicate, object):`
			`derp = [x for x in [subject, predicate, object] if x]`
			`assert len(derp) >= 1, str(derp)`
			`params = {`
			`'subject': fmt_triple_value(subject, prefer_obj = True),`
			`'predicate': fmt_triple_value(predicate, prefer_obj = True),`
Canonical key system 2023-09-17 10:09:17 +00:00			`'object': fmt_triple_value(object, prefer_obj = True),`
Significant additions 2023-03-06 22:41:49 +00:00			`'page': 1,`
			`}`
			`return params`

			`def get_triples_count(subject = None, predicate = None, object = None):`
			`'''`
			`Fetches first page in order to determine amount of items.`
			`'''`
			`params = fmt_params(subject, predicate, object)`
			`url = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = params).prepare().url`
Lots of small fixes 2023-08-29 07:20:14 +00:00			`response = fetch_by_url(url, headers = {'accept': 'application/ld+json'})`
Canonical key system 2023-09-17 10:09:17 +00:00			`if response is None:`
			`return {`
			`'items_per_page': 0,`
			`'items_total': 0,`
			`'num_pages': 0,`
			`}`
Lots of small fixes 2023-08-29 07:20:14 +00:00			`json_data = response.json()`
Significant additions 2023-03-06 22:41:49 +00:00			`for item in json_data['@graph']:`
			`if TOTAL_ITEMS in item:`
			`return {`
			`'items_per_page': item[ITEMS_PER_PAGE],`
			`'items_total': item[TOTAL_ITEMS],`
			`'num_pages': int((item[TOTAL_ITEMS] - 1) / item[ITEMS_PER_PAGE] + 1),`
			`}`
			`assert False`

			`def get_triples_internal(subject, predicate, object):`
			`params = fmt_params(subject, predicate, object)`
			`pagination_data = get_triples_count(subject, predicate, object)`
			`for current_page in range(1, pagination_data['num_pages']+1):`
			`params['page'] = current_page`
			`url = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = params).prepare().url`
Lots of small fixes 2023-08-29 07:20:14 +00:00			`response = fetch_by_url(url, headers = {'accept': 'application/ld+json'})`
			`json_data = response.json()`
Significant additions 2023-03-06 22:41:49 +00:00
			`for item in json_data['@graph']:`
			`if item['@id'].startswith('_:b'):`
			`continue`
			`if item['@id'].startswith('https://query.wikidata.org/bigdata/ldf'):`
			`continue`
			`if item['@id'].startswith('http://www.wikidata.org/.well-known/'):`
			`continue`
			`yield item`

			`# Bookkeeping`
Lots of small fixes 2023-08-29 07:20:14 +00:00			`del url, response, json_data`
Significant additions 2023-03-06 22:41:49 +00:00
			`SCHEMA_ABOUT = urllib.parse.urlparse('http://schema.org/about')`

			`def get_wikidata_concept_for_wikipedia_page(client, wikipage):`
			`triples = get_triples_internal(wikipage, SCHEMA_ABOUT, None);`
			`triples = list(triples)`
			`for item in triples:`
			`s = item['about'][3:]`
			`return client.get(s, load = False)`

			`def get_triples(client, subject = None, predicate = None, object = None):`
			`triples = []`
			`iterator = get_triples_internal(subject, predicate, object)`
			`for item in iterator:`
			`is_looking_for = item['@id'].startswith('wd:') and predicate.id in item`
			`if is_looking_for :`
			`s = subject`
			`if s is None:`
			`s = client.get(item['@id'][3:], load = False)`
			`o = object or item[predicate.id]`
			`yield (s, predicate, o)`
			`del item, is_looking_for`

Lots of small fixes 2023-08-29 07:20:14 +00:00			`def get_backlinks(client, predicate, object):`
			`for subject, _, _ in get_triples(client, subject = None, predicate = predicate, object = object):`
			`yield subject`