diff --git a/datagraph/wikidata_ext.py b/datagraph/wikidata_ext.py
index e79337f..213a11c 100644
--- a/datagraph/wikidata_ext.py
+++ b/datagraph/wikidata_ext.py
@@ -1,5 +1,7 @@
 import logging
 import urllib.parse
+from collections.abc import Iterator
+from typing import Any
 
 import ratelimit
 import requests
@@ -8,36 +10,37 @@ import wikidata.entity
 REQUEST_SESSION = None  # TODO?
 
 
-def concept_uri(obj):
-    assert isinstance(obj, wikidata.entity.Entity), obj
+def concept_uri(obj: wikidata.entity.Entity) -> urllib.parse.ParseResult:
     if obj.id.startswith('P'):
         return urllib.parse.urlparse(f'http://www.wikidata.org/prop/direct/{obj.id}')
-    elif obj.id.startswith('Q'):
+    if obj.id.startswith('Q'):
         return urllib.parse.urlparse(f'http://www.wikidata.org/entity/{obj.id}')
-    else:
-        assert False, 'TODO: ' + ojb.id
+
+    msg = f'Object id scheme not supported: {obj.id}'
+    raise ValueError(msg)
 
 
-def fmt_triple_value(obj, prefer_obj=False):
+def fmt_triple_value(obj: Any, prefer_obj=False) -> str:
     if obj is None:
         return ''
     if isinstance(obj, str):
         return f'"{obj}"'
-    elif isinstance(obj, urllib.parse.ParseResult):
+    if isinstance(obj, urllib.parse.ParseResult):
         return obj.geturl() if prefer_obj else fmt_triple_value(obj.geturl())
-    elif isinstance(obj, wikidata.entity.Entity):
+    if isinstance(obj, wikidata.entity.Entity):
         uri = concept_uri(obj)
         return fmt_triple_value(uri, True)
-    else:
-        assert False, type(obj)
+
+    msg = f'Type cannot be formatted: {type(obj)}'
+    raise TypeError(msg)
 
 
 @ratelimit.sleep_and_retry
-def fetch_by_url(url, headers):
+def fetch_by_url(url: str, headers: dict[str, str]):
     logging.debug('Fetching: %s', url)
-    assert (
-        REQUEST_SESSION is not None
-    ), 'REQUEST_SESSION must be set, before calling fetch_by_url'
+    if REQUEST_SESSION is None:
+        msg = 'REQUEST_SESSION must be set before calling fetch_by_url'
+        raise RuntimeError(msg)
     response = REQUEST_SESSION.get(url, headers=headers)
     if response.status_code != 200:
         logging.error('Got %s error message: %s', response.status_code, response.text)
@@ -49,22 +52,27 @@ ITEMS_PER_PAGE = 'http://www.w3.org/ns/hydra/core#itemsPerPage'
 TOTAL_ITEMS = 'http://www.w3.org/ns/hydra/core#totalItems'
 
 
-def fmt_params(subject, predicate, object):
-    derp = [x for x in [subject, predicate, object] if x]
-    assert len(derp) >= 1, str(derp)
-    params = {
+def fmt_params(subject: Any, predicate: Any, object_: Any) -> dict[str, str | int]:
+    entities = [x for x in [subject, predicate, object_] if x]
+    if len(entities) == 0:
+        msg = 'There are no entities for this query!'
+        raise RuntimeError(msg)
+    return {
         'subject': fmt_triple_value(subject, prefer_obj=True),
         'predicate': fmt_triple_value(predicate, prefer_obj=True),
-        'object': fmt_triple_value(object, prefer_obj=True),
+        'object': fmt_triple_value(object_, prefer_obj=True),
         'page': 1,
     }
-    return params
 
 
-def get_triples_count(subject=None, predicate=None, object=None):
+def get_triples_count(
+    subject: Any = None,
+    predicate: Any = None,
+    object_: Any = None,
+) -> dict[str, Any]:
     """Fetches first page in order to determine amount of items."""
-    params = fmt_params(subject, predicate, object)
-    url = (
+    params = fmt_params(subject, predicate, object_)
+    url: str = (
         requests.Request(url='https://query.wikidata.org/bigdata/ldf', params=params)
         .prepare()
         .url
@@ -84,12 +92,14 @@ def get_triples_count(subject=None, predicate=None, object=None):
             'items_total': item[TOTAL_ITEMS],
             'num_pages': int((item[TOTAL_ITEMS] - 1) / item[ITEMS_PER_PAGE] + 1),
         }
-    assert False
+
+    msg = 'Could not determine triple count'
+    raise RuntimeError(msg)
 
 
-def get_triples_internal(subject, predicate, object):
-    params = fmt_params(subject, predicate, object)
-    pagination_data = get_triples_count(subject, predicate, object)
+def get_triples_internal(subject: Any, predicate: Any, object_: Any) -> Iterator[dict]:
+    params = fmt_params(subject, predicate, object_)
+    pagination_data = get_triples_count(subject, predicate, object_)
     for current_page in range(1, pagination_data['num_pages'] + 1):
         params['page'] = current_page
         url = (
@@ -119,33 +129,46 @@ def get_triples_internal(subject, predicate, object):
 SCHEMA_ABOUT = urllib.parse.urlparse('http://schema.org/about')
 
 
-def get_wikidata_concept_for_wikipedia_page(client, wikipage):
+def get_wikidata_concept_for_wikipedia_page(
+    client,
+    wikipage: str,
+) -> wikidata.entity.Entity | None:
     triples = get_triples_internal(wikipage, SCHEMA_ABOUT, None)
-    triples = list(triples)
-    for item in triples:
+    for item in list(triples):
         s = item['about'][3:]
         return client.get(s, load=False)
+    return None
 
 
-def get_triples(client, subject=None, predicate=None, object=None):
-    triples = []
-    iterator = get_triples_internal(subject, predicate, object)
+def get_triples(
+    client,
+    subject: Any = None,
+    predicate: Any = None,
+    object_: Any = None,
+) -> Iterator[
+    tuple[wikidata.entity.Entity, wikidata.entity.Entity, wikidata.entity.Entity]
+]:
+    iterator = get_triples_internal(subject, predicate, object_)
     for item in iterator:
         is_looking_for = item['@id'].startswith('wd:') and predicate.id in item
         if is_looking_for:
             s = subject
             if s is None:
                 s = client.get(item['@id'][3:], load=False)
-            o = object or item[predicate.id]
+            o = object_ or item[predicate.id]
             yield (s, predicate, o)
         del item, is_looking_for
 
 
-def get_backlinks(client, predicate, object):
+def get_backlinks(
+    client,
+    predicate: Any,
+    object_: Any,
+) -> Iterator[wikidata.entity.Entity]:
     for subject, _, _ in get_triples(
         client,
         subject=None,
         predicate=predicate,
-        object=object,
+        object_=object_,
     ):
         yield subject
diff --git a/test/test_datagraph.py b/test/test_datagraph.py
index f35b089..2652e34 100644
--- a/test/test_datagraph.py
+++ b/test/test_datagraph.py
@@ -14,13 +14,13 @@ def test_version():
 
 def test_get_triples():
     client = wikidata.client.Client()
-    EQV_PROPERTY = client.get('P1628')
+    eqv_property = client.get('P1628')
     schema_root = 'https://schema.org/'
     schema_prop = 'image'
 
     triples_iter = datagraph.wikidata_ext.get_triples(
         client=client,
-        predicate=EQV_PROPERTY,
+        predicate=eqv_property,
-        object=f'{schema_root}{schema_prop}',
+        object_=f'{schema_root}{schema_prop}',
     )
     assert triples_iter is not None
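
A minimal usage sketch of the refactored API, not part of the patch itself:
the session wiring below is an assumption (any object exposing a requests-style
.get() works, since fetch_by_url only calls REQUEST_SESSION.get):

    import requests
    import wikidata.client

    import datagraph.wikidata_ext

    # Inject the module-level session before any fetch; fetch_by_url now
    # raises RuntimeError if REQUEST_SESSION is still None.
    datagraph.wikidata_ext.REQUEST_SESSION = requests.Session()

    client = wikidata.client.Client()
    eqv_property = client.get('P1628')  # Wikidata's "equivalent property"

    # Yield every subject that declares schema.org/image as an equivalent
    # property, mirroring the query in test_get_triples.
    for subject in datagraph.wikidata_ext.get_backlinks(
        client,
        predicate=eqv_property,
        object_='https://schema.org/image',
    ):
        print(subject)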