1
0

Code quality

This commit is contained in:
Jon Michael Aanes 2024-11-28 22:29:56 +01:00
parent a99f2c134d
commit 73baf6bd29
Signed by: Jmaa
SSH Key Fingerprint: SHA256:Ab0GfHGCblESJx7JRE4fj4bFy/KRpeLhi41y4pF3sNA
2 changed files with 61 additions and 38 deletions

View File

@ -1,5 +1,7 @@
import logging
import urllib.parse
from collections.abc import Iterator
from typing import Any
import ratelimit
import requests
@ -8,36 +10,37 @@ import wikidata.entity
REQUEST_SESSION = None # TODO?
def concept_uri(obj: wikidata.entity.Entity) -> urllib.parse.ParseResult:
    """Return the parsed Wikidata concept URI for an entity.

    Ids starting with ``P`` map to
    ``http://www.wikidata.org/prop/direct/<id>`` and ids starting with ``Q``
    map to ``http://www.wikidata.org/entity/<id>``.

    Args:
        obj: Entity whose ``id`` attribute selects the URI scheme.

    Returns:
        The URI parsed with ``urllib.parse.urlparse``.

    Raises:
        ValueError: If the id starts with neither ``P`` nor ``Q``.
    """
    if obj.id.startswith('P'):
        return urllib.parse.urlparse(f'http://www.wikidata.org/prop/direct/{obj.id}')
    if obj.id.startswith('Q'):
        return urllib.parse.urlparse(f'http://www.wikidata.org/entity/{obj.id}')
    # Raise instead of assert so the check survives `python -O`.
    msg = f'Object id scheme not supported: {obj.id}'
    raise ValueError(msg)
def fmt_triple_value(obj: Any, prefer_obj: bool = False) -> str:
    """Format a triple component as a query-parameter string.

    Args:
        obj: ``None``, a ``str``, a ``urllib.parse.ParseResult`` or a
            ``wikidata.entity.Entity``.
        prefer_obj: When true, a parsed URI is returned as a bare URL;
            otherwise it is wrapped in double quotes like any other string.

    Returns:
        ``''`` for ``None``, the value wrapped in double quotes for a string,
        the (optionally quoted) URL for a parsed URI, and the bare concept
        URL for an entity.

    Raises:
        TypeError: If ``obj`` is of any other type.
    """
    if obj is None:
        return ''
    if isinstance(obj, str):
        return f'"{obj}"'
    if isinstance(obj, urllib.parse.ParseResult):
        return obj.geturl() if prefer_obj else fmt_triple_value(obj.geturl())
    if isinstance(obj, wikidata.entity.Entity):
        # Entities are always emitted as bare concept URIs.
        return fmt_triple_value(concept_uri(obj), True)
    msg = f'Type cannot be formatted: {type(obj)}'
    raise TypeError(msg)
@ratelimit.sleep_and_retry
def fetch_by_url(url, headers):
def fetch_by_url(url: str, headers: dict[str, str]):
logging.debug('Fetching: %s', url)
assert (
REQUEST_SESSION is not None
), 'REQUEST_SESSION must be set, before calling fetch_by_url'
if REQUEST_SESSION is None:
msg = 'REQUEST_SESSION must be set, before calling fetch_by_url'
raise RuntimeError(msg)
response = REQUEST_SESSION.get(url, headers=headers)
if response.status_code != 200:
logging.error('Got %s error message: %s', response.status_code, response.text)
@ -49,22 +52,27 @@ ITEMS_PER_PAGE = 'http://www.w3.org/ns/hydra/core#itemsPerPage'
TOTAL_ITEMS = 'http://www.w3.org/ns/hydra/core#totalItems'
def fmt_params(subject: Any, predicate: Any, object_: Any) -> dict[str, str | int]:
    """Build the query parameters for a triple-pattern request.

    Args:
        subject: Subject of the pattern, or a falsy value to leave it open.
        predicate: Predicate of the pattern, or a falsy value to leave it open.
        object_: Object of the pattern, or a falsy value to leave it open.

    Returns:
        A dict with the formatted ``subject``, ``predicate`` and ``object``
        values and ``page`` set to ``1``.

    Raises:
        RuntimeError: If all three components are falsy.
    """
    if not any((subject, predicate, object_)):
        msg = 'There are no entities for this query!'
        raise RuntimeError(msg)
    return {
        'subject': fmt_triple_value(subject, prefer_obj=True),
        'predicate': fmt_triple_value(predicate, prefer_obj=True),
        'object': fmt_triple_value(object_, prefer_obj=True),
        'page': 1,
    }
def get_triples_count(subject=None, predicate=None, object=None):
def get_triples_count(
subject: Any = None,
predicate: Any = None,
object_: Any = None,
) -> dict[str, Any]:
"""Fetches first page in order to determine amount of items."""
params = fmt_params(subject, predicate, object)
url = (
params = fmt_params(subject, predicate, object_)
url: str = (
requests.Request(url='https://query.wikidata.org/bigdata/ldf', params=params)
.prepare()
.url
@ -84,12 +92,14 @@ def get_triples_count(subject=None, predicate=None, object=None):
'items_total': item[TOTAL_ITEMS],
'num_pages': int((item[TOTAL_ITEMS] - 1) / item[ITEMS_PER_PAGE] + 1),
}
assert False
msg = 'Could not determine triple count'
raise RuntimeError(msg)
def get_triples_internal(subject, predicate, object):
params = fmt_params(subject, predicate, object)
pagination_data = get_triples_count(subject, predicate, object)
def get_triples_internal(subject: Any, predicate: Any, object_: Any) -> Iterator[dict]:
params = fmt_params(subject, predicate, object_)
pagination_data = get_triples_count(subject, predicate, object_)
for current_page in range(1, pagination_data['num_pages'] + 1):
params['page'] = current_page
url = (
@ -119,33 +129,46 @@ def get_triples_internal(subject, predicate, object):
SCHEMA_ABOUT = urllib.parse.urlparse('http://schema.org/about')
def get_wikidata_concept_for_wikipedia_page(
    client,
    wikipage: str,
) -> wikidata.entity.Entity | None:
    """Look up the Wikidata concept that a Wikipedia page is about.

    Queries for triples with ``wikipage`` as subject and ``SCHEMA_ABOUT`` as
    predicate and resolves the first result through ``client``.

    Args:
        client: Object whose ``get(id, load=False)`` returns an entity.
        wikipage: Wikipedia page identifier used as the triple subject.

    Returns:
        The entity for the first matching triple, or ``None`` if the query
        produced no items.
    """
    # Only the first item is used, so take it lazily instead of
    # materialising every result page.
    first = next(iter(get_triples_internal(wikipage, SCHEMA_ABOUT, None)), None)
    if first is None:
        return None
    # Drops the first three characters of the 'about' value; presumably a
    # 'wd:' prefix -- TODO confirm against the endpoint's response format.
    return client.get(first['about'][3:], load=False)
def get_triples(
    client,
    subject: Any = None,
    predicate: Any = None,
    object_: Any = None,
) -> Iterator[
    tuple[wikidata.entity.Entity, wikidata.entity.Entity, wikidata.entity.Entity]
]:
    """Yield ``(subject, predicate, object)`` triples matching a pattern.

    Only result items whose ``'@id'`` starts with ``'wd:'`` and that contain
    the key ``predicate.id`` are yielded.

    Args:
        client: Object whose ``get(id, load=False)`` returns an entity; used
            to resolve the subject when ``subject`` is ``None``.
        subject: Fixed subject, or ``None`` to take it from each item.
        predicate: Predicate to match; required because result items are
            filtered on ``predicate.id``.
        object_: Fixed object; when falsy, the item's value for
            ``predicate.id`` is used instead.

    Raises:
        ValueError: If ``predicate`` is ``None`` (raised on first iteration,
            since this is a generator).
    """
    if predicate is None:
        # Fail with a clear message instead of an AttributeError on
        # `predicate.id` further down.
        msg = 'get_triples requires a predicate to match result items against'
        raise ValueError(msg)
    for item in get_triples_internal(subject, predicate, object_):
        if not (item['@id'].startswith('wd:') and predicate.id in item):
            continue
        s = subject
        if s is None:
            # Strip the 'wd:' prefix checked above to get the bare entity id.
            s = client.get(item['@id'][3:], load=False)
        o = object_ or item[predicate.id]
        yield (s, predicate, o)
def get_backlinks(
    client,
    predicate: Any,
    object_: Any,
) -> Iterator[wikidata.entity.Entity]:
    """Yield the subject of every triple matching ``predicate`` and ``object_``.

    Args:
        client: Passed through to ``get_triples``.
        predicate: Predicate the triples must have.
        object_: Object the triples must have.
    """
    matches = get_triples(
        client,
        subject=None,
        predicate=predicate,
        object_=object_,
    )
    for subject, _, _ in matches:
        yield subject

View File

@ -14,13 +14,13 @@ def test_version():
def test_get_triples():
client = wikidata.client.Client()
EQV_PROPERTY = client.get('P1628')
eqv_property = client.get('P1628')
schema_root = 'https://schema.org/'
schema_prop = 'image'
triples_iter = datagraph.wikidata_ext.get_triples(
client=client,
predicate=EQV_PROPERTY,
predicate=eqv_property,
object=f'{schema_root}{schema_prop}',
)
assert triples_iter is not None