Significant additions

2023-03-06 23:41:49 +01:00 · 2023-03-06 23:41:49 +01:00 · d39fbbfed3
commit d39fbbfed3
parent 94fa9f10b9
5 changed files with 285 additions and 31 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
+
+/__pycache__/
+
--- a/schemeld.py
+++ b/schemeld.py
@ -1,5 +1,14 @@

+import logging
 import urllib.parse
+import wikidata.entity
+import datetime
+from dataclasses import dataclass
+from enforce_typing import enforce_types
+from typing import List, Set, Optional, Union
+from enum import Enum
+
+STRICT_VALIDATION = True

 class Concept(object):

@ -10,10 +19,27 @@ class Concept(object):
    def get(self, key, *args, **kwargs):
        return self.data.get(self.canonical_key(key, *args, **kwargs))

+    def keys(self):
+        """Return the key view of the underlying data mapping."""
+        stored = self.data
+        return stored.keys()
+
+    def setdefault(self, key, value):
+        """Store value under the canonical form of key unless one is present.
+
+        Returns whatever is stored under that key afterwards.
+        """
+        canonical = self.canonical_key(key)
+        return self.data.setdefault(canonical, value)
+
+    def to_dict(self):
+        """Return a shallow copy of the stored data as a plain dict."""
+        return dict(self.data.items())
+
    def __getitem__(self, key):
        return self.data[self.canonical_key(key)]

    def __setitem__(self, key, value):
+        # Store value under the canonical form of key. With STRICT_VALIDATION
+        # enabled, every key other than '@id' must map to a list of dicts that
+        # each contain a 'value' entry and hold no list-valued entries;
+        # violations raise AssertionError with the offending value.
+        if STRICT_VALIDATION:
+            if not isinstance(key, str) or key != '@id':
+                assert isinstance(value, list), value
+                for v in value:
+                    assert isinstance(v, dict), value
+                    assert 'value' in v, value
+                    for subk in v:
+                        assert not isinstance(v[subk], list), value
        self.data[self.canonical_key(key)] = value

    def __contains__(self, key):
@ -23,13 +49,24 @@ class Concept(object):
        del self.data[self.canonical_key(key)]

    def canonical_key(self, key):
+        # Return the form of key used for storage in self.data. ParseResult
+        # keys, non-string keys and '@'-prefixed strings are returned
+        # unchanged; any other string is returned unchanged when no context is
+        # set, otherwise it replaces the path of the context (assumes
+        # self.context is a urllib.parse.ParseResult -- TODO confirm against
+        # the constructor, which is not visible here).
+        if isinstance(key, urllib.parse.ParseResult):
+            return key
        if not isinstance(key, str):
            return key
        elif key.startswith('@'):
            return key
-        elif key.startswith(self.context):
+        if self.context is None:
            return key
-        return self.context + key
+        return self.context._replace(path = key)
+
+    def __repr__(self):
+        """Describe the concept by its '@id' when it has a truthy one,
+        otherwise by its full data mapping."""
+        identifier = self.data.get('@id')
+        if not identifier:
+            return 'Concept '+str(self.data)
+        return 'Concept {{ @id = {} }}'.format(identifier)
+
+    def __str__(self):
+        """Use the same text as repr() for the string form."""
+        text = repr(self)
+        return text

 def determine_concepts_internal(json, context, outputs):
    if isinstance(json, list):
@ -37,9 +74,10 @@ def determine_concepts_internal(json, context, outputs):
            determine_concepts_internal(m, context, outputs)
        return

-    assert isinstance(json, dict)
-    context = json.get('@context', context)
-    assert urllib.parse.urlparse(context).netloc == 'schema.org'
+    assert isinstance(json, dict), type(json)
+    context = urllib.parse.urlparse(json.get('@context', context))
+    assert context.netloc == 'schema.org'
+
    if '@graph' in json:
        determine_concepts_internal(json['@graph'], context, outputs)
    else:
@ -49,3 +87,113 @@ def determine_concepts(json):
    concepts = []
    determine_concepts_internal(json, '', concepts)
    return concepts
+
+# Wikidata property ids that fmt_value rewrites from a 'P' prefix to an 'S'
+# prefix: P813 (retrieved), P854 (reference URL), P248 (stated in) and
+# P143 (imported from Wikimedia project).
+REFERENCE_PROPERTIES = {'P813', 'P854', 'P248', 'P143'}
+
+def fmt_value(c, prefer_reference = False):
+    """Format a value as a QuickStatements column string.
+
+    Strings are wrapped in double quotes (unescaped), Concepts are formatted
+    through their '@id', wikidata entities through their id (with an 'S'
+    prefix for ids in REFERENCE_PROPERTIES), parsed URLs as a bare URL when
+    prefer_reference is set and as a quoted string otherwise, and dates and
+    datetimes as '+<iso>/11' time literals. Anything else goes through str().
+    A Concept without '@id' is logged as an error and formatted as ''.
+    """
+    if isinstance(c, str):
+        return '"{}"'.format(c) # TODO: Escape
+
+    if isinstance(c, Concept):
+        if '@id' not in c:
+            logging.error('Could not determine useful id for %s', c)
+            return ''
+        return fmt_value(c['@id'], prefer_reference)
+
+    if isinstance(c, wikidata.entity.Entity):
+        entity_id = c.id
+        if isinstance(entity_id, int):
+            entity_id = 'P{}'.format(entity_id)
+        if entity_id in REFERENCE_PROPERTIES:
+            return entity_id.replace('P', 'S', 1)
+        return entity_id
+
+    if isinstance(c, urllib.parse.ParseResult):
+        url = c.geturl()
+        return url if prefer_reference else fmt_value(url, prefer_reference)
+
+    # datetime must be tested before date because datetime subclasses date.
+    if isinstance(c, datetime.datetime):
+        return '+{}/11'.format(c.isoformat())
+    if isinstance(c, datetime.date):
+        return '+{}T00:00:00Z/11'.format(c.isoformat())
+
+    return str(c)
+
+def fmt_predicate(pred, object):
+    """Format a predicate as a QuickStatements column string.
+
+    schema.org predicates map to language-tagged label ('L'), alias ('A'),
+    description ('D') or sitelink ('S<lang>wiki') columns, using the
+    '__language' entry of object and falling back to 'en'. Any other
+    schema.org path fails an assertion. All other predicates are formatted
+    by fmt_value with prefer_reference enabled.
+    """
+    is_schema_org = isinstance(pred, urllib.parse.ParseResult) and pred.netloc == 'schema.org'
+    if not is_schema_org:
+        return fmt_value(pred, prefer_reference = True)
+
+    lang = object.get('__language') or 'en'
+    prefix_by_path = {'/name': 'L', '/alternateName': 'A', '/description': 'D'}
+    if pred.path in prefix_by_path:
+        return prefix_by_path[pred.path]+lang
+    if pred.path == '/sameAs':
+        return 'S{}wiki'.format(lang)
+    assert False, pred
+
+def to_quickstatements_v1_item(subject, lines, skip_impossible = True, skip_already_syncronized = True):
+    """Append the QuickStatements v1 rows for one concept to lines.
+
+    subject: concept whose .data maps predicates to lists of object dicts.
+    lines: output list, mutated in place; each row is a list of column strings.
+    skip_impossible: when true, rows whose predicate column starts with '"'
+        or that contain an empty column are logged and dropped.
+    skip_already_syncronized: when true, objects with a truthy
+        '__synchronized_with_wikidata' entry are dropped.
+    """
+    #assert '@id' not in subject, 'TODO: Linked subjects'
+    # A subject without '@id' is addressed as 'LAST', preceded by a CREATE row.
+    subject_id = fmt_value(subject, True) if '@id' in subject else 'LAST'
+
+    if subject_id == 'LAST':
+        lines.append(['CREATE'])
+
+    # Appends the columns for one object (or each element of a list of
+    # objects) to line: the main value first, then a predicate/value column
+    # pair for every non-string key of the object dict.
+    def fmt_key_value_pair(v, line):
+        if isinstance(v, list):
+            for e in v:
+                fmt_key_value_pair(e, line)
+            return
+        elif isinstance(v, dict) and 'value' in v:
+            line.append(fmt_value(v['value']))
+            for sub_k, sub_v in v.items():
+                if sub_k is None or sub_v is None:
+                    continue
+                # String keys ('value', '__...' markers) are not emitted as columns.
+                if not isinstance(sub_k, str):
+                    line.append(fmt_predicate(sub_k, sub_v))
+                    line.append(fmt_value(sub_v))
+        else:
+            line.append(fmt_value(v))
+
+    for predicate, pred_objects in subject.data.items():
+        # '@id' and '__'-prefixed string keys do not produce rows.
+        if isinstance(predicate, str) and (predicate == '@id' or predicate.startswith('__')):
+            continue
+
+        assert isinstance(pred_objects, list)
+        for pred_object in pred_objects:
+            if pred_object.get('__synchronized_with_wikidata', False) and skip_already_syncronized:
+                continue
+            predicate_str = fmt_predicate(predicate, pred_object)
+            line = [subject_id, predicate_str]
+            fmt_key_value_pair(pred_object, line)
+
+            if skip_impossible and predicate_str.startswith('"'):
+                logging.warning('Bad line: %s (Lines must not start with ")', predicate_str)
+                continue
+            if '' in line and skip_impossible:
+                logging.warning('Bad line: %s (Lines must not contain empty names)', line)
+                continue
+            assert 'None' not in line, line
+            lines.append(line)
+
+def to_quickstatements_v1(concepts):
+    """Render one concept or a sized collection of concepts as QuickStatements v1 text.
+
+    Rows are separated by newlines and columns by tabs. Asserts that no
+    column is the literal string 'None'.
+    """
+    concept_list = [concepts] if isinstance(concepts, Concept) else concepts
+
+    rows = []
+    for concept in concept_list:
+        to_quickstatements_v1_item(concept, rows)
+
+    logging.info("Produced %s statements for %s concepts", len(rows), len(concept_list))
+    commands = '\n'.join('\t'.join(row) for row in rows)
+
+    assert '\tNone\t' not in commands, 'TODO'
+    return commands
+
+def commands_to_quickstatements_v1_url(commands):
+    """Build a QuickStatements URL that preloads the given v1 commands.
+
+    Tabs become '|' and newlines '||' before the whole string is
+    percent-encoded with no safe characters.
+    """
+    separators = str.maketrans({'\t': '|', '\n': '||'})
+    encoded = urllib.parse.quote(commands.translate(separators), safe = '')
+    return 'https://quickstatements.toolforge.org/#/v1=' + encoded
+
+def to_quickstatements_v1_url(concepts):
+    """Render concepts as QuickStatements v1 commands and wrap them in a URL."""
+    commands = to_quickstatements_v1(concepts)
+    return commands_to_quickstatements_v1_url(commands)
+
--- a/test.py
+++ b/test.py
@ -0,0 +1,15 @@
+
+import schemeld
+import wikidata
+import wikidata_ext
+
+if __name__ == '__main__':
+    # Manual smoke test: look up the entities whose P1628 value is the
+    # schema.org "image" property.
+
+    # Imported explicitly; `import wikidata` alone is not guaranteed to load
+    # the client submodule.
+    import wikidata.client
+
+    client = wikidata.client.Client()
+
+    EQV_PROPERTY = client.get('P1628')
+    schema_root = "https://schema.org/"
+    schema_prop = "image"
+
+    # `object` has to be passed by keyword: a positional argument after
+    # `predicate = ...` is a SyntaxError. get_triples is a generator, so it
+    # is materialized here to make the query actually run.
+    triples = list(wikidata_ext.get_triples(client, predicate = EQV_PROPERTY, object = "{}{}".format(schema_root, schema_prop)))
+
--- a/wikidata.py
+++ b/wikidata.py
@ -1,26 +0,0 @@
-
-def get_triples(client, subject = None, predicate = None, object = None):
-    time.sleep(1)
-    params = {
-        'subject': fmt_triple_value(subject),
-        'predicate': fmt_triple_value(predicate),
-        'object': fmt_triple_value(object),
-        'page': 1,
-    }
-    headers = {'accept': 'application/ld+json'}
-    result = requests.get('https://query.wikidata.org/bigdata/ldf',
-                          params = params,
-                          headers = headers,
-    )
-
-    triples = []
-    if result.status_code != 200:
-        logging.error('Got %s error message: %s', result.status_code, repr((subject, predicate, object)))
-        return []
-    for item in result.json()['@graph']:
-        if item['@id'].startswith('wd:') and predicate.id in item:
-            s = item['@id'][3:]
-            triples.append((client.get(s, load = False), predicate, object))
-
-    return triples
-
--- a/wikidata_ext.py
+++ b/wikidata_ext.py
@ -0,0 +1,114 @@
+
+import ratelimit
+import urllib.parse
+import wikidata.entity
+import requests
+import json
+import logging
+
+def concept_uri(obj):
+    """Return the parsed wikidata.org URI for a wikidata entity.
+
+    Ids starting with 'P' map to the direct-property namespace and ids
+    starting with 'Q' to the entity namespace.
+
+    Raises AssertionError for non-entity arguments and for ids with any
+    other prefix.
+    """
+    assert isinstance(obj, wikidata.entity.Entity), obj
+    if obj.id.startswith('P'):
+        return urllib.parse.urlparse('http://www.wikidata.org/prop/direct/{}'.format(obj.id))
+    elif obj.id.startswith('Q'):
+        return urllib.parse.urlparse('http://www.wikidata.org/entity/{}'.format(obj.id))
+    # The original message referenced the misspelled name `ojb`, which raised
+    # NameError instead of the intended assertion failure.
+    raise AssertionError("TODO: " + obj.id)
+
+def fmt_triple_value(obj, prefer_obj = False):
+    """Format one term of a triple-pattern query as a request parameter.
+
+    None becomes the empty string and strings are wrapped in double quotes.
+    Parsed URLs are returned bare when prefer_obj is set and quoted
+    otherwise; wikidata entities are formatted through their concept URI.
+    Any other type fails an assertion.
+    """
+    if obj is None:
+        return ''
+    if isinstance(obj, str):
+        return '"{}"'.format(obj)
+    if isinstance(obj, urllib.parse.ParseResult):
+        url = obj.geturl()
+        return url if prefer_obj else fmt_triple_value(url)
+    if isinstance(obj, wikidata.entity.Entity):
+        return fmt_triple_value(concept_uri(obj), prefer_obj)
+    assert False, type(obj)
+
+@ratelimit.sleep_and_retry
+@ratelimit.limits(calls=10, period=60)
+def fetch_by_url(url, headers):
+    """GET url with the given headers and return the response body as text.
+
+    Calls are limited to 10 per 60 seconds by the ratelimit decorators.
+    Returns None, after logging an error, when the status code is not 200.
+    """
+    logging.debug('Fetching: %s', url)
+    result = requests.get(url, headers = headers)
+    if result.status_code != 200:
+        # The original logged undefined subject/predicate/object names here.
+        logging.error('Got %s error message: %s', result.status_code, url)
+        return None
+    # Callers feed the return value to json.loads, so hand back the body text
+    # (the original returned the undefined name `request`).
+    return result.text
+
+# Keys of the response item that carries pagination metadata; read by
+# get_triples_count to compute the number of result pages.
+ITEMS_PER_PAGE = "http://www.w3.org/ns/hydra/core#itemsPerPage"
+TOTAL_ITEMS = "http://www.w3.org/ns/hydra/core#totalItems"
+
+def fmt_params(subject, predicate, object):
+    """Build the query parameters for a triple-pattern request, starting at page 1.
+
+    At least one of subject, predicate and object must be truthy. Subject
+    and predicate are formatted with prefer_obj enabled; object is not.
+    """
+    provided = [term for term in (subject, predicate, object) if term]
+    assert len(provided) >= 1, str(provided)
+    return {
+        'subject': fmt_triple_value(subject, prefer_obj = True),
+        'predicate': fmt_triple_value(predicate, prefer_obj = True),
+        'object': fmt_triple_value(object),
+        'page': 1,
+    }
+
+def get_triples_count(subject = None, predicate = None, object = None):
+    '''
+    Fetches the first result page and returns its pagination metadata as a
+    dict with the keys 'items_per_page', 'items_total' and 'num_pages'.
+    Fails an assertion when no item in the response carries the total count.
+    '''
+    query = fmt_params(subject, predicate, object)
+    request = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = query)
+    first_page = fetch_by_url(request.prepare().url, headers = {'accept': 'application/ld+json'})
+    for entry in json.loads(first_page)['@graph']:
+        if TOTAL_ITEMS not in entry:
+            continue
+        per_page = entry[ITEMS_PER_PAGE]
+        total = entry[TOTAL_ITEMS]
+        return {
+            'items_per_page': per_page,
+            'items_total': total,
+            'num_pages': int((total - 1) / per_page + 1),
+        }
+    assert False
+
+def get_triples_internal(subject, predicate, object):
+    """Yield the '@graph' items of every result page for a triple pattern.
+
+    Items whose '@id' is a blank node ('_:b...'), the query endpoint itself
+    or a wikidata.org '.well-known' URL are skipped.
+    """
+    query = fmt_params(subject, predicate, object)
+    paging = get_triples_count(subject, predicate, object)
+    ignored_prefixes = (
+        '_:b',
+        'https://query.wikidata.org/bigdata/ldf',
+        'http://www.wikidata.org/.well-known/',
+    )
+    for page_number in range(1, paging['num_pages']+1):
+        query['page'] = page_number
+        page_url = requests.Request(url = 'https://query.wikidata.org/bigdata/ldf', params = query).prepare().url
+        body = fetch_by_url(page_url, headers = {'accept': 'application/ld+json'})
+        page = json.loads(body)
+
+        for entry in page['@graph']:
+            if not entry['@id'].startswith(ignored_prefixes):
+                yield entry
+
+        # Bookkeeping
+        del page_url, body, page
+
+# Parsed URI of the schema.org "about" predicate; used as the predicate when
+# looking up the concept for a Wikipedia page.
+SCHEMA_ABOUT = urllib.parse.urlparse('http://schema.org/about')
+
+def get_wikidata_concept_for_wikipedia_page(client, wikipage):
+    """Return the entity that the first (wikipage, schema:about, ?) triple points to.
+
+    The entity is obtained with client.get(..., load = False). Returns None
+    when the query yields no items.
+    """
+    # Iterate lazily: only the first item is needed, so later result pages
+    # are never fetched.
+    for item in get_triples_internal(wikipage, SCHEMA_ABOUT, None):
+        # Drops the first three characters of the 'about' value (presumably
+        # a 'wd:' prefix -- TODO confirm against a real response).
+        return client.get(item['about'][3:], load = False)
+    return None
+
+def get_triples(client, subject = None, predicate = None, object = None):
+    """Yield (subject, predicate, object) tuples matching a triple pattern.
+
+    Only items whose '@id' starts with 'wd:' and that contain predicate.id
+    are reported, so predicate must be given. A missing subject is resolved
+    with client.get(..., load = False) from the item's '@id'; a falsy object
+    is replaced by the item's value for the predicate.
+    """
+    # The unused `triples = []` accumulator of the former list-returning
+    # version is gone; results are yielded directly.
+    for item in get_triples_internal(subject, predicate, object):
+        if not (item['@id'].startswith('wd:') and predicate.id in item):
+            continue
+        s = subject
+        if s is None:
+            # Strip the 'wd:' prefix to get the bare entity id.
+            s = client.get(item['@id'][3:], load = False)
+        yield (s, predicate, object or item[predicate.id])
+