From ccda0ceb4009ee80852e4b793afcb9a9bae3a090 Mon Sep 17 00:00:00 2001 From: Jon Michael Aanes Date: Sat, 9 Jun 2018 15:35:25 +0200 Subject: [PATCH] =?UTF-8?q?Fors=C3=B8ger=20nu=20wikidata,=20f=C3=B8r=20den?= =?UTF-8?q?=20fors=C3=B8ger=20andre=20steder.=20Dette=20g=C3=B8r=20den=20l?= =?UTF-8?q?angsommere,=20da=20den=20skal=20h=C3=A5ndtere=20disambiguation?= =?UTF-8?q?=20sider.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- internet.lua | 211 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 170 insertions(+), 41 deletions(-) diff --git a/internet.lua b/internet.lua index 4b530aa..6c66996 100644 --- a/internet.lua +++ b/internet.lua @@ -1,21 +1,51 @@ local https = require 'ssl.https' +local md5 = require 'md5' local internet = {} -------------------------------------------------------------------------------- -- Util +local function assert_equal (a, b) + if a ~= b then + error(('Assertion failed!\n\tThis : %s\n\tShould be : %s'):format(a, b)) + end + return true +end + local SCANDI_SYMBOLS = { 'æ', 'Æ', 'ø', 'Ø', 'å', 'Å' } local function string_contains_scandi (str) + assert(type(str) == 'string') for _, symbol in ipairs(SCANDI_SYMBOLS) do - if topic:match(symbol) then return true end + if str:match(symbol) then return true end end return false end + +local function escape_url (url, non_standard) + local non_standard = non_standard or {} + return url:gsub(' ', non_standard[' '] or '%%20'):gsub(',', non_standard[','] or '%%2C') +end + +local function escape_pattern (text) + return text:gsub('[+-?*]', '%%%1') +end + +local function safe_access (base, path) + for i = 1, #path do + local item = base[path[i]] + if not item then return nil, path[i] end + base = item + end + return base +end + -------------------------------------------------------------------------------- +-- Searching Clearbit for logoes +-- Contains logoes local function search_clearbit_for_logo (topic) if not (type(topic) == 'string' and topic == topic:lower() and #topic > 0) then @@ -31,7 +61,10 @@ local function search_clearbit_for_logo (topic) end end -local function search_splashbase_for_image_topic (topic) +-------------------------------------------------------------------------------- +-- Searching splashbase for fairly-licensed stockphotoes + +local function search_splashbase_for_stock_photoes (topic) if not (type(topic) == 'string' and topic == topic:lower() and #topic > 0) then return nil, 'Bad topic: '..tostring(topic) elseif string_contains_scandi(topic) then @@ -51,75 +84,171 @@ local function search_splashbase_for_image_topic (topic) return img_url end -local WIKIPEDIA_API_URL = 'https://%s.wikipedia.org/w/api.php?action=query&titles=%s&prop=pageimages&format=json&piprop=original&redirects=1' +-------------------------------------------------------------------------------- +-- Search wikipedia for images on pages -local function search_wikipedia_for_images (topics, language, topic_to_image_url) - if type(topics) == 'string' then topics = { topics } end - local language = language or 'en' - local topic_to_image_url = topic_to_image_url or {} +--[[ +local WIKIPEDIA_API_URL = 'https://%s.wikipedia.org/w/api.php?action=query&titles=%s&prop=pageimages&format=json&piprop=original&redirects=1&prop=categories&prop=links' +local WIKIPEDIA_DISAMBIGUATION_CATEGORIES_FOR_LANG = { + da = 'Kategori:Flertydig', + en = 'Category:All disambiguation pages' +} +--]] + +local WIKIPEDIA_CONTENT_NAMESPACE = 0 + +local function get_disambiguation_links (page) + assert(type(page) == 'table') -- - local titles_field = table.concat(topics, '|'):gsub('%s+', '%%20') + local pagename = escape_pattern(page.title:lower()) + -- + local links = {} + for _, link_info in pairs(page.links or {}) do + if link_info.title + and link_info.title:lower():match(pagename) + and link_info.ns == WIKIPEDIA_CONTENT_NAMESPACE then + links[#links+1] = link_info.title + end + end + -- + return links +end + +local function get_wikipedia_pages (topics, language) + assert(type(topics) == 'table') + assert(type(language) == 'string') + -- + local titles_field = escape_url(table.concat(topics, '|')) local body, code, headers, status = https.request(WIKIPEDIA_API_URL:format(language, titles_field)) if not body then error(code) end local data = json.decode(body) - if not data then return {}, 'JSON could not decode data from wikipedia for '..titles_field end + if not data then return nil, 'JSON could not decode data from wikipedia for '..titles_field end + --if data.success ~= 1 then return nil, 'Query was incorrect in some way' end - -- Determine if some topic was redirected - local redirected_topics = {} + -- Determine if some topic was redirected or normalized + local pages = {} + for _, page in pairs(data.query.pages) do + pages[page.title] = page + end for _, redirect in pairs(data.query.normalized or {}) do - redirected_topics[ redirect.to ] = redirected_topics[ redirect.from ] or redirect.from + pages[ redirect.from ] = pages[ redirect.to ] end for _, redirect in pairs(data.query.redirects or {}) do - redirected_topics[ redirect.to ] = redirected_topics[ redirect.from ] or redirect.from + pages[ redirect.from ] = pages[ redirect.to ] end + -- + return pages +end - -- Determine topic to image - for _, page in pairs(data.query.pages) do - local orig_title = redirected_topics[ page.title ] or page.title - if not topic_to_image_url[orig_title] then - local found_url = false - if page.original then found_url = page.original.source end - topic_to_image_url[orig_title] = found_url + +-------------------------------------------------------------------------------- +-- Search Wikidata for infobox images + +local WIKIMEDIA_IMAGE_PATH = 'https://upload.wikimedia.org/wikipedia/commons/%s/%s/%s' +local WIKIDATA_API_URL = 'https://www.wikidata.org/w/api.php?action=wbgetentities&sites=%s&titles=%s&languages=en&props=claims&format=json&redirects=yes&normalize=yes' + +local function is_disambiguation_entity (entity) + local is_a_list = safe_access(entity, {'claims', 'P31'}) + if not is_a_list then return false end + for _, is_a_attr in ipairs(is_a_list) do + if safe_access(is_a_attr, {'mainsnak', 'datavalue', 'value', 'id'}) == 'Q4167410' then + return true end end - --- - return topic_to_image_url + return false end -local function all_topics_has_image (topic_to_image_url) - for _, url in pairs(topic_to_image_url) do - if not url then return false end +local IMAGE_CLAIM_PRIORITY = { 'P41', 'P154', 'P18' } + +local function select_image_filename_from_entity (entity) + if not entity.claims then return nil end + for _, image_property_name in ipairs(IMAGE_CLAIM_PRIORITY) do + local claim = entity.claims[image_property_name] + for _, subclaim in pairs(claim or {}) do + local filename = safe_access(subclaim, { 'mainsnak', 'datavalue', 'value' }) + if filename then return filename end + end end - return true end +local function search_wikidata_for_image (topic, language) + -- Assert and correction + assert(type(topic) == 'string' or topic == nil) + assert(type(language) == 'string' or language == nil) + local language = language or 'en' + local site = language..'wiki' + local topic_to_image_url = topic_to_image_url or {} + + -- Download and parse + --print("Searching "..site.." wikidata for images") + local body, code, headers, status = https.request(WIKIDATA_API_URL:format(site, escape_url(topic))) + if not body then error(code) end + local data = json.decode(body) + + if not data then return nil, 'JSON could not decode data from wikipedia for '..titles_field end + if data.success ~= 1 then return nil, 'Query was incorrect in some way' end + + -- Find entity + local entity_key = next(data.entities) + if not entity_key then return end + assert(next(data.entities, entity_key) == nil) + local entity = data.entities[entity_key] + + -- Determine if hit disambiguation entity + if is_disambiguation_entity(entity) then + local wikipedia_page = get_wikipedia_pages({topic}, language) + local links = get_disambiguation_links(wikipedia_page[topic]) + assert(#links > 0) + return search_wikidata_for_image(links[math.random(#links)], language) + end + + -- Find image, if any + local filename = select_image_filename_from_entity(entity) + if not filename then return end + filename = filename:gsub(' ', '_') + local hex = md5.sumhexa(filename) + local url = WIKIMEDIA_IMAGE_PATH:format(hex:sub(1,1), hex:sub(1,2), filename) + + return escape_url(url) +end + +assert_equal( search_wikidata_for_image('Java', 'en') + , 'https://upload.wikimedia.org/wikipedia/commons/0/0b/Gunung_Merapi_2006-05-14%2C_MODIS.jpg' ) + +assert_equal( search_wikidata_for_image('poop emoji', 'en') + , 'https://upload.wikimedia.org/wikipedia/commons/6/6a/Emoji_u1f4a9.svg' ) + + +-------------------------------------------------------------------------------- +-- General search for images + function internet.search_images (topics) assert(type(topics) == 'table') if #topics == 0 then return {} end + -- Init local topic_to_image_url = {} - -- Wikipedia - search_wikipedia_for_images(topics, 'da', topic_to_image_url) - if not all_topics_has_image(topic_to_image_url) then return topic_to_image_url end - search_wikipedia_for_images(topics, 'en', topic_to_image_url) - -- Logoes - for topic, val in pairs(topic_to_image_url) do - if not val then - topic_to_image_url[topic] = search_clearbit_for_logo(topic:lower()) - end - end - -- Stock photoes - for topic, val in pairs(topic_to_image_url) do - if not val then - topic_to_image_url[topic] = search_splashbase_for_image_topic(topic:lower()) - end + + for _, topic in ipairs(topics) do + local val + + -- Wikidata + if not val then val = search_wikidata_for_image(topic, 'da') end + if not val then val = search_wikidata_for_image(topic, 'en') end + -- Logoes + if not val then val = search_clearbit_for_logo(topic:lower()) end + -- Stock Photoes + if not val then val = search_splashbase_for_stock_photoes(topic:lower()) end + + topic_to_image_url[topic] = val end -- Ret return topic_to_image_url end -------------------------------------------------------------------------------- +-- Download file function internet.download_file (url, filename) -- retrieve the content of a URL