Forsøger nu wikidata, før den forsøger andre steder. Dette gør den langsommere, da den skal håndtere disambiguation sider.

This commit is contained in:
Jon Michael Aanes 2018-06-09 15:35:25 +02:00
parent 7d5bf90c77
commit ccda0ceb40

View File

@ -1,21 +1,51 @@
local https = require 'ssl.https' local https = require 'ssl.https'
local md5 = require 'md5'
local internet = {} local internet = {}
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
-- Util -- Util
--- Fail loudly when two values differ.
-- Values are compared with `~=`, so tables compare by identity.
-- @param a  the value that was produced
-- @param b  the value that was expected
-- @return true when `a == b`; otherwise raises an error naming both values
local function assert_equal (a, b)
    if a ~= b then
        -- tostring() keeps the report usable for nil, boolean and table
        -- values: on Lua 5.1 '%s' rejects non-string arguments, which would
        -- replace the assertion message with an unrelated format error.
        -- Level 2 attributes the failure to the caller, not to this helper.
        error(('Assertion failed!\n\tThis : %s\n\tShould be : %s'):format(tostring(a), tostring(b)), 2)
    end
    return true
end
-- Letters that string_contains_scandi looks for.
local SCANDI_SYMBOLS = { 'æ', 'Æ', 'ø', 'Ø', 'å', 'Å' }

--- Report whether a string contains any of the SCANDI_SYMBOLS letters.
-- @param str  string to inspect (asserted to be a string)
-- @return true if any symbol occurs in `str`, false otherwise
local function string_contains_scandi (str)
    assert(type(str) == 'string')
    for _, symbol in ipairs(SCANDI_SYMBOLS) do
        -- Plain find: the symbols are literal text, not patterns.
        if str:find(symbol, 1, true) then return true end
    end
    return false
end
--- Replace spaces and commas in a URL with escape sequences.
-- @param url           string to escape
-- @param non_standard  optional table whose ' ' and ',' entries override the
--                      default replacements; entries are used as gsub
--                      replacement strings
-- @return the escaped string, followed by the substitution count of the
--         comma pass (gsub's second result)
local function escape_url (url, non_standard)
    local overrides = non_standard or {}
    -- Spaces first: "%20" unless the caller supplied an alternative.
    local without_spaces = url:gsub(' ', overrides[' '] or '%%20')
    -- Then commas: "%2C" unless overridden.
    return without_spaces:gsub(',', overrides[','] or '%%2C')
end
--- Escape Lua pattern magic characters so that `text` matches itself literally.
-- Every magic character is listed individually. A bare `+-?` inside a class
-- is a range that also covers digits (turning "2" into the back-reference
-- "%2"), and it leaves '(', ')', '[', ']', '^', '$' and '%' unescaped.
-- @param text  string to escape
-- @return the escaped string, followed by gsub's substitution count; wrap the
--         call in parentheses when passing it straight into another call
local function escape_pattern (text)
    return text:gsub('[%^%$%(%)%%%.%[%]%*%+%-%?]', '%%%0')
end
--- Follow a chain of keys through nested tables without raising.
-- @param base  root value to start from
-- @param path  sequence of keys to follow, in order
-- @return the value at the end of `path` (`base` itself for an empty path);
--         or nil plus the key at which the walk stopped
local function safe_access (base, path)
    for i = 1, #path do
        local key = path[i]
        -- Indexing a number or boolean raises, and indexing a string consults
        -- the string library, so stop once the current node is not a table.
        if type(base) ~= 'table' then return nil, key end
        local item = base[key]
        if not item then return nil, key end
        base = item
    end
    return base
end
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
-- Searching Clearbit for logos
-- Contains logos
local function search_clearbit_for_logo (topic) local function search_clearbit_for_logo (topic)
if not (type(topic) == 'string' and topic == topic:lower() and #topic > 0) then if not (type(topic) == 'string' and topic == topic:lower() and #topic > 0) then
@ -31,7 +61,10 @@ local function search_clearbit_for_logo (topic)
end end
end end
local function search_splashbase_for_image_topic (topic) --------------------------------------------------------------------------------
-- Searching splashbase for fairly-licensed stock photos
local function search_splashbase_for_stock_photoes (topic)
if not (type(topic) == 'string' and topic == topic:lower() and #topic > 0) then if not (type(topic) == 'string' and topic == topic:lower() and #topic > 0) then
return nil, 'Bad topic: '..tostring(topic) return nil, 'Bad topic: '..tostring(topic)
elseif string_contains_scandi(topic) then elseif string_contains_scandi(topic) then
@ -51,75 +84,171 @@ local function search_splashbase_for_image_topic (topic)
return img_url return img_url
end end
local WIKIPEDIA_API_URL = 'https://%s.wikipedia.org/w/api.php?action=query&titles=%s&prop=pageimages&format=json&piprop=original&redirects=1' --------------------------------------------------------------------------------
-- Search wikipedia for images on pages
local function search_wikipedia_for_images (topics, language, topic_to_image_url) --[[
if type(topics) == 'string' then topics = { topics } end local WIKIPEDIA_API_URL = 'https://%s.wikipedia.org/w/api.php?action=query&titles=%s&prop=pageimages&format=json&piprop=original&redirects=1&prop=categories&prop=links'
local language = language or 'en' local WIKIPEDIA_DISAMBIGUATION_CATEGORIES_FOR_LANG = {
local topic_to_image_url = topic_to_image_url or {} da = 'Kategori:Flertydig',
en = 'Category:All disambiguation pages'
}
--]]
-- Namespace id compared against each link's `ns` field.
-- NOTE(review): presumably MediaWiki's main/article namespace; confirm.
local WIKIPEDIA_CONTENT_NAMESPACE = 0

--- Collect the links of a page whose titles contain the page's own title.
-- The comparison is case-insensitive and only keeps links whose `ns` equals
-- WIKIPEDIA_CONTENT_NAMESPACE.
-- @param page  table with a `title` string and an optional `links` list of
--              `{ title = ..., ns = ... }` entries
-- @return list of matching link titles (possibly empty)
local function get_disambiguation_links (page)
    assert(type(page) == 'table')
    local needle = page.title:lower()
    local links = {}
    for _, link_info in pairs(page.links or {}) do
        -- Plain-text find: titles can contain pattern magic such as '(',
        -- '.', '+' or digits, so they must not be interpreted as patterns.
        if link_info.title
        and link_info.ns == WIKIPEDIA_CONTENT_NAMESPACE
        and link_info.title:lower():find(needle, 1, true) then
            links[#links+1] = link_info.title
        end
    end
    return links
end
-- Query URL used by get_wikipedia_pages; placeholders are the wiki language
-- code and the '|'-joined titles. It is defined here because the only other
-- definition of a Wikipedia query URL in this file sits inside a block comment.
-- It requests `prop=links`, the field get_disambiguation_links reads.
-- NOTE(review): `pllimit=max` raises the per-request link limit, but result
-- continuation is not followed; confirm that is acceptable.
local WIKIPEDIA_PAGES_API_URL = 'https://%s.wikipedia.org/w/api.php?action=query&titles=%s&prop=links&pllimit=max&format=json&redirects=1'

--- Fetch page records for a set of titles from one Wikipedia language edition.
-- @param topics    list of page titles
-- @param language  wiki language code, e.g. 'en'
-- @return table mapping title -> page record; requested spellings that were
--         normalized or redirected map to the same record as their target.
--         Returns nil plus a message when the response cannot be decoded or
--         carries no pages.
-- Raises (via error) the failure value of https.request when no body arrives.
local function get_wikipedia_pages (topics, language)
    assert(type(topics) == 'table')
    assert(type(language) == 'string')
    --
    local titles_field = escape_url(table.concat(topics, '|'))
    local body, code = https.request(WIKIPEDIA_PAGES_API_URL:format(language, titles_field))
    if not body then error(code) end
    local data = json.decode(body)
    if not data then return nil, 'JSON could not decode data from wikipedia for '..titles_field end
    local query = data.query
    if type(query) ~= 'table' or type(query.pages) ~= 'table' then
        return nil, 'Wikipedia returned no pages for '..titles_field
    end
    -- Index the returned pages by their final title
    local pages = {}
    for _, page in pairs(query.pages) do
        pages[page.title] = page
    end
    -- Resolve redirects before normalizations: a requested spelling can be
    -- normalized to a title that is itself a redirect, and that alias only
    -- resolves once the redirect's own entry exists.
    for _, redirect in pairs(query.redirects or {}) do
        pages[ redirect.from ] = pages[ redirect.to ]
    end
    for _, redirect in pairs(query.normalized or {}) do
        pages[ redirect.from ] = pages[ redirect.to ]
    end
    --
    return pages
end
-- Determine topic to image
for _, page in pairs(data.query.pages) do
local orig_title = redirected_topics[ page.title ] or page.title
if not topic_to_image_url[orig_title] then
local found_url = false
if page.original then found_url = page.original.source end
topic_to_image_url[orig_title] = found_url
end
end
---
return topic_to_image_url
end
local function all_topics_has_image (topic_to_image_url) --------------------------------------------------------------------------------
for _, url in pairs(topic_to_image_url) do -- Search Wikidata for infobox images
if not url then return false end
end local WIKIMEDIA_IMAGE_PATH = 'https://upload.wikimedia.org/wikipedia/commons/%s/%s/%s'
local WIKIDATA_API_URL = 'https://www.wikidata.org/w/api.php?action=wbgetentities&sites=%s&titles=%s&languages=en&props=claims&format=json&redirects=yes&normalize=yes'
--- Tell whether an entity carries a P31 claim whose value id is 'Q4167410'.
-- NOTE(review): P31 / Q4167410 are presumably Wikidata's "instance of" /
-- "Wikimedia disambiguation page"; confirm.
-- @param entity  decoded entity table
-- @return true if such a claim exists, false otherwise (including when the
--         entity has no P31 claims at all)
local function is_disambiguation_entity (entity)
    local is_a_list = safe_access(entity, {'claims', 'P31'})
    if not is_a_list then return false end
    for _, is_a_attr in ipairs(is_a_list) do
        if safe_access(is_a_attr, {'mainsnak', 'datavalue', 'value', 'id'}) == 'Q4167410' then
            return true
        end
    end
    return false
end
-- Claim properties searched for an image, most preferred first.
local IMAGE_CLAIM_PRIORITY = { 'P41', 'P154', 'P18' }

--- Pick the first image value found on an entity, following
-- IMAGE_CLAIM_PRIORITY.
-- @param entity  table, optionally holding a `claims` table keyed by property
-- @return the first truthy `mainsnak.datavalue.value` among the prioritised
--         claims; nil when the entity has no claims; nothing when no claim
--         yields a value
local function select_image_filename_from_entity (entity)
    local claims = entity.claims
    if not claims then return nil end
    for i = 1, #IMAGE_CLAIM_PRIORITY do
        local subclaims = claims[IMAGE_CLAIM_PRIORITY[i]]
        if subclaims then
            for _, subclaim in pairs(subclaims) do
                local value = safe_access(subclaim, { 'mainsnak', 'datavalue', 'value' })
                if value then return value end
            end
        end
    end
end
--- Look up an image URL for a topic through Wikidata.
-- Requests the entity linked to `topic` on the `<language>wiki` site. If the
-- entity is a disambiguation entity, one of the matching links from the
-- corresponding Wikipedia page is picked at random and searched instead.
-- @param topic     page title to look up (string, required)
-- @param language  wiki language code; defaults to 'en'
-- @return the image URL as a single value; nothing when there is no entity or
--         no image claim; nil plus a message when the response cannot be
--         decoded, the query is rejected, or a disambiguation page offers
--         nothing to follow
-- Raises (via error) the failure value of https.request when no body arrives.
local function search_wikidata_for_image (topic, language)
    -- Assert and correction
    -- A nil topic could never be escaped below, so reject it up front.
    assert(type(topic) == 'string', 'topic must be a string')
    assert(type(language) == 'string' or language == nil)
    language = language or 'en'
    local site = language..'wiki'
    -- Download and parse
    local body, code = https.request(WIKIDATA_API_URL:format(site, (escape_url(topic))))
    if not body then error(code) end
    local data = json.decode(body)
    if not data then return nil, 'JSON could not decode data from wikidata for '..topic end
    if data.success ~= 1 then return nil, 'Query was incorrect in some way' end
    -- Find entity
    local entities = data.entities
    if type(entities) ~= 'table' then return end
    local entity_key = next(entities)
    if not entity_key then return end
    assert(next(entities, entity_key) == nil)
    local entity = entities[entity_key]
    -- Determine if hit disambiguation entity
    if is_disambiguation_entity(entity) then
        -- Report dead ends as nil + message so callers can fall back to
        -- other image sources instead of aborting.
        local wikipedia_pages, err = get_wikipedia_pages({topic}, language)
        if not wikipedia_pages then return nil, err end
        local page = wikipedia_pages[topic]
        if not page then return nil, 'No wikipedia page found for '..topic end
        local links = get_disambiguation_links(page)
        if #links == 0 then return nil, 'No usable links on disambiguation page for '..topic end
        return search_wikidata_for_image(links[math.random(#links)], language)
    end
    -- Find image, if any
    local filename = select_image_filename_from_entity(entity)
    if not filename then return end
    filename = filename:gsub(' ', '_')
    -- The path is built from the first one and two hex digits of the
    -- filename's MD5 sum, followed by the filename itself.
    local hex = md5.sumhexa(filename)
    local url = WIKIMEDIA_IMAGE_PATH:format(hex:sub(1,1), hex:sub(1,2), filename)
    -- Parentheses drop gsub's substitution count so only the URL is returned.
    return (escape_url(url))
end
-- Load-time checks: these statements run whenever this file is executed.
-- Each one calls search_wikidata_for_image (which performs an HTTPS request)
-- and assert_equal raises if the result differs from the literal URL.
-- NOTE(review): the expected URLs presumably depend on live Wikidata content;
-- confirm these are meant to run on every load rather than in a test suite.
assert_equal( search_wikidata_for_image('Java', 'en')
, 'https://upload.wikimedia.org/wikipedia/commons/0/0b/Gunung_Merapi_2006-05-14%2C_MODIS.jpg' )
assert_equal( search_wikidata_for_image('poop emoji', 'en')
, 'https://upload.wikimedia.org/wikipedia/commons/6/6a/Emoji_u1f4a9.svg' )
--------------------------------------------------------------------------------
-- General search for images
--- Find one image URL per topic, trying the sources in a fixed order.
-- Order: Wikidata with language 'da', Wikidata with 'en', Clearbit logos,
-- then splashbase stock photos. The last two receive the lowercased topic.
-- The first source that yields a value wins.
-- @param topics  list of topic strings (asserted to be a table)
-- @return table mapping each topic to the value found; topics for which
--         every source came up empty are absent from the table
function internet.search_images (topics)
    assert(type(topics) == 'table')
    if #topics == 0 then return {} end
    -- Init
    local topic_to_image_url = {}
    for _, topic in ipairs(topics) do
        -- `or` short-circuits, so later sources are only queried when the
        -- earlier ones returned nothing.
        local val = search_wikidata_for_image(topic, 'da')     -- Wikidata
            or search_wikidata_for_image(topic, 'en')
            or search_clearbit_for_logo(topic:lower())          -- Logos
            or search_splashbase_for_stock_photoes(topic:lower()) -- Stock photos
        topic_to_image_url[topic] = val
    end
    -- Ret
    return topic_to_image_url
end
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
-- Download file
function internet.download_file (url, filename) function internet.download_file (url, filename)
-- retrieve the content of a URL -- retrieve the content of a URL