Now tries Wikidata before trying other sources. This makes it slower, since it has to handle disambiguation pages.
parent 7d5bf90c77
commit ccda0ceb40

internet.lua

@@ -1,21 +1,51 @@
local https = require 'ssl.https'
local md5 = require 'md5'
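-- NOTE: json.decode is used further down; a JSON decoder bound to the name `json`
-- (for example dkjson or lua-cjson) is assumed to be required elsewhere in this file.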
local internet = {}

--------------------------------------------------------------------------------
-- Util

local function assert_equal (a, b)
    if a ~= b then
        error(('Assertion failed!\n\tThis : %s\n\tShould be : %s'):format(a, b))
    end
    return true
end

local SCANDI_SYMBOLS = { 'æ', 'Æ', 'ø', 'Ø', 'å', 'Å' }

local function string_contains_scandi (str)
    assert(type(str) == 'string')
    for _, symbol in ipairs(SCANDI_SYMBOLS) do
        if str:match(symbol) then return true end
    end
    return false
end

local function escape_url (url, non_standard)
    local non_standard = non_standard or {}
    return url:gsub(' ', non_standard[' '] or '%%20'):gsub(',', non_standard[','] or '%%2C')
end

local function escape_pattern (text)
    -- Escape the Lua pattern magic characters +, -, ? and * individually;
    -- an unescaped '+-?' inside the class would be read as a character range.
    return text:gsub('([%+%-%?%*])', '%%%1')
end

local function safe_access (base, path)
    for i = 1, #path do
        local item = base[path[i]]
        if not item then return nil, path[i] end
        base = item
    end
    return base
end
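-- e.g. safe_access(entity, { 'claims', 'P31' }) is entity.claims.P31,
-- or nil plus the first missing key if any step along the path is absent.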
--------------------------------------------------------------------------------
-- Searching Clearbit for logos
-- Contains logos

local function search_clearbit_for_logo (topic)
    if not (type(topic) == 'string' and topic == topic:lower() and #topic > 0) then

@@ -31,7 +61,10 @@ local function search_clearbit_for_logo (topic)
    end
end

--------------------------------------------------------------------------------
-- Searching splashbase for fairly-licensed stock photos

local function search_splashbase_for_stock_photoes (topic)
    if not (type(topic) == 'string' and topic == topic:lower() and #topic > 0) then
        return nil, 'Bad topic: '..tostring(topic)
    elseif string_contains_scandi(topic) then

@@ -51,75 +84,171 @@ local function search_splashbase_for_image_topic (topic)
    return img_url
end

local WIKIPEDIA_API_URL = 'https://%s.wikipedia.org/w/api.php?action=query&titles=%s&prop=pageimages&format=json&piprop=original&redirects=1'

--------------------------------------------------------------------------------
-- Search wikipedia for images on pages

--[[
local WIKIPEDIA_API_URL = 'https://%s.wikipedia.org/w/api.php?action=query&titles=%s&prop=pageimages&format=json&piprop=original&redirects=1&prop=categories&prop=links'
local WIKIPEDIA_DISAMBIGUATION_CATEGORIES_FOR_LANG = {
    da = 'Kategori:Flertydig',
    en = 'Category:All disambiguation pages'
}
--]]

local WIKIPEDIA_CONTENT_NAMESPACE = 0
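
-- Collect the links on a disambiguation page that live in the main content
-- namespace and whose titles contain the page title itself
-- (e.g. 'Java (programming language)' for the page 'Java').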
local function get_disambiguation_links (page)
    assert(type(page) == 'table')
    --
    local pagename = escape_pattern(page.title:lower())
    --
    local links = {}
    for _, link_info in pairs(page.links or {}) do
        if link_info.title
        and link_info.title:lower():match(pagename)
        and link_info.ns == WIKIPEDIA_CONTENT_NAMESPACE then
            links[#links+1] = link_info.title
        end
    end
    --
    return links
end

local function get_wikipedia_pages (topics, language)
    assert(type(topics) == 'table')
    assert(type(language) == 'string')
    --
    local titles_field = escape_url(table.concat(topics, '|'))

    local body, code, headers, status = https.request(WIKIPEDIA_API_URL:format(language, titles_field))
    if not body then error(code) end
    local data = json.decode(body)

    if not data then return nil, 'JSON could not decode data from wikipedia for '..titles_field end
    --if data.success ~= 1 then return nil, 'Query was incorrect in some way' end

    -- Determine if some topic was redirected or normalized
    local pages = {}
    for _, page in pairs(data.query.pages) do
        pages[page.title] = page
    end
    for _, redirect in pairs(data.query.normalized or {}) do
        pages[ redirect.from ] = pages[ redirect.to ]
    end
    for _, redirect in pairs(data.query.redirects or {}) do
        pages[ redirect.from ] = pages[ redirect.to ]
    end
    --
    return pages
end

--------------------------------------------------------------------------------
-- Search Wikidata for infobox images

local WIKIMEDIA_IMAGE_PATH = 'https://upload.wikimedia.org/wikipedia/commons/%s/%s/%s'
local WIKIDATA_API_URL = 'https://www.wikidata.org/w/api.php?action=wbgetentities&sites=%s&titles=%s&languages=en&props=claims&format=json&redirects=yes&normalize=yes'
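
-- Wikimedia Commons serves original files from
-- /<first md5 hex digit>/<first two md5 hex digits>/<filename>, where the hash is
-- the MD5 of the filename with spaces replaced by underscores
-- (see the URL construction in search_wikidata_for_image below).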

local function is_disambiguation_entity (entity)
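    -- An entity is treated as a disambiguation page if one of its P31
    -- ('instance of') claims points to Q4167410 ('Wikimedia disambiguation page').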
    local is_a_list = safe_access(entity, {'claims', 'P31'})
    if not is_a_list then return false end
    for _, is_a_attr in ipairs(is_a_list) do
        if safe_access(is_a_attr, {'mainsnak', 'datavalue', 'value', 'id'}) == 'Q4167410' then
            return true
        end
    end
    return false
end
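
-- Claim properties to look for on an entity, in order of preference:
-- P41 (flag image), P154 (logo image), P18 (image).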
local IMAGE_CLAIM_PRIORITY = { 'P41', 'P154', 'P18' }

local function select_image_filename_from_entity (entity)
    if not entity.claims then return nil end
    for _, image_property_name in ipairs(IMAGE_CLAIM_PRIORITY) do
        local claim = entity.claims[image_property_name]
        for _, subclaim in pairs(claim or {}) do
            local filename = safe_access(subclaim, { 'mainsnak', 'datavalue', 'value' })
            if filename then return filename end
        end
    end
end

local function search_wikidata_for_image (topic, language)
    -- Assert and correction
    assert(type(topic) == 'string' or topic == nil)
    assert(type(language) == 'string' or language == nil)
    local language = language or 'en'
    local site = language..'wiki'

    -- Download and parse
    --print("Searching "..site.." wikidata for images")
    local body, code, headers, status = https.request(WIKIDATA_API_URL:format(site, escape_url(topic)))
    if not body then error(code) end
    local data = json.decode(body)

    if not data then return nil, 'JSON could not decode data from wikidata for '..topic end
    if data.success ~= 1 then return nil, 'Query was incorrect in some way' end

    -- Find entity
    local entity_key = next(data.entities)
    if not entity_key then return end
    assert(next(data.entities, entity_key) == nil)
    local entity = data.entities[entity_key]

    -- Determine if we hit a disambiguation entity
    if is_disambiguation_entity(entity) then
        local wikipedia_page = get_wikipedia_pages({topic}, language)
        local links = get_disambiguation_links(wikipedia_page[topic])
        assert(#links > 0)
        return search_wikidata_for_image(links[math.random(#links)], language)
    end

    -- Find image, if any
    local filename = select_image_filename_from_entity(entity)
    if not filename then return end
    filename = filename:gsub(' ', '_')
    local hex = md5.sumhexa(filename)
    local url = WIKIMEDIA_IMAGE_PATH:format(hex:sub(1,1), hex:sub(1,2), filename)

    return escape_url(url)
end
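
-- Smoke tests: these run at load time and make live requests against the APIs above.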
assert_equal( search_wikidata_for_image('Java', 'en')
            , 'https://upload.wikimedia.org/wikipedia/commons/0/0b/Gunung_Merapi_2006-05-14%2C_MODIS.jpg' )

assert_equal( search_wikidata_for_image('poop emoji', 'en')
            , 'https://upload.wikimedia.org/wikipedia/commons/6/6a/Emoji_u1f4a9.svg' )

--------------------------------------------------------------------------------
-- General search for images

function internet.search_images (topics)
    assert(type(topics) == 'table')
    if #topics == 0 then return {} end
    -- Init
    local topic_to_image_url = {}

    for _, topic in ipairs(topics) do
        local val

        -- Wikidata
        if not val then val = search_wikidata_for_image(topic, 'da') end
        if not val then val = search_wikidata_for_image(topic, 'en') end
        -- Logos
        if not val then val = search_clearbit_for_logo(topic:lower()) end
        -- Stock photos
        if not val then val = search_splashbase_for_stock_photoes(topic:lower()) end

        topic_to_image_url[topic] = val
    end
    -- Return
    return topic_to_image_url
end
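
-- Example usage, assuming the module is loaded as 'internet' via require:
--
--   local internet = require 'internet'
--   local images = internet.search_images{ 'Java', 'poop emoji' }
--   for topic, url in pairs(images) do
--       print(topic, url)
--   end
--
-- Each topic gets the first hit in the order Wikidata (da, then en), Clearbit
-- logos, then splashbase stock photos; topics with no hit are left out of the
-- returned table.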

--------------------------------------------------------------------------------
-- Download file

function internet.download_file (url, filename)
    -- retrieve the content of a URL