-- internet.lua — helpers for searching/downloading images and files from
-- the web (wikidata, wikipedia, clearbit, shutterstock, splashbase, reddit).
local http = require 'socket.http'
|
|
local https = require 'ssl.https'
|
|
local md5 = require 'md5'
|
|
local json = require 'json'
|
|
|
|
-- Module table: public functions are attached here and returned at file end.
local internet = {}
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Util
|
|
|
|
--- Raise a descriptive error unless `a` equals `b`; returns true on success.
local function assert_equal (a, b)
  if a == b then return true end
  error(('Assertion failed!\n\tThis : %s\n\tShould be : %s'):format(a, b))
end
|
|
|
|
-- Danish/Norwegian letters that several third-party services reject.
local SCANDI_SYMBOLS = { 'æ', 'Æ', 'ø', 'Ø', 'å', 'Å' }

--- True when `str` contains any letter listed in SCANDI_SYMBOLS.
local function string_contains_scandi (str)
  assert(type(str) == 'string')
  for i = 1, #SCANDI_SYMBOLS do
    -- Plain find: the symbols contain no pattern magic characters.
    if str:find(SCANDI_SYMBOLS[i], 1, true) then return true end
  end
  return false
end
|
|
|
|
|
|
--- Percent-encode the characters that commonly break these request URLs.
-- @param url string to escape
-- @param non_standard optional map from character (' ' or ',') to a
--        replacement string overriding the defaults %20 / %2C
-- @return the escaped string (exactly one value)
local function escape_url (url, non_standard)
  non_standard = non_standard or {}
  -- Assigning to a local truncates gsub's (string, count) pair; the old
  -- `return url:gsub(...)` leaked the substitution count to callers that
  -- used this in a multi-value context (e.g. as a last function argument).
  local escaped = url
    :gsub(' ', non_standard[' '] or '%%20')
    :gsub(',', non_standard[','] or '%%2C')
  return escaped
end
|
|
|
|
--- Escape Lua pattern magic characters so `text` can be matched literally.
-- Fixed: the old class '[+-?*]' was a character *range* (+ .. ?) that
-- wrongly escaped digits and ordinary punctuation (producing invalid
-- capture references like %5 in the result) and missed ( ) [ ] % . ^ $.
-- @param text plain string
-- @return string safe to embed in string.match/find patterns (one value)
local function escape_pattern (text)
  return (text:gsub('[%^%$%(%)%%%.%[%]%*%+%-%?]', '%%%0'))
end
|
|
|
|
--- Walk nested tables along the keys in `path`.
-- Returns the final value, or nil plus the first key that was absent.
local function safe_access (base, path)
  local node = base
  for _, key in ipairs(path) do
    local child = node[key]
    if not child then return nil, key end
    node = child
  end
  return node
end
|
|
|
|
--- Perform a request over HTTPS, falling back to plain HTTP when the
-- HTTPS attempt yields no usable result (nil code or refused connection).
-- Returns the LuaSocket quadruple: body, code, headers, status.
local function generic_request (...)
  local body, code, headers, status = https.request(...)
  local https_ok = code ~= nil and status ~= 'connection refused'
  if https_ok then
    return body, code, headers, status
  end
  -- Retry unencrypted with exactly the same arguments.
  return http.request(...)
end
|
|
|
|
--- Print a short diagnostic for a failed HTTP(S) request to stdout.
local function report_https_request_error (status, code)
  local out = io.stdout
  out:write 'Error when attempting request:\n'
  out:write(' Status: ' .. tostring(status) .. '\n')
  out:write(' Code: ' .. tostring(code) .. '\n')
end
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Searching Clearbit for logoes
|
|
-- Contains logoes
|
|
|
|
--- Probe Clearbit's logo endpoint for `topic` across common TLDs.
-- `topic` must be a non-empty, all-lowercase string without æøå.
-- Returns the first URL answering HTTP 200, or nil plus a reason.
local function search_clearbit_for_logo (topic)
  local valid_topic = type(topic) == 'string' and topic == topic:lower() and #topic > 0
  if not valid_topic then
    return nil, 'Bad topic: '..tostring(topic)
  end
  if string_contains_scandi(topic) then
    return nil, 'Clearbit does not like æøå: '..tostring(topic)
  end
  -- Cheap HEAD request per candidate domain; first 200 wins.
  local domains = { 'org', 'com', 'net', 'dk' }
  for i = 1, #domains do
    local candidate = ('https://logo-core.clearbit.com/%s.%s'):format(topic, domains[i])
    local _, code = https.request { url = candidate, method = 'HEAD' }
    if code == 200 then return candidate end
  end
end
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Searching Shutterstock for stockphotoes
|
|
|
|
local htmlparser = require 'htmlparser'
|
|
|
|
--- Scrape Shutterstock's search results page for a random stock photo URL.
-- @param topic non-empty, all-lowercase search string without æøå
-- @return image URL on success; nil plus an error message otherwise
-- @raise on transport failure (error(code))
local function search_shutterstock_for_stock_photoes (topic)
  if not (type(topic) == 'string' and topic == topic:lower() and #topic > 0) then
    return nil, 'Bad topic: '..tostring(topic)
  elseif string_contains_scandi(topic) then
    -- Fixed: this message previously blamed Splashbase (copy-paste slip).
    return nil, 'Shutterstock does not like æøå: '..tostring(topic)
  end

  local search_url = 'https://www.shutterstock.com/search/'..escape_url(topic)
  local body, code, headers, status = https.request(search_url)
  if not body then error(code) end

  local html = htmlparser.parse(body, 10000)
  if not html then return nil, 'HTML could not decode data for '..topic end

  -- 'img.z_g_i' is the thumbnail class on the results page.
  local img_elems = html:select 'img.z_g_i'
  -- Fixed: math.random(0) raises "interval is empty" when nothing matched.
  if #img_elems == 0 then return nil, 'Query returned no images for '..topic end

  local img_url = img_elems[math.random(#img_elems)].attributes.src
  assert(type(img_url) == 'string')
  return img_url
end
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Searching splashbase for fairly-licensed stockphotoes
|
|
|
|
--- Query the Splashbase JSON API for a random fairly-licensed stock photo.
-- @param topic non-empty, all-lowercase search string without æøå
-- @return image URL on success; nil plus an error message otherwise
-- @raise on transport failure (error(code))
local function search_splashbase_for_stock_photoes (topic)
  if not (type(topic) == 'string' and topic == topic:lower() and #topic > 0) then
    return nil, 'Bad topic: '..tostring(topic)
  elseif string_contains_scandi(topic) then
    return nil, 'Splashbase does not like æøå: '..tostring(topic)
  end

  local search_url = escape_url('http://www.splashbase.co/api/v1/images/search?query='..topic)
  -- Fixed: the endpoint is plain http://, so use socket.http; ssl.https
  -- cannot complete a TLS handshake against an http URL.
  local body, code, headers, status = http.request(search_url)
  if not body then error(code) end

  local data = json.decode(body)
  if not data then return nil, 'JSON could not decode data for '..topic end
  if #data.images <= 0 then return nil, 'Query returned no data for '..topic end

  local img_url = data.images[math.random(#data.images)].url
  assert(type(img_url) == 'string')
  return img_url
end
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Search wikipedia for images on pages
|
|
|
|
-- MediaWiki query-API URL (format args: language code, titles field).
-- Fixed: the query string previously repeated the `prop` key three times
-- (pageimages / categories / links); servers keep only the last duplicate,
-- silently dropping pageimages and categories. Multiple props belong in a
-- single pipe-separated value (%7C is an URL-encoded '|'; doubled %% for
-- string.format).
local WIKIPEDIA_API_URL = 'https://%s.wikipedia.org/w/api.php?action=query&titles=%s&prop=pageimages%%7Ccategories%%7Clinks&format=json&piprop=original&redirects=1'
|
|
|
|
--[[
|
|
local WIKIPEDIA_DISAMBIGUATION_CATEGORIES_FOR_LANG = {
|
|
da = 'Kategori:Flertydig',
|
|
en = 'Category:All disambiguation pages'
|
|
}
|
|
--]]
|
|
|
|
-- MediaWiki namespace 0 = main/article ("content") namespace.
local WIKIPEDIA_CONTENT_NAMESPACE = 0
|
|
|
|
--- Collect titles of content-namespace links whose lowercase title
-- contains the page's own (pattern-escaped) lowercase title.
-- @param page decoded wikipedia page table with `title` and (optionally) `links`
-- @return array of matching link titles (possibly empty)
local function get_disambiguation_links (page)
  assert(type(page) == 'table')
  -- Escape the title so punctuation in it is matched literally.
  local needle = escape_pattern(page.title:lower())
  local result = {}
  for _, link in pairs(page.links or {}) do
    local title = link.title
    if title
    and link.ns == WIKIPEDIA_CONTENT_NAMESPACE
    and title:lower():match(needle) then
      result[#result+1] = title
    end
  end
  return result
end
|
|
|
|
--- Fetch wikipedia pages for `topics` and index them by requested title.
-- Normalised and redirected titles are aliased back to the title the
-- caller asked for, so lookup by the original spelling still succeeds.
-- Returns {} when the request itself fails, or nil plus a message when
-- the response body cannot be decoded.
local function get_wikipedia_pages (topics, language)
  assert(type(topics) == 'table')
  assert(type(language) == 'string')

  local titles_field = escape_url(table.concat(topics, '|'))

  local body, code, headers, status = https.request(WIKIPEDIA_API_URL:format(language, titles_field))
  if not body then
    report_https_request_error(status, code)
    return {}
  end

  local data = json.decode(body)
  if not data then return nil, 'JSON could not decode data from wikipedia for '..titles_field end

  -- Index by canonical title first, then alias normalised and redirected
  -- names (in that order) onto the same page tables.
  local pages = {}
  for _, page in pairs(data.query.pages) do
    pages[page.title] = page
  end
  local alias_lists = { data.query.normalized or {}, data.query.redirects or {} }
  for i = 1, #alias_lists do
    for _, mapping in pairs(alias_lists[i]) do
      pages[mapping.from] = pages[mapping.to]
    end
  end

  return pages
end
|
|
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Search Wikidata for infobox images
|
|
|
|
-- Commons file URL; the first two %s are md5(filename) hex-prefix shards
-- (1 and 2 chars), the third is the underscored filename.
local WIKIMEDIA_IMAGE_PATH = 'https://upload.wikimedia.org/wikipedia/commons/%s/%s/%s'
-- wbgetentities endpoint resolving a sitelink title (%s site, %s title)
-- to the entity's claims, following redirects and normalisation.
local WIKIDATA_API_URL = 'https://www.wikidata.org/w/api.php?action=wbgetentities&sites=%s&titles=%s&languages=en&props=claims&format=json&redirects=yes&normalize=yes'
|
|
|
|
--- True when the wikidata entity is an instance-of (P31) Q4167410,
-- the "Wikimedia disambiguation page" item.
local function is_disambiguation_entity (entity)
  local instance_claims = safe_access(entity, {'claims', 'P31'})
  for _, claim in ipairs(instance_claims or {}) do
    local target = safe_access(claim, {'mainsnak', 'datavalue', 'value', 'id'})
    if target == 'Q4167410' then return true end
  end
  return false
end
|
|
|
|
-- Image-bearing wikidata properties, most specific first:
-- P41 flag image, P154 logo image, P18 plain image.
local IMAGE_CLAIM_PRIORITY = { 'P41', 'P154', 'P18' }
|
|
|
|
--- Pick the best image filename from an entity's claims, honouring the
-- preference order in IMAGE_CLAIM_PRIORITY. Returns nil when no claim
-- carries a usable value.
local function select_image_filename_from_entity (entity)
  local claims = entity.claims
  if not claims then return nil end
  for _, property in ipairs(IMAGE_CLAIM_PRIORITY) do
    for _, subclaim in pairs(claims[property] or {}) do
      local filename = safe_access(subclaim, { 'mainsnak', 'datavalue', 'value' })
      if filename then return filename end
    end
  end
end
|
|
|
|
--- Resolve `topic` to an image URL via the Wikidata API, following one
-- level of disambiguation by recursing into a random candidate link.
-- @param topic page title (string)
-- @param language wikipedia language code; defaults to 'en'
-- @return image URL, or nil (optionally followed by an error message)
local function search_wikidata_for_image (topic, language)
  -- Assert and correction
  assert(type(topic) == 'string' or topic == nil)
  assert(type(language) == 'string' or language == nil)
  local language = language or 'en'
  local site = language..'wiki'
  -- (Removed a dead `local topic_to_image_url = topic_to_image_url or {}`
  -- line that read an undefined global and was never used.)

  -- Download and parse
  local body, code, headers, status = https.request(WIKIDATA_API_URL:format(site, escape_url(topic)))
  if not body then
    report_https_request_error(status, code)
    return nil
  end
  local data = json.decode(body)

  -- Fixed: the old message concatenated the undefined global
  -- `titles_field`, which itself crashed with "attempt to concatenate nil".
  if not data then return nil, 'JSON could not decode data from wikidata for '..tostring(topic) end
  if data.success ~= 1 then return nil, 'Query was incorrect in some way' end

  -- Find entity: exactly one is expected, since one title was requested.
  local entity_key = next(data.entities)
  if not entity_key then return end
  assert(next(data.entities, entity_key) == nil)
  local entity = data.entities[entity_key]

  -- Determine if hit disambiguation entity; if so, pick a random
  -- content-namespace link from the wikipedia page and retry with it.
  if is_disambiguation_entity (entity) then
    local wikipedia_pages = get_wikipedia_pages({topic}, language)
    local page = wikipedia_pages and wikipedia_pages[topic]
    -- Fixed: the old code indexed the page unconditionally and crashed
    -- inside get_disambiguation_links when the lookup failed.
    if not page then return nil, 'Kunne ikke hente '..language..' wikipedia siden for "'..topic..'"!' end
    local links = get_disambiguation_links(page)
    if #links <= 0 then return nil, 'Ramte flertydig '..language..' wikipedia side for "'..topic..'", men kunne ikke finde nogle links!' end
    return search_wikidata_for_image(links[math.random(#links)], language)
  end

  -- Find image, if any: map the claimed filename to its commons URL
  -- (the /x/xy/ shards come from the md5 of the underscored filename).
  local filename = select_image_filename_from_entity(entity)
  if not filename then return end
  filename = filename:gsub(' ', '_')
  local hex = md5.sumhexa(filename)
  local url = WIKIMEDIA_IMAGE_PATH:format(hex:sub(1,1), hex:sub(1,2), filename)

  return escape_url(url)
end
|
|
|
|
-- Load-time smoke tests: these hit the live Wikidata/Wikipedia APIs every
-- time the module is required, so requiring this module needs network
-- access and fails whenever either article's image changes upstream.
-- NOTE(review): consider moving these into a dedicated test file.
assert_equal( search_wikidata_for_image('Java', 'en')
            , 'https://upload.wikimedia.org/wikipedia/commons/0/0b/Gunung_Merapi_2006-05-14%2C_MODIS.jpg' )

assert_equal( search_wikidata_for_image('poop emoji', 'en')
            , 'https://upload.wikimedia.org/wikipedia/commons/6/6a/Emoji_u1f4a9.svg' )
|
|
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- General search for images
|
|
|
|
--- Look up one image URL per topic, trying providers in a fixed order:
-- wikidata (da then en), clearbit logos, shutterstock, then splashbase.
-- @param topics array of topic strings
-- @return table mapping topic -> image URL; topics with no hit are absent
function internet.search_images (topics)
  assert(type(topics) == 'table')
  if #topics == 0 then return {} end

  local topic_to_image_url = {}
  for _, topic in ipairs(topics) do
    -- `or` keeps only the first return value of each provider, so the
    -- secondary error-message returns never leak into the result.
    local url = search_wikidata_for_image(topic, 'da')
             or search_wikidata_for_image(topic, 'en')
             or search_clearbit_for_logo(topic:lower())
             or search_shutterstock_for_stock_photoes(topic)
             or search_splashbase_for_stock_photoes(topic:lower())
    topic_to_image_url[topic] = url
  end

  return topic_to_image_url
end
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Find images on reddit
|
|
|
|
--- Fetch the newest posts from /r/<subreddit> and keep those accepted by
-- `filter` (which defaults to accepting everything).
-- @param subreddit subreddit name (string)
-- @param filter optional predicate receiving each post's data table
-- @return array of post data tables; {} on request or decode failure
function internet.find_reddit_memes (subreddit, filter)

  -- Error check
  assert(type(subreddit) == 'string')
  filter = filter or function() return true end
  assert(type(filter) == 'function')

  local search_url = escape_url('https://www.reddit.com/r/'..subreddit..'/new.json')
  local body, code, headers, status = https.request(search_url)
  if not body then
    report_https_request_error(status, code)
    return {}
  end

  local data = json.decode(body)
  -- Fixed: reddit sometimes answers with non-JSON error pages; the old
  -- code indexed `data.data` unconditionally and crashed on those.
  local children = data and data.data and data.data.children
  if not children then return {} end

  local memes = {}
  for _, child in pairs(children) do
    local post = child.data
    if filter(post) then memes[#memes+1] = post end
  end

  return memes
end
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Download file
|
|
|
|
--- Retrieve the content of `url` and store it in `filename`.
-- file:// URLs are copied locally instead of fetched.
-- @return true on success; false plus the status code on HTTP failure
function internet.download_file (url, filename)
  assert(type(url) == 'string')
  assert(type(filename) == 'string')

  if url:match '^file://' then
    local path = url:match '^file://(.+)$'
    -- NOTE(review): path and filename are interpolated into a shell
    -- command; embedded quotes could inject commands. Consider copying
    -- via io.open/read/write instead.
    local ok = os.execute('cp "'..path..'" "'..filename..'"')
    -- Fixed: the old code ignored cp's result and always returned true.
    -- Lua 5.1 returns a numeric exit status, 5.2+ a boolean.
    return ok == 0 or ok == true
  end

  -- Fixed: use generic_request (https with http fallback) so plain
  -- http:// URLs — e.g. splashbase results — can be downloaded too.
  local body, code, headers, status = generic_request(url)

  if code ~= 200 then
    return false, code
  end
  assert(type(body) == 'string')

  -- Save the payload; "binary" mode keeps bytes intact on all platforms.
  local f = assert(io.open(filename, 'wb'))
  f:write(body)
  f:close()

  return true
end
|
|
|
|
--- Download a video with the external youtube-dl tool into a temp path.
-- @param url video page URL
-- @return expected path of the downloaded file
-- NOTE(review): `url` is interpolated into a shell command; quotes in a
-- hostile URL could inject commands — verify callers pass trusted URLs.
function internet.download_video (url)
  assert(type(url) == 'string')
  local video_filename = os.tmpname()
  local status = os.execute(('youtube-dl "%s" -o "%s"'):format(url, video_filename))
  -- Numeric comparison implies Lua 5.1/LuaJIT os.execute semantics.
  assert(status == 0)
  -- Assumes youtube-dl appended an .mkv container extension to the
  -- output template — TODO confirm; other formats would break this path.
  return video_filename..'.mkv'
end
|
|
|
|
--- Issue a HEAD request for `url` and return the response headers
-- table (nil when the request fails).
function internet.download_headers (url)
  assert(type(url) == 'string')
  local request_spec = {
    url = url,
    method = 'HEAD'
  }
  local _, code, headers, status = generic_request(request_spec)
  return headers
end
|
|
|
|
|
|
--------------------------------------------------------------------------------
|
|
|
|
return internet
|
|
|