-- memebot/internet.lua
-- (scraped listing header preserved as a comment so the chunk stays valid
-- Lua: 392 lines, 13 KiB, Lua)

local http = require 'socket.http'
local https = require 'ssl.https'
local md5 = require 'md5'
local json = require 'json'
local internet = {}
--------------------------------------------------------------------------------
-- Util
local function assert_equal (a, b)
  -- Return true when the two values compare equal; otherwise raise a
  -- descriptive error showing actual vs. expected.
  if a == b then
    return true
  end
  error(('Assertion failed!\n\tThis : %s\n\tShould be : %s'):format(a, b))
end
local SCANDI_SYMBOLS = { 'æ', 'Æ', 'ø', 'Ø', 'å', 'Å' }
-- True when `str` contains any Danish/Norwegian letter (æ ø å, either case).
-- Several of the image services below reject these characters.
local function string_contains_scandi (str)
  assert(type(str) == 'string')
  for i = 1, #SCANDI_SYMBOLS do
    -- plain find: the symbols are multi-byte UTF-8 literals, no pattern magic
    if str:find(SCANDI_SYMBOLS[i], 1, true) then
      return true
    end
  end
  return false
end
-- Percent-encode the characters that break the query URLs built in this
-- module (space and comma).  `non_standard` optionally maps a character to a
-- custom replacement string, e.g. { [' '] = '+' }.
-- The return is parenthesized: gsub also returns a substitution count, and
-- the original leaked it to callers that forward all results
-- (e.g. `return escape_url(url)`).
local function escape_url (url, non_standard)
  non_standard = non_standard or {}
  return (url:gsub(' ', non_standard[' '] or '%%20')
             :gsub(',', non_standard[','] or '%%2C'))
end
-- Escape every Lua-pattern magic character so `text` matches literally.
-- The original class '[+-?*]' was a character *range* from '+' (0x2B) to
-- '?' (0x3F): it wrongly escaped digits and punctuation ('abc1' became
-- 'abc%1' -- a back-reference that corrupts later matching) and it missed
-- ( ) % . [ ] ^ $.  The return is parenthesized to drop gsub's count.
local function escape_pattern (text)
  return (text:gsub('[%^%$%(%)%%%.%[%]%*%+%-%?]', '%%%1'))
end
-- Walk `base` along the sequence of keys in `path`.
-- Returns the final value, or nil plus the first key whose value was falsy.
local function safe_access (base, path)
  local current = base
  for _, key in ipairs(path) do
    local nxt = current[key]
    if not nxt then return nil, key end
    current = nxt
  end
  return current
end
-- Issue a request over HTTPS first; when that attempt produced no response
-- code or was refused outright, retry the same request over plain HTTP.
-- Forwards the luasocket-style results: body/ok, code, headers, status.
local function generic_request (...)
  local body, code, headers, status = https.request(...)
  if code ~= nil and status ~= 'connection refused' then
    return body, code, headers, status
  end
  return http.request(...)
end
-- Print a short diagnostic for a failed HTTP(S) request to stdout.
local function report_https_request_error (status, code)
  io.stdout:write(('Error when attempting request:\n Status: %s\n Code: %s\n')
    :format(tostring(status), tostring(code)))
end
--------------------------------------------------------------------------------
-- Searching Clearbit for logoes
-- Contains logoes
-- Probe Clearbit's logo endpoint across a few TLDs and return the first URL
-- that answers 200 to a HEAD request.  Returns nil (plus a message for
-- invalid topics) when nothing is found.
local function search_clearbit_for_logo (topic)
  local topic_is_valid =
    type(topic) == 'string' and topic == topic:lower() and #topic > 0
  if not topic_is_valid then
    return nil, 'Bad topic: '..tostring(topic)
  end
  if string_contains_scandi(topic) then
    return nil, 'Clearbit does not like æøå: '..tostring(topic)
  end
  local domains = { 'org', 'com', 'net', 'dk' }
  for i = 1, #domains do
    local candidate = ('https://logo-core.clearbit.com/%s.%s'):format(topic, domains[i])
    local _, code = https.request { url = candidate, method = 'HEAD' }
    if code == 200 then
      return candidate
    end
  end
end
--------------------------------------------------------------------------------
-- Searching Shutterstock for stockphotoes
local htmlparser = require 'htmlparser'
-- Scrape Shutterstock's search page and return the URL of a random matching
-- thumbnail.  Returns nil plus a message when the topic is invalid or no
-- image could be extracted; raises only on transport failure.
local function search_shutterstock_for_stock_photoes (topic)
  if not (type(topic) == 'string' and topic == topic:lower() and #topic > 0) then
    return nil, 'Bad topic: '..tostring(topic)
  elseif string_contains_scandi(topic) then
    -- fixed copy/paste: this message used to blame Splashbase
    return nil, 'Shutterstock does not like æøå: '..tostring(topic)
  end
  local search_url = 'https://www.shutterstock.com/search/'..escape_url(topic)
  local body, code, headers, status = https.request(search_url)
  if not body then error(code) end
  local html = htmlparser.parse(body, 10000)
  if not html then return nil, 'HTML could not decode data for '..topic end
  local img_elems = html:select 'img.z_g_i'
  -- math.random(0) raises "interval is empty"; guard the no-hits case.
  if #img_elems == 0 then return nil, 'Query returned no images for '..topic end
  local img_url = img_elems[math.random(#img_elems)].attributes.src
  if type(img_url) ~= 'string' then
    return nil, 'Selected image has no src attribute for '..topic
  end
  return img_url
end
--------------------------------------------------------------------------------
-- Searching splashbase for fairly-licensed stockphotoes
-- Query Splashbase for fairly-licensed stock photos; returns the URL of a
-- random hit, or nil plus a message on invalid topic / empty result.
local function search_splashbase_for_stock_photoes (topic)
  if not (type(topic) == 'string' and topic == topic:lower() and #topic > 0) then
    return nil, 'Bad topic: '..tostring(topic)
  elseif string_contains_scandi(topic) then
    return nil, 'Splashbase does not like æøå: '..tostring(topic)
  end
  local search_url = escape_url('http://www.splashbase.co/api/v1/images/search?query='..topic)
  -- The endpoint is plain http; ssl.https cannot fetch http:// URLs, so use
  -- socket.http (required at the top of this file) for this request.
  local body, code, headers, status = http.request(search_url)
  if not body then error(code) end
  local data = json.decode(body)
  if not data then return nil, 'JSON could not decode data for '..topic end
  -- Guard both a missing `images` field and an empty result list
  -- (the old `#data.images` crashed when the field was absent).
  if type(data.images) ~= 'table' or #data.images == 0 then
    return nil, 'Query returned no data for '..topic
  end
  local img_url = data.images[math.random(#data.images)].url
  assert(type(img_url) == 'string')
  return img_url
end
--------------------------------------------------------------------------------
-- Search wikipedia for images on pages
-- MediaWiki query URL; the %s slots are (language subdomain, |-separated
-- escaped titles).  Follows redirects and returns JSON.
-- NOTE(review): `prop` appears three times (pageimages, categories, links).
-- MediaWiki typically honors only the last duplicate, making the effective
-- prop `links` -- which is what get_disambiguation_links consumes -- and
-- leaving `piprop=original` inert.  Confirm before relying on pageimages.
local WIKIPEDIA_API_URL = 'https://%s.wikipedia.org/w/api.php?action=query&titles=%s&prop=pageimages&format=json&piprop=original&redirects=1&prop=categories&prop=links'
--[[
local WIKIPEDIA_DISAMBIGUATION_CATEGORIES_FOR_LANG = {
da = 'Kategori:Flertydig',
en = 'Category:All disambiguation pages'
}
--]]
-- Namespace 0 = article (content) pages; used to filter link results below.
local WIKIPEDIA_CONTENT_NAMESPACE = 0
-- Collect titles of content-namespace links on `page` whose lowercased
-- title contains the page's own (lowercased) title -- the shape of entries
-- on a disambiguation page.
local function get_disambiguation_links (page)
  assert(type(page) == 'table')
  local needle = escape_pattern(page.title:lower())
  local titles = {}
  for _, link in pairs(page.links or {}) do
    local is_candidate = link.title
      and link.ns == WIKIPEDIA_CONTENT_NAMESPACE
      and link.title:lower():match(needle)
    if is_candidate then
      titles[#titles+1] = link.title
    end
  end
  return titles
end
-- Fetch the given wikipedia pages (with their links) in one API call.
-- Returns a map from requested title -> page table, aliasing normalized and
-- redirected titles back to the requested spelling; {} on transport failure,
-- nil plus a message on a malformed payload.
local function get_wikipedia_pages (topics, language)
  assert(type(topics) == 'table')
  assert(type(language) == 'string')
  local titles_field = escape_url(table.concat(topics, '|'))
  local body, code, headers, status = https.request(WIKIPEDIA_API_URL:format(language, titles_field))
  if not body then
    report_https_request_error(status, code)
    return {}
  end
  local data = json.decode(body)
  if not data then return nil, 'JSON could not decode data from wikipedia for '..titles_field end
  -- API error payloads carry no `query` object; the old code crashed here.
  if type(data.query) ~= 'table' or type(data.query.pages) ~= 'table' then
    return nil, 'Wikipedia response had no query.pages for '..titles_field
  end
  -- Key pages by their canonical title ...
  local pages = {}
  for _, page in pairs(data.query.pages) do
    pages[page.title] = page
  end
  -- ... then make both normalized and redirected source titles resolve too.
  for _, mapping in pairs(data.query.normalized or {}) do
    pages[ mapping.from ] = pages[ mapping.to ]
  end
  for _, mapping in pairs(data.query.redirects or {}) do
    pages[ mapping.from ] = pages[ mapping.to ]
  end
  return pages
end
--------------------------------------------------------------------------------
-- Search Wikidata for infobox images
-- Commons file URL; %s slots are (first hex digit, first two hex digits,
-- filename) -- the MD5-based shard directories used by upload.wikimedia.org.
local WIKIMEDIA_IMAGE_PATH = 'https://upload.wikimedia.org/wikipedia/commons/%s/%s/%s'
-- wbgetentities query; %s slots are (site id such as 'enwiki', escaped title).
local WIKIDATA_API_URL = 'https://www.wikidata.org/w/api.php?action=wbgetentities&sites=%s&titles=%s&languages=en&props=claims&format=json&redirects=yes&normalize=yes'
-- A Wikidata entity is a disambiguation page when one of its P31
-- ("instance of") claims targets Q4167410 (Wikimedia disambiguation page).
local function is_disambiguation_entity (entity)
  local instance_claims = safe_access(entity, {'claims', 'P31'})
  for _, claim in ipairs(instance_claims or {}) do
    local target = safe_access(claim, {'mainsnak', 'datavalue', 'value', 'id'})
    if target == 'Q4167410' then
      return true
    end
  end
  return false
end
-- Claim properties that may carry an image filename, most wanted first:
-- P41 (flag image), P154 (logo image), P18 (image).
local IMAGE_CLAIM_PRIORITY = { 'P41', 'P154', 'P18' }
-- Return the first image filename found among the entity's claims in the
-- priority order above, or nil when the entity carries no image claim.
local function select_image_filename_from_entity (entity)
  local claims = entity.claims
  if not claims then return nil end
  for i = 1, #IMAGE_CLAIM_PRIORITY do
    for _, subclaim in pairs(claims[IMAGE_CLAIM_PRIORITY[i]] or {}) do
      local filename = safe_access(subclaim, { 'mainsnak', 'datavalue', 'value' })
      if filename then return filename end
    end
  end
end
-- Resolve `topic` through Wikidata and return the Wikimedia Commons URL of
-- an associated image, or nil (optionally plus a message).  When the topic
-- hits a disambiguation entity, a random same-named link from the matching
-- wikipedia page is retried recursively.
local function search_wikidata_for_image (topic, language)
  -- Assert and correction
  assert(type(topic) == 'string' or topic == nil)
  assert(type(language) == 'string' or language == nil)
  local language = language or 'en'
  local site = language..'wiki'
  -- (removed: dead `local topic_to_image_url = topic_to_image_url or {}`,
  -- which only ever produced a fresh unused table)
  -- Download and parse
  local body, code, headers, status = https.request(WIKIDATA_API_URL:format(site, escape_url(topic)))
  if not body then
    report_https_request_error(status, code)
    return nil
  end
  local data = json.decode(body)
  -- Fixed: this message referenced the undefined global `titles_field`,
  -- which itself crashed with "attempt to concatenate a nil value".
  if not data then return nil, 'JSON could not decode data from wikipedia for '..tostring(topic) end
  if data.success ~= 1 then return nil, 'Query was incorrect in some way' end
  -- Exactly one entity is expected for a single-title query.
  local entity_key = next(data.entities)
  if not entity_key then return end
  assert(next(data.entities, entity_key) == nil)
  local entity = data.entities[entity_key]
  -- On a disambiguation hit, pick a random related link and recurse.
  if is_disambiguation_entity (entity) then
    local wikipedia_pages = get_wikipedia_pages({topic}, language)
    local page = wikipedia_pages and wikipedia_pages[topic]
    -- Guard: the lookup can miss (transport failure or title mismatch); the
    -- old code passed nil into get_disambiguation_links and blew its assert.
    if not page then return nil, 'Kunne ikke hente '..language..' wikipedia siden for "'..topic..'"!' end
    local links = get_disambiguation_links(page)
    if #links <= 0 then return nil, 'Ramte flertydig '..language..' wikipedia side for "'..topic..'", men kunne ikke finde nogle links!' end
    return search_wikidata_for_image(links[math.random(#links)], language)
  end
  -- Find image, if any.  Commons places files in two hash directories taken
  -- from the md5 of the underscore-joined filename.
  local filename = select_image_filename_from_entity(entity)
  if not filename then return end
  filename = filename:gsub(' ', '_')
  local hex = md5.sumhexa(filename)
  local url = WIKIMEDIA_IMAGE_PATH:format(hex:sub(1,1), hex:sub(1,2), filename)
  return escape_url(url)
end
-- NOTE(review): these self-tests perform live network requests against
-- Wikidata/Wikipedia every time the module is require()d -- they slow
-- loading and make the module fail offline; consider moving them to a test
-- file.  The expected URLs depend on the current state of Wikidata, so a
-- failure here may reflect upstream edits rather than a code bug -- verify.
assert_equal( search_wikidata_for_image('Java', 'en')
, 'https://upload.wikimedia.org/wikipedia/commons/0/0b/Gunung_Merapi_2006-05-14%2C_MODIS.jpg' )
assert_equal( search_wikidata_for_image('poop emoji', 'en')
, 'https://upload.wikimedia.org/wikipedia/commons/6/6a/Emoji_u1f4a9.svg' )
--------------------------------------------------------------------------------
-- General search for images
-- For each topic, try the image sources in order of quality -- Wikidata
-- (Danish, then English), Clearbit logos, Shutterstock, Splashbase -- and
-- keep the first hit.  Returns a map topic -> image URL (absent on miss).
function internet.search_images (topics)
  assert(type(topics) == 'table')
  if #topics == 0 then return {} end
  local results = {}
  for _, topic in ipairs(topics) do
    results[topic] = search_wikidata_for_image(topic, 'da')
      or search_wikidata_for_image(topic, 'en')
      or search_clearbit_for_logo(topic:lower())
      or search_shutterstock_for_stock_photoes(topic)
      or search_splashbase_for_stock_photoes(topic:lower())
  end
  return results
end
--------------------------------------------------------------------------------
-- Find images on reddit
-- Fetch the newest posts of `subreddit` and return the post-data tables
-- accepted by `filter` (a predicate over a reddit post table; defaults to
-- accepting everything).  Returns {} when the request or payload fails.
function internet.find_reddit_memes (subreddit, filter)
  -- Error check
  assert(type(subreddit) == 'string')
  filter = filter or function() return true end
  assert(type(filter) == 'function')
  --
  local search_url = escape_url('https://www.reddit.com/r/'..subreddit..'/new.json')
  local body, code, headers, status = https.request(search_url)
  if not body then
    report_https_request_error(status, code)
    return {}
  end
  local data = json.decode(body)
  -- Reddit answers rate limits and errors with bodies that either fail to
  -- decode or lack the listing shape; the old code crashed on `data.data`.
  if not (data and data.data and data.data.children) then
    report_https_request_error(status, code)
    return {}
  end
  local memes = {}
  for _, child in pairs(data.data.children) do
    local post = child.data
    if filter(post) then memes[#memes+1] = post end
  end
  return memes
end
--------------------------------------------------------------------------------
-- Download file
-- Retrieve the content of `url` and store it in `filename`.
-- file:// URLs are copied locally; everything else goes through
-- https.request.  Returns true on success, or false plus the failure
-- reason / response code.
function internet.download_file (url, filename)
  assert(type(url) == 'string')
  assert(type(filename) == 'string')
  if url:match '^file://' then
    local path = url:match '^file://(.+)$'
    -- Copy with io instead of shelling out to `cp`: no shell-injection risk
    -- from quotes in the path, portable, and failures reach the caller (the
    -- old os.execute result was ignored, so it always returned true).
    local src, src_err = io.open(path, 'rb')
    if not src then return false, src_err end
    local content = src:read('*a')
    src:close()
    local dst, dst_err = io.open(filename, 'wb')
    if not dst then return false, dst_err end
    dst:write(content)
    dst:close()
    return true
  end
  --local body, code, headers, status = generic_request(url)
  local body, code, headers, status = https.request(url)
  if code ~= 200 then
    return false, code
  end
  assert(type(body) == 'string')
  -- save the content to a file, in "binary" mode for image payloads
  local f = assert(io.open(filename, 'wb'))
  f:write(body)
  f:close()
  return true
end
-- Download the video at `url` with youtube-dl into a temp file and return
-- the resulting .mkv path.  Raises on download failure or unsafe input.
function internet.download_video (url)
  assert(type(url) == 'string')
  -- The URL is interpolated into a shell command below: refuse characters
  -- that could break out of the double quotes (shell-injection guard).
  assert(not url:find("['\"`$\\%s]"), 'unsafe characters in url')
  local video_filename = os.tmpname()
  local status = os.execute(('youtube-dl "%s" -o "%s"'):format(url, video_filename))
  -- os.execute returns a number in Lua 5.1 but a boolean from 5.2 onwards;
  -- accept both so the module works on either runtime.
  assert(status == 0 or status == true, 'youtube-dl failed')
  -- NOTE(review): assumes youtube-dl appends '.mkv' to the -o target when
  -- merging streams -- TODO confirm for the formats actually downloaded.
  return video_filename..'.mkv'
end
-- HEAD-request `url` (HTTPS with HTTP fallback) and return the response
-- headers table, or nil when the request produced no response at all.
function internet.download_headers (url)
  assert(type(url) == 'string')
  local _, _, headers = generic_request { url = url, method = 'HEAD' }
  return headers
end
--------------------------------------------------------------------------------
return internet