-- internet.lua — helpers for searching/downloading images and files from
-- the web (wikidata, wikipedia, clearbit, shutterstock, splashbase, reddit).
local http = require 'socket.http'
|
|
local https = require 'ssl.https'
|
|
local md5 = require 'md5'
|
|
local json = require 'json'
|
|
|
|
-- Module table: public functions are attached here and returned at file end.
local internet = {}
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Util
|
|
|
|
--- Raise a descriptive error unless `a` equals `b`; returns true on success.
local function assert_equal (a, b)
  if a == b then return true end
  error(('Assertion failed!\n\tThis : %s\n\tShould be : %s'):format(a, b))
end
|
|
|
|
-- Danish/Norwegian letters that several third-party services reject.
local SCANDI_SYMBOLS = { 'æ', 'Æ', 'ø', 'Ø', 'å', 'Å' }

--- True when `str` contains any letter listed in SCANDI_SYMBOLS.
local function string_contains_scandi (str)
  assert(type(str) == 'string')
  for i = 1, #SCANDI_SYMBOLS do
    -- Plain find: the symbols contain no pattern magic characters.
    if str:find(SCANDI_SYMBOLS[i], 1, true) then return true end
  end
  return false
end
|
|
|
|
|
|
--- Percent-encode the characters that commonly break these request URLs.
-- @param url string to escape
-- @param non_standard optional map from character (' ' or ',') to a
--        replacement string overriding the defaults %20 / %2C
-- @return the escaped string (exactly one value)
local function escape_url (url, non_standard)
  non_standard = non_standard or {}
  -- Assigning to a local truncates gsub's (string, count) pair; the old
  -- `return url:gsub(...)` leaked the substitution count to callers that
  -- used this in a multi-value context (e.g. as a last function argument).
  local escaped = url
    :gsub(' ', non_standard[' '] or '%%20')
    :gsub(',', non_standard[','] or '%%2C')
  return escaped
end
|
|
|
|
--- Escape Lua pattern magic characters so `text` can be matched literally.
-- Fixed: the old class '[+-?*]' was a character *range* (+ .. ?) that
-- wrongly escaped digits and ordinary punctuation (producing invalid
-- capture references like %5 in the result) and missed ( ) [ ] % . ^ $.
-- @param text plain string
-- @return string safe to embed in string.match/find patterns (one value)
local function escape_pattern (text)
  return (text:gsub('[%^%$%(%)%%%.%[%]%*%+%-%?]', '%%%0'))
end
|
|
|
|
--- Walk nested tables along the keys in `path`.
-- Returns the final value, or nil plus the first key that was absent.
local function safe_access (base, path)
  local node = base
  for _, key in ipairs(path) do
    local child = node[key]
    if not child then return nil, key end
    node = child
  end
  return node
end
|
|
|
|
--- Perform a request over HTTPS, falling back to plain HTTP when the
-- HTTPS attempt yields no usable result (nil code or refused connection).
-- Returns the LuaSocket quadruple: body, code, headers, status.
local function generic_request (...)
  local body, code, headers, status = https.request(...)
  local https_ok = code ~= nil and status ~= 'connection refused'
  if https_ok then
    return body, code, headers, status
  end
  -- Retry unencrypted with exactly the same arguments.
  return http.request(...)
end
|
|
|
|
--- Print a short diagnostic for a failed HTTP(S) request to stdout.
local function report_https_request_error (status, code)
  local out = io.stdout
  out:write 'Error when attempting request:\n'
  out:write(' Status: ' .. tostring(status) .. '\n')
  out:write(' Code: ' .. tostring(code) .. '\n')
end
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Searching Clearbit for logoes
|
|
-- Contains logoes
|
|
|
|
--- Probe Clearbit's logo endpoint for `topic` across common TLDs.
-- `topic` must be a non-empty, all-lowercase string without æøå.
-- Returns the first URL answering HTTP 200, or nil plus a reason.
local function search_clearbit_for_logo (topic)
  local valid_topic = type(topic) == 'string' and topic == topic:lower() and #topic > 0
  if not valid_topic then
    return nil, 'Bad topic: '..tostring(topic)
  end
  if string_contains_scandi(topic) then
    return nil, 'Clearbit does not like æøå: '..tostring(topic)
  end
  -- Cheap HEAD request per candidate domain; first 200 wins.
  local domains = { 'org', 'com', 'net', 'dk' }
  for i = 1, #domains do
    local candidate = ('https://logo-core.clearbit.com/%s.%s'):format(topic, domains[i])
    local _, code = https.request { url = candidate, method = 'HEAD' }
    if code == 200 then return candidate end
  end
end
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Searching Shutterstock for stockphotoes
|
|
|
|
local htmlparser = require 'htmlparser'
|
|
|
|
--- Scrape Shutterstock's search results page for a random stock photo URL.
-- @param topic non-empty, all-lowercase search string without æøå
-- @return image URL on success; nil plus an error message otherwise
-- @raise on transport failure (error(code))
local function search_shutterstock_for_stock_photoes (topic)
  if not (type(topic) == 'string' and topic == topic:lower() and #topic > 0) then
    return nil, 'Bad topic: '..tostring(topic)
  elseif string_contains_scandi(topic) then
    -- Fixed: this message previously blamed Splashbase (copy-paste slip).
    return nil, 'Shutterstock does not like æøå: '..tostring(topic)
  end

  local search_url = 'https://www.shutterstock.com/search/'..escape_url(topic)
  local body, code, headers, status = https.request(search_url)
  if not body then error(code) end

  local html = htmlparser.parse(body, 10000)
  if not html then return nil, 'HTML could not decode data for '..topic end

  -- 'img.z_g_i' is the thumbnail class on the results page.
  local img_elems = html:select 'img.z_g_i'
  -- Fixed: math.random(0) raises "interval is empty" when nothing matched.
  if #img_elems == 0 then return nil, 'Query returned no images for '..topic end

  local img_url = img_elems[math.random(#img_elems)].attributes.src
  assert(type(img_url) == 'string')
  return img_url
end
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Searching splashbase for fairly-licensed stockphotoes
|
|
|
|
--- Query the Splashbase JSON API for a random fairly-licensed stock photo.
-- @param topic non-empty, all-lowercase search string without æøå
-- @return image URL on success; nil plus an error message otherwise
-- @raise on transport failure (error(code))
local function search_splashbase_for_stock_photoes (topic)
  if not (type(topic) == 'string' and topic == topic:lower() and #topic > 0) then
    return nil, 'Bad topic: '..tostring(topic)
  elseif string_contains_scandi(topic) then
    return nil, 'Splashbase does not like æøå: '..tostring(topic)
  end

  local search_url = escape_url('http://www.splashbase.co/api/v1/images/search?query='..topic)
  -- Fixed: the endpoint is plain http://, so use socket.http; ssl.https
  -- cannot complete a TLS handshake against an http URL.
  local body, code, headers, status = http.request(search_url)
  if not body then error(code) end

  local data = json.decode(body)
  if not data then return nil, 'JSON could not decode data for '..topic end
  if #data.images <= 0 then return nil, 'Query returned no data for '..topic end

  local img_url = data.images[math.random(#data.images)].url
  assert(type(img_url) == 'string')
  return img_url
end
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Search wikipedia for images on pages
|
|
|
|
-- MediaWiki query-API URL (format args: language code, titles field).
-- Fixed: the query string previously repeated the `prop` key three times
-- (pageimages / categories / links); servers keep only the last duplicate,
-- silently dropping pageimages and categories. Multiple props belong in a
-- single pipe-separated value (%7C is an URL-encoded '|'; doubled %% for
-- string.format).
local WIKIPEDIA_API_URL = 'https://%s.wikipedia.org/w/api.php?action=query&titles=%s&prop=pageimages%%7Ccategories%%7Clinks&format=json&piprop=original&redirects=1'
|
|
|
|
--[[
|
|
local WIKIPEDIA_DISAMBIGUATION_CATEGORIES_FOR_LANG = {
|
|
da = 'Kategori:Flertydig',
|
|
en = 'Category:All disambiguation pages'
|
|
}
|
|
--]]
|
|
|
|
-- MediaWiki namespace 0 = main/article ("content") namespace.
local WIKIPEDIA_CONTENT_NAMESPACE = 0
|
|
|
|
--- Collect titles of content-namespace links whose lowercase title
-- contains the page's own (pattern-escaped) lowercase title.
-- @param page decoded wikipedia page table with `title` and (optionally) `links`
-- @return array of matching link titles (possibly empty)
local function get_disambiguation_links (page)
  assert(type(page) == 'table')
  -- Escape the title so punctuation in it is matched literally.
  local needle = escape_pattern(page.title:lower())
  local result = {}
  for _, link in pairs(page.links or {}) do
    local title = link.title
    if title
    and link.ns == WIKIPEDIA_CONTENT_NAMESPACE
    and title:lower():match(needle) then
      result[#result+1] = title
    end
  end
  return result
end
|
|
|
|
--- Fetch wikipedia pages for `topics` and index them by requested title.
-- Normalised and redirected titles are aliased back to the title the
-- caller asked for, so lookup by the original spelling still succeeds.
-- Returns {} when the request itself fails, or nil plus a message when
-- the response body cannot be decoded.
local function get_wikipedia_pages (topics, language)
  assert(type(topics) == 'table')
  assert(type(language) == 'string')

  local titles_field = escape_url(table.concat(topics, '|'))

  local body, code, headers, status = https.request(WIKIPEDIA_API_URL:format(language, titles_field))
  if not body then
    report_https_request_error(status, code)
    return {}
  end

  local data = json.decode(body)
  if not data then return nil, 'JSON could not decode data from wikipedia for '..titles_field end

  -- Index by canonical title first, then alias normalised and redirected
  -- names (in that order) onto the same page tables.
  local pages = {}
  for _, page in pairs(data.query.pages) do
    pages[page.title] = page
  end
  local alias_lists = { data.query.normalized or {}, data.query.redirects or {} }
  for i = 1, #alias_lists do
    for _, mapping in pairs(alias_lists[i]) do
      pages[mapping.from] = pages[mapping.to]
    end
  end

  return pages
end
|
|
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Search Wikidata for infobox images
|
|
|
|
-- Commons file URL; the first two %s are md5(filename) hex-prefix shards
-- (1 and 2 chars), the third is the underscored filename.
local WIKIMEDIA_IMAGE_PATH = 'https://upload.wikimedia.org/wikipedia/commons/%s/%s/%s'
-- wbgetentities endpoint resolving a sitelink title (%s site, %s title)
-- to the entity's claims, following redirects and normalisation.
local WIKIDATA_API_URL = 'https://www.wikidata.org/w/api.php?action=wbgetentities&sites=%s&titles=%s&languages=en&props=claims&format=json&redirects=yes&normalize=yes'
|
|
|
|
--- True when the wikidata entity is an instance-of (P31) Q4167410,
-- the "Wikimedia disambiguation page" item.
local function is_disambiguation_entity (entity)
  local instance_claims = safe_access(entity, {'claims', 'P31'})
  for _, claim in ipairs(instance_claims or {}) do
    local target = safe_access(claim, {'mainsnak', 'datavalue', 'value', 'id'})
    if target == 'Q4167410' then return true end
  end
  return false
end
|
|
|
|
-- Image-bearing wikidata properties, most specific first:
-- P41 flag image, P154 logo image, P18 plain image.
local IMAGE_CLAIM_PRIORITY = { 'P41', 'P154', 'P18' }
|
|
|
|
--- Pick the best image filename from an entity's claims, honouring the
-- preference order in IMAGE_CLAIM_PRIORITY. Returns nil when no claim
-- carries a usable value.
local function select_image_filename_from_entity (entity)
  local claims = entity.claims
  if not claims then return nil end
  for _, property in ipairs(IMAGE_CLAIM_PRIORITY) do
    for _, subclaim in pairs(claims[property] or {}) do
      local filename = safe_access(subclaim, { 'mainsnak', 'datavalue', 'value' })
      if filename then return filename end
    end
  end
end
|
|
|
|
--- Resolve `topic` to an image URL via the Wikidata API, following one
-- level of disambiguation by recursing into a random candidate link.
-- @param topic page title (string)
-- @param language wikipedia language code; defaults to 'en'
-- @return image URL, or nil (optionally followed by an error message)
local function search_wikidata_for_image (topic, language)
  -- Assert and correction
  assert(type(topic) == 'string' or topic == nil)
  assert(type(language) == 'string' or language == nil)
  local language = language or 'en'
  local site = language..'wiki'
  -- (Removed a dead `local topic_to_image_url = topic_to_image_url or {}`
  -- line that read an undefined global and was never used.)

  -- Download and parse
  local body, code, headers, status = https.request(WIKIDATA_API_URL:format(site, escape_url(topic)))
  if not body then
    report_https_request_error(status, code)
    return nil
  end
  local data = json.decode(body)

  -- Fixed: the old message concatenated the undefined global
  -- `titles_field`, which itself crashed with "attempt to concatenate nil".
  if not data then return nil, 'JSON could not decode data from wikidata for '..tostring(topic) end
  if data.success ~= 1 then return nil, 'Query was incorrect in some way' end

  -- Find entity: exactly one is expected, since one title was requested.
  local entity_key = next(data.entities)
  if not entity_key then return end
  assert(next(data.entities, entity_key) == nil)
  local entity = data.entities[entity_key]

  -- Determine if hit disambiguation entity; if so, pick a random
  -- content-namespace link from the wikipedia page and retry with it.
  if is_disambiguation_entity (entity) then
    local wikipedia_pages = get_wikipedia_pages({topic}, language)
    local page = wikipedia_pages and wikipedia_pages[topic]
    -- Fixed: the old code indexed the page unconditionally and crashed
    -- inside get_disambiguation_links when the lookup failed.
    if not page then return nil, 'Kunne ikke hente '..language..' wikipedia siden for "'..topic..'"!' end
    local links = get_disambiguation_links(page)
    if #links <= 0 then return nil, 'Ramte flertydig '..language..' wikipedia side for "'..topic..'", men kunne ikke finde nogle links!' end
    return search_wikidata_for_image(links[math.random(#links)], language)
  end

  -- Find image, if any: map the claimed filename to its commons URL
  -- (the /x/xy/ shards come from the md5 of the underscored filename).
  local filename = select_image_filename_from_entity(entity)
  if not filename then return end
  filename = filename:gsub(' ', '_')
  local hex = md5.sumhexa(filename)
  local url = WIKIMEDIA_IMAGE_PATH:format(hex:sub(1,1), hex:sub(1,2), filename)

  return escape_url(url)
end
|
|
|
|
-- Load-time smoke tests: these hit the live Wikidata/Wikipedia APIs every
-- time the module is required, so requiring this module needs network
-- access and fails whenever either article's image changes upstream.
-- NOTE(review): consider moving these into a dedicated test file.
assert_equal( search_wikidata_for_image('Java', 'en')
            , 'https://upload.wikimedia.org/wikipedia/commons/0/0b/Gunung_Merapi_2006-05-14%2C_MODIS.jpg' )

assert_equal( search_wikidata_for_image('poop emoji', 'en')
            , 'https://upload.wikimedia.org/wikipedia/commons/6/6a/Emoji_u1f4a9.svg' )
|
|
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- General search for images
|
|
|
|
--- Look up one image URL per topic, trying providers in a fixed order:
-- wikidata (da then en), clearbit logos, shutterstock, then splashbase.
-- @param topics array of topic strings
-- @return table mapping topic -> image URL; topics with no hit are absent
function internet.search_images (topics)
  assert(type(topics) == 'table')
  if #topics == 0 then return {} end

  local topic_to_image_url = {}
  for _, topic in ipairs(topics) do
    -- `or` keeps only the first return value of each provider, so the
    -- secondary error-message returns never leak into the result.
    local url = search_wikidata_for_image(topic, 'da')
             or search_wikidata_for_image(topic, 'en')
             or search_clearbit_for_logo(topic:lower())
             or search_shutterstock_for_stock_photoes(topic)
             or search_splashbase_for_stock_photoes(topic:lower())
    topic_to_image_url[topic] = url
  end

  return topic_to_image_url
end
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Find images on reddit
|
|
|
|
--- Fetch the newest posts from /r/<subreddit> and keep those accepted by
-- `filter` (which defaults to accepting everything).
-- @param subreddit subreddit name (string)
-- @param filter optional predicate receiving each post's data table
-- @return array of post data tables; {} on request or decode failure
function internet.find_reddit_memes (subreddit, filter)

  -- Error check
  assert(type(subreddit) == 'string')
  filter = filter or function() return true end
  assert(type(filter) == 'function')

  local search_url = escape_url('https://www.reddit.com/r/'..subreddit..'/new.json')
  local body, code, headers, status = https.request(search_url)
  if not body then
    report_https_request_error(status, code)
    return {}
  end

  local data = json.decode(body)
  -- Fixed: reddit sometimes answers with non-JSON error pages; the old
  -- code indexed `data.data` unconditionally and crashed on those.
  local children = data and data.data and data.data.children
  if not children then return {} end

  local memes = {}
  for _, child in pairs(children) do
    local post = child.data
    if filter(post) then memes[#memes+1] = post end
  end

  return memes
end
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Download file
|
|
|
|
--- Retrieve the content of `url` and store it in `filename`.
-- file:// URLs are copied locally instead of fetched.
-- @return true on success; false plus the status code on HTTP failure
function internet.download_file (url, filename)
  assert(type(url) == 'string')
  assert(type(filename) == 'string')

  if url:match '^file://' then
    local path = url:match '^file://(.+)$'
    -- NOTE(review): path and filename are interpolated into a shell
    -- command; embedded quotes could inject commands. Consider copying
    -- via io.open/read/write instead.
    local ok = os.execute('cp "'..path..'" "'..filename..'"')
    -- Fixed: the old code ignored cp's result and always returned true.
    -- Lua 5.1 returns a numeric exit status, 5.2+ a boolean.
    return ok == 0 or ok == true
  end

  -- Fixed: use generic_request (https with http fallback) so plain
  -- http:// URLs — e.g. splashbase results — can be downloaded too.
  local body, code, headers, status = generic_request(url)

  if code ~= 200 then
    return false, code
  end
  assert(type(body) == 'string')

  -- Save the payload; "binary" mode keeps bytes intact on all platforms.
  local f = assert(io.open(filename, 'wb'))
  f:write(body)
  f:close()

  return true
end
|
|
|
|
--- Download a video with the external youtube-dl tool into a temp path.
-- @param url video page URL
-- @return expected path of the downloaded file
-- NOTE(review): `url` is interpolated into a shell command; quotes in a
-- hostile URL could inject commands — verify callers pass trusted URLs.
function internet.download_video (url)
  assert(type(url) == 'string')
  local video_filename = os.tmpname()
  local status = os.execute(('youtube-dl "%s" -o "%s"'):format(url, video_filename))
  -- Numeric comparison implies Lua 5.1/LuaJIT os.execute semantics.
  assert(status == 0)
  -- Assumes youtube-dl appended an .mkv container extension to the
  -- output template — TODO confirm; other formats would break this path.
  return video_filename..'.mkv'
end
|
|
|
|
--- Issue a HEAD request for `url` and return the response headers
-- table (nil when the request fails).
function internet.download_headers (url)
  assert(type(url) == 'string')
  local request_spec = {
    url = url,
    method = 'HEAD'
  }
  local _, code, headers, status = generic_request(request_spec)
  return headers
end
|
|
|
|
|
|
--------------------------------------------------------------------------------
|
|
|
|
return internet
|
|
|