-- errors/string_distance.lua

---- String Distance.lua ----
-- A submodule of the `errors` library, with various string distance functions.
-- Utilities for using these distance functions are also present.
--
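-- Each distance function returns 3 values:
-- * How similar the strings were (the raw score of the metric).
-- * The score that denotes maximum similarity.
-- * The score that denotes least similarity.
-- By using these values, it's possible to normalize a score into the range
-- [0, 1] with `(score - least) / (maximum - least)`, where 1 is most similar.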
--
-- The `levenshtein` function is based on: https://gist.github.com/james2doyle/e406180e143da3bdd102

--------------------------------------------------------------------------------
-- Utility functions


local function split_string_into_words (str)
    -- split_string_into_words(str)
    --
    -- Breaks `str` into a list of lowercased words. A word is an optional
    -- uppercase ASCII letter followed by any number of lowercase ASCII
    -- letters, so 'helloWorld' becomes {'hello', 'world'}. Every other
    -- character (digits, whitespace, punctuation, non-ASCII bytes) acts as a
    -- separator, and a run of uppercase letters yields one word per letter.
    --
    -- TODO: Add unicode support

    assert(str)

    local found = {}
    for candidate in str:gmatch('[A-Z]?[a-z]*') do
        -- The pattern can match the empty string (e.g. at a separator);
        -- those matches are not words.
        if candidate ~= '' then
            table.insert(found, candidate:lower())
        end
    end
    return found
end

--------------------------------------------------------------------------------
-- Similarity metrics

local function levenshtein (str1, str2)
    -- levenshtein(str1, str2)
    --
    -- Calculates the amount of 'inserts', 'removals' or 'substitutions'
    -- required to transform `str1` into `str2`, and vice versa.
    -- Note that the strings are automatically converted to lowercase, and
    -- that they are compared byte by byte (no unicode awareness).
    --
    -- Returns 3 values:
    -- * The distance. Lower numbers denote more similar strings.
    -- * The distance denoting maximum similarity: the difference in length
    --   between the strings, which no edit script can beat.
    -- * The distance denoting least similarity: the length of the longest
    --   string.
    --
    -- Adapted from the C version given at: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance

    -- Error handling.
    assert(type(str1) == 'string')
    assert(type(str2) == 'string')

    -- Do work
    local lower1, lower2 = str1:lower(), str2:lower()
    local len1, len2 = #lower1, #lower2
    local best, worst = math.abs(len1 - len2), math.max(len1, len2)

    -- Quick cut-offs to save time
    if len1 == 0 then
        return len2, best, worst
    elseif len2 == 0 then
        return len1, best, worst
    elseif lower1 == lower2 then
        return 0, best, worst
    end

    -- Init column: the distance from each prefix of `lower1` to the empty
    -- string. Index 0 (the empty prefix) is set at the start of every pass.
    local column = {}
    for y = 1, len1 do  column[y] = y  end

    -- Algorithm: one pass per character of `lower2`, reusing a single column.
    for x = 1, len2 do
        column[0] = x
        local byte2 = lower2:byte(x)
        local lastdiag = x - 1
        for y = 1, len1 do
            local olddiag = column[y]
            -- Lua strings are 1-indexed, so prefix lengths `y` and `x` are
            -- also the positions of the characters being compared. (The C
            -- original uses `y-1`/`x-1` because C strings are 0-indexed.)
            local cost = (lower1:byte(y) == byte2) and 0 or 1
            column[y] = math.min(column[y] + 1, column[y-1] + 1, lastdiag + cost)
            lastdiag = olddiag
        end
    end

    -- Return the last value - this is the Levenshtein distance
    return column[len1], best, worst
end

local function longest_common_subsequence (str1, str2)
    -- longest_common_subsequence(str1, str2)
    --
    -- Calculates the length of the longest common subsequence of the two
    -- input strings: the largest number of characters that occur in the same
    -- order in both strings, possibly with other characters in between. This
    -- is not the same as the longest common substring.
    -- Note that the strings are automatically converted to lowercase, and are
    -- compared byte by byte.
    --
    -- Returns 3 values:
    -- * The length of the subsequence. Higher numbers denote more similar
    --   strings.
    -- * The value denoting maximum similarity: the length of the longest
    --   string.
    -- * The value denoting least similarity: always 0.

    -- Error handling.
    assert(type(str1) == 'string')
    assert(type(str2) == 'string')

    local a, b = str1:lower(), str2:lower()
    local len_a, len_b = #a, #b

    -- Quick cut-offs to save time
    if a == b then
        return len_a, len_a, 0
    end
    if len_a == 0 or len_b == 0 then
        return 0, math.max(len_a, len_b), 0
    end

    -- Only the previous row of the table is needed to fill in the next, so
    -- two rows are kept rather than the whole table. Row 0 is all zeroes.
    local previous = {}
    for j = 0, len_b do  previous[j] = 0  end

    for i = 1, len_a do
        local current = {[0] = 0}
        local byte_a = a:byte(i)
        for j = 1, len_b do
            if byte_a == b:byte(j) then
                -- Matching characters extend the subsequence of both prefixes.
                current[j] = previous[j-1] + 1
            else
                -- Otherwise carry over the best result found so far.
                current[j] = math.max(current[j-1], previous[j])
            end
        end
        previous = current
    end

    -- The last cell of the last row covers both complete strings.
    return previous[len_b], math.max(len_a, len_b), 0
end

local function jaccard_similarity_of_words (str1, str2)
    -- jaccard_similarity_of_words(str1, str2)
    --
    -- Calculates the jaccard similarity of the words in the strings: the
    -- number of distinct words present in both strings, divided by the number
    -- of distinct words present in either. Words are found and lowercased by
    -- `split_string_into_words`.
    --
    -- Returns 3 values:
    -- * The similarity. Higher numbers denote more similar strings. At 1 the
    --   strings contain exactly the same words.
    -- * The value denoting maximum similarity: always 1.
    -- * The value denoting least similarity: always 0.

    -- Error handling.
    assert(type(str1) == 'string')
    assert(type(str2) == 'string')

    -- Quick cut-offs to save time. The equality test also covers the case
    -- where both strings are empty.
    if str1:lower() == str2:lower() then
        return 1, 1, 0
    elseif str1 == '' or str2 == '' then
        return 0, 1, 0
    end

    -- Collect the set of words of each string, and the union of both.
    local words1, words2, all = {}, {}, {}
    for _, word in ipairs(split_string_into_words(str1)) do
        words1[word], all[word] = true, true
    end
    for _, word in ipairs(split_string_into_words(str2)) do
        words2[word], all[word] = true, true
    end

    -- Which words are in common?
    local num_in_common, num_words_in_total = 0, 0
    for word in pairs(all) do
        num_words_in_total = num_words_in_total + 1
        if words1[word] and words2[word] then  num_in_common = num_in_common + 1  end
    end

    -- Neither string contained any words (e.g. '123' and '456'). Dividing
    -- would give 0/0 (NaN), so report them as dissimilar instead, just like
    -- when comparing against an empty string.
    if num_words_in_total == 0 then
        return 0, 1, 0
    end

    -- Return similarity
    return num_in_common/num_words_in_total, 1, 0
end

-- The metrics that `strings_with_highest_similarity` runs, in order, for
-- every candidate string. Each entry follows the 3-return-value convention
-- of this module.
local SIMILARITY_METRICS = {}
SIMILARITY_METRICS[1] = levenshtein
SIMILARITY_METRICS[2] = longest_common_subsequence
SIMILARITY_METRICS[3] = jaccard_similarity_of_words

--------------------------------------------------------------------------------

local function strings_with_highest_similarity (str, list_of_other_str)
    -- strings_with_highest_similarity(str, list)
    --
    -- Returns a new list, containing the strings of `list`, sorted by how
    -- similar they are to `str`, in descending order, eg. the first element
    -- in the output list is the most similar. The input list is not modified.
    --
    -- The similarity of a string is the sum of every metric in
    -- `SIMILARITY_METRICS`, each normalized to the range [0, 1]. The relative
    -- order of strings with the same total is unspecified, as `table.sort`
    -- is not stable.
    --
    -- Raises an error if `str` is not a string, or if `list` is not a
    -- sequence of strings.

    -- Error checking. Values are passed through `tostring`, as '%s' does not
    -- accept non-string values on every Lua version.
    if type(str) ~= 'string' then
        error(('[errors/internal]: Bad argument #1, expected string, got %s (%s)'):format(tostring(str), type(str)))
    end
    if type(list_of_other_str) ~= 'table' then
        error(('[errors/internal]: Bad argument #2, expected table, got %s (%s)'):format(tostring(list_of_other_str), type(list_of_other_str)))
    end
    for i = 1, #list_of_other_str do
        local other = list_of_other_str[i]
        if type(other) ~= 'string' then
            error(('[errors/internal]: Bad argument #2, expected sequence of strings, but got %s (%s) on index %i'):format(tostring(other), type(other), i))
        end
    end

    -- Do work
    local possible = {}

    -- Calculate similarity metrics
    for _, other_str in ipairs(list_of_other_str) do
        local total_sim = 0
        for _, similarity_func in ipairs(SIMILARITY_METRICS) do
            local sim, max_sim, min_sim = similarity_func(str, other_str)
            local range = max_sim - min_sim
            -- A metric whose best and worst scores coincide (this happens
            -- when empty strings are involved) cannot tell the strings
            -- apart, and normalizing would divide by zero, so it is skipped.
            if range ~= 0 then
                local normalized = (sim - min_sim)/range
                -- NaN is the only value not equal to itself. It must be kept
                -- out of the total, as it would break the ordering below.
                if normalized == normalized then
                    total_sim = total_sim + normalized
                end
            end
        end
        possible[#possible+1] = {other_str, total_sim}
    end

    -- Sort and flatten
    table.sort(possible, function(a, b) return a[2] > b[2] end)
    for i = 1, #possible do  possible[i] = possible[i][1]  end

    -- Return the sorted list
    return possible
end

--------------------------------------------------------------------------------

-- Public interface of the module: the three metrics, and the ranking
-- function that combines them.
local exports = {}
exports.levenshtein = levenshtein
exports.longest_common_subsequence = longest_common_subsequence
exports.jaccard_similarity_of_words = jaccard_similarity_of_words
exports.strings_with_highest_similarity = strings_with_highest_similarity
return exports
Initial implementation of errors library. 2017-06-09 13:22:25 +00:00			`---- String Distance.lua ----`
			-- A submodule of the `errors` library, with various string distance functions.
			`-- Utilities for using these distance functions are also present.`
			`--`
			`-- Each distance function returns 3 values:`
			`-- * How similar the strings were.`
			`-- * The value of maximum similarity.`
			`-- * The value of least similarity.`
			`-- By using these values, it's possible to normalize.`
			`--`
			-- The `levenshtein` function is based on: https://gist.github.com/james2doyle/e406180e143da3bdd102

			`--------------------------------------------------------------------------------`
			`-- Utility functions`


			`local function split_string_into_words (str)`
			`-- TODO: Add unicode support`
			`-- TODO: Add support for splitting 'helloWorld' into {'hello', 'World'}`

			`assert(str)`

			`local words = {}`
			`for word in str:gmatch('[A-Z]?[a-z]*') do`
			`if #word > 0 then words[#words+1] = word:lower() end`
			`end`
			`return words`
			`end`

			`--------------------------------------------------------------------------------`
			`-- Similarity metrics`

			`local function levenshtein (str1, str2)`
			`-- levenshtein(str1, str2)`
			`--`
			`-- Calculates the amount of 'inserts', 'removals' or 'substitutions'`
			-- required to transform `str1` into `str2`, and vice versa.
			`-- Note that the strings are automatically converted to lowercase.`
			`--`
			`-- Lower numbers denote more similar strings.`
			`--`
			`-- Adapted from the C version given at: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance`

			`-- Error handeling.`
			`assert(type(str1) == 'string')`
			`assert(type(str2) == 'string')`


			`-- Do work`
			`local str1, str2 = str1:lower(), str2:lower()`
			`local len1, len2 = #str1, #str2`

			`-- Quick cut-offs to save time`
			`if len1 == 0 then`
			`return len2, math.abs(len1 - len2), math.max(len1, len2)`
			`elseif len2 == 0 then`
			`return len1, math.abs(len1 - len2), math.max(len1, len2)`
			`elseif str1 == str2 then`
			`return 0, math.abs(len1 - len2), math.max(len1, len2)`
			`end`

			`-- Init column`
			`local column = {}`
			`for y = 1, len1 do column[y] = y end`

			`-- Algorithm`
			`for x = 1, len2 do`
			`column[0] = x`
			`local lastdiag, olddiag = x - 1, nil`
			`for y = 1, len1 do`
			`olddiag = column[y]`
			`column[y] = math.min(column[y] + 1, column[y-1] + 1, lastdiag + (str1:byte(y-1) == str2:byte(x-1) and 0 or 1))`
			`lastdiag = olddiag`
			`end`
			`end`

			`-- Return the last value - this is the Levenshtein distance`
			`return column[len1], math.abs(len1 - len2), math.max(len1, len2)`
			`end`

			`local function longest_common_subsequence (str1, str2)`
			`-- longest_common_subsequence(str1, str2)`
			`--`
			`-- Calculates the longest common subsequence, of the two input strings.`
			`-- That is the maximum amount of characters who fit into the same places in`
			`-- both strings, with possible characters in betweens. This is not the same`
			`-- as longest common substring.`
			`-- Note that the strings are automatically converted to lowercase.`
			`--`
			`-- Higher numbers denote more similar strings.`

			`-- Error handeling.`
			`assert(type(str1) == 'string')`
			`assert(type(str2) == 'string')`

			`-- Do work`
			`local str1, str2 = str1:lower(), str2:lower()`
			`local len1, len2 = #str1, #str2`

			`-- Quick cut-offs to save time`
			`if str1 == str2 then`
			`return len1, len1, 0`
			`elseif len1 == 0 or len2 == 0 then`
			`return 0, math.max(len1, len2), 0`
			`end`

			`-- Init C`
			`local matrix = {}`
			`for i = 0, len1 do matrix[i] = {[0] = 0} end`
			`for j = 0, len2 do matrix[0][j] = 0 end`

			`-- Fill up table`
			`for i = 1, len1 do`
			`for j = 1, len2 do`
			`matrix[i][j] = (str1:byte(i) == str2:byte(j)) and (matrix[i-1][j-1] + 1) or math.max(matrix[i][j-1], matrix[i-1][j])`
			`end`
			`end`

			`-- Return`
			`return matrix[len1][len2], math.max(len1, len2), 0`
			`end`

			`local function jaccard_similarity_of_words (str1, str2)`
			`-- jaccard_similarity_of_words(str1, str2)`
			`--`
			`-- Calculates the jaccard similarity of the words in the strings.`
			`--`
			`-- Higher numbers denote more similar strings. At 1 the strings contain`
			`-- exactly the same words.`

			`-- Error handeling.`
			`assert(type(str1) == 'string')`
			`assert(type(str2) == 'string')`

			`-- Quick cut-offs to save time`
			`if str1:lower() == str2:lower() or str1 == '' and str2 == '' then`
			`return 1, 1, 0`
			`elseif str1 == '' or str2 == '' then`
			`return 0, 1, 0`
			`end`

			`-- Work work`
			`local words1, words2, all = {}, {}, {}, {}`
			`for _, word in ipairs(split_string_into_words(str1)) do`
			`words1[word], all[word] = true, true`
			`end`
			`for _, word in ipairs(split_string_into_words(str2)) do`
			`words2[word], all[word] = true, true`
			`end`

			`-- Which words are in common?`
			`local num_in_common, num_words_in_total = 0, 0`
			`for word, _ in pairs(all) do`
			`num_words_in_total = num_words_in_total + 1`
			`if words1[word] and words2[word] then num_in_common = num_in_common + 1 end`
			`end`

			`-- Return similarity`
			`return num_in_common/num_words_in_total, 1, 0`
			`end`

			`local SIMILARITY_METRICS = {`
			`levenshtein,`
			`longest_common_subsequence,`
			`jaccard_similarity_of_words,`
			`}`

			`--------------------------------------------------------------------------------`

			`local function strings_with_highest_similarity (str, list_of_other_str)`
			`-- strings_with_highest_similarity(str, list)`
			`--`
			`-- Returns a new list, sorted by comparing the strings in the list to the`
			`-- predefined string, sorted in descending order, eg. the first elements in`
			`-- the output list is the most similar.`

			`-- Error checking`
Improved error checking messages for `strings_with_highest_similarity`. 2017-08-27 09:47:05 +00:00			`if type(str) ~= 'string' then error(('[errors/internal]: Bad argument #1, expected string, got %s (%s)'):format(str, type(str))) end`
			`if type(list_of_other_str) ~= 'table' then error(('[errors/internal]: Bad argument #2, expected table, got %s (%s)'):format(list_of_other_str, type(list_of_other_str))) end`
			`for i = 1, #list_of_other_str do`
			`if type(list_of_other_str[i]) ~= 'string' then error(('[errors/internal]: Bad argument #2, expected sequence of strings, but got %s (%s) on index %i'):format(list_of_other_str[i], type(list_of_other_str[i])), i) end`
			`end`
Initial implementation of errors library. 2017-06-09 13:22:25 +00:00
			`-- Do work`
			`local possible = {}`

			`-- Calculate similarity metrics`
			`for _, other_str in ipairs(list_of_other_str) do`
			`local total_sim = 0`
			`--print(other_str)`
			`for _, similarity_func in ipairs(SIMILARITY_METRICS) do`
			`local sim, max_sim, min_sim = similarity_func(str, other_str)`
			`assert(max_sim ~= min_sim)`
			`total_sim = total_sim + (sim-min_sim)/(max_sim-min_sim)`
			`--print('', sim, (sim-min_sim)/(max_sim-min_sim))`
			`end`
			`possible[#possible+1] = {other_str, total_sim}`
			`--print('\tTotal: '.. total_sim)`
			`end`

			`-- Sort and flatten`
			`table.sort(possible, function(a, b) return a[2] > b[2] end)`
			`for i = 1, #possible do possible[i] = possible[i][1] end`

			`-- Return the sorted list`
			`return possible`
			`end`

			`--------------------------------------------------------------------------------`

			`return {`
			`levenshtein = levenshtein,`
			`longest_common_subsequence = longest_common_subsequence,`
			`jaccard_similarity_of_words = jaccard_similarity_of_words,`
			`strings_with_highest_similarity = strings_with_highest_similarity,`
			`}`