216 lines
7.7 KiB
Lua
216 lines
7.7 KiB
Lua
---- String Distance.lua ----
|
|
-- A submodule of the `errors` library, with various string distance functions.
|
|
-- Utilities for using these distance functions are also present.
|
|
--
|
|
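-- Each distance function returns 3 values:
--   * The raw score for the two strings.
--   * The score that denotes maximum similarity for those inputs.
--   * The score that denotes least similarity for those inputs.
-- With these values a score can be normalized to the range 0..1 (1 being the
-- most similar) as `(score - least) / (most - least)`, provided that
-- `most ~= least`.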
|
|
--
|
|
-- The `levenshtein` function is based on: https://gist.github.com/james2doyle/e406180e143da3bdd102
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Utility functions
|
|
|
|
|
|
local function split_string_into_words (str)
    -- split_string_into_words(str)
    --
    -- Splits `str` into a sequence of lowercased words and returns it as a
    -- new table. A word is an optional ASCII uppercase letter followed by any
    -- number of ASCII lowercase letters, so both 'helloWorld' and
    -- 'hello world' yield {'hello', 'world'}. Every other character (digits,
    -- punctuation, whitespace, ...) acts as a separator.
    --
    -- Note that runs of uppercase letters are split letter by letter:
    -- 'HTTPServer' yields {'h', 't', 't', 'p', 'server'}.
    --
    -- Raises an error if `str` is not a string.
    --
    -- TODO: Add unicode support

    -- Error handling.
    if type(str) ~= 'string' then error(('[errors/internal]: Bad argument #1 to split_string_into_words, expected string, got %s (%s)'):format(tostring(str), type(str))) end

    local words = {}
    for word in str:gmatch('[A-Z]?[a-z]*') do
        -- The pattern also matches the empty string at separators; skip those.
        if #word > 0 then words[#words+1] = word:lower() end
    end
    return words
end
|
|
|
|
--------------------------------------------------------------------------------
|
|
-- Similarity metrics
|
|
|
|
local function levenshtein (str1, str2)
    -- levenshtein(str1, str2)
    --
    -- Calculates the amount of 'inserts', 'removals' or 'substitutions'
    -- required to transform `str1` into `str2`, and vice versa.
    -- Note that the strings are automatically converted to lowercase, and
    -- that they are compared byte by byte.
    --
    -- Lower numbers denote more similar strings.
    --
    -- Returns 3 values:
    --   * The distance.
    --   * The distance denoting maximum similarity: `|#str1 - #str2|`.
    --   * The distance denoting least similarity: `max(#str1, #str2)`.
    -- When either string is empty the last two values are equal, so callers
    -- that normalize must handle that case.
    --
    -- Raises an error if either argument is not a string.
    --
    -- Adapted from the C version given at: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance

    -- Error handling.
    if type(str1) ~= 'string' then error(('[errors/internal]: Bad argument #1 to levenshtein, expected string, got %s (%s)'):format(tostring(str1), type(str1))) end
    if type(str2) ~= 'string' then error(('[errors/internal]: Bad argument #2 to levenshtein, expected string, got %s (%s)'):format(tostring(str2), type(str2))) end

    -- Do work
    str1, str2 = str1:lower(), str2:lower()
    local len1, len2 = #str1, #str2
    local most_similar, least_similar = math.abs(len1 - len2), math.max(len1, len2)

    -- Quick cut-offs to save time
    if len1 == 0 then
        return len2, most_similar, least_similar
    elseif len2 == 0 then
        return len1, most_similar, least_similar
    elseif str1 == str2 then
        return 0, most_similar, least_similar
    end

    -- Init column: the distance from each prefix of `str1` to the empty string.
    local column = {}
    for y = 1, len1 do column[y] = y end

    -- Algorithm: single-column dynamic programming, one pass per byte of `str2`.
    for x = 1, len2 do
        column[0] = x
        local byte2 = str2:byte(x)
        -- `lastdiag` holds the value diagonally up-left of the current cell.
        local lastdiag = x - 1
        for y = 1, len1 do
            local olddiag = column[y]
            -- Lua strings are 1-indexed, so position y/x is compared directly.
            local cost = (str1:byte(y) == byte2) and 0 or 1
            column[y] = math.min(column[y] + 1, column[y-1] + 1, lastdiag + cost)
            lastdiag = olddiag
        end
    end

    -- Return the last value - this is the Levenshtein distance
    return column[len1], most_similar, least_similar
end
|
|
|
|
local function longest_common_subsequence (str1, str2)
    -- longest_common_subsequence(str1, str2)
    --
    -- Calculates the length of the longest common subsequence of the two
    -- input strings. That is the maximum amount of characters that appear in
    -- the same order in both strings, possibly with other characters in
    -- between. This is not the same as the longest common substring.
    -- Note that the strings are automatically converted to lowercase, and
    -- that they are compared byte by byte.
    --
    -- Higher numbers denote more similar strings.
    --
    -- Returns 3 values:
    --   * The length of the longest common subsequence.
    --   * The length denoting maximum similarity: `max(#str1, #str2)`.
    --   * The length denoting least similarity: always 0.
    -- When both strings are empty all three values are 0, so callers that
    -- normalize must handle that case.
    --
    -- Raises an error if either argument is not a string.

    -- Error handling.
    if type(str1) ~= 'string' then error(('[errors/internal]: Bad argument #1 to longest_common_subsequence, expected string, got %s (%s)'):format(tostring(str1), type(str1))) end
    if type(str2) ~= 'string' then error(('[errors/internal]: Bad argument #2 to longest_common_subsequence, expected string, got %s (%s)'):format(tostring(str2), type(str2))) end

    -- Do work
    str1, str2 = str1:lower(), str2:lower()
    local len1, len2 = #str1, #str2

    -- Quick cut-offs to save time
    if str1 == str2 then
        return len1, len1, 0
    elseif len1 == 0 or len2 == 0 then
        return 0, math.max(len1, len2), 0
    end

    -- Only the previous row of the dynamic programming table is ever read,
    -- so two rows are kept instead of the full len1 x len2 matrix.
    local previous_row = {}
    for j = 0, len2 do previous_row[j] = 0 end

    -- Fill up the table row by row
    for i = 1, len1 do
        local byte1 = str1:byte(i)
        local current_row = {[0] = 0}
        for j = 1, len2 do
            if byte1 == str2:byte(j) then
                current_row[j] = previous_row[j-1] + 1
            else
                current_row[j] = math.max(current_row[j-1], previous_row[j])
            end
        end
        previous_row = current_row
    end

    -- Return
    return previous_row[len2], math.max(len1, len2), 0
end
|
|
|
|
local function jaccard_similarity_of_words (str1, str2)
    -- jaccard_similarity_of_words(str1, str2)
    --
    -- Calculates the jaccard similarity of the words in the strings: the
    -- number of distinct words present in both strings, divided by the number
    -- of distinct words present in either. Words are found with
    -- `split_string_into_words`.
    --
    -- Higher numbers denote more similar strings. At 1 the strings contain
    -- exactly the same words.
    --
    -- Returns 3 values:
    --   * The similarity, between 0 and 1.
    --   * The similarity denoting maximum similarity: always 1.
    --   * The similarity denoting least similarity: always 0.
    --
    -- Raises an error if either argument is not a string.

    -- Error handling.
    assert(type(str1) == 'string')
    assert(type(str2) == 'string')

    -- Quick cut-offs to save time
    if str1:lower() == str2:lower() or str1 == '' and str2 == '' then
        return 1, 1, 0
    elseif str1 == '' or str2 == '' then
        return 0, 1, 0
    end

    -- Work work: collect the set of words of each string, and their union.
    local words1, words2, all = {}, {}, {}
    for _, word in ipairs(split_string_into_words(str1)) do
        words1[word], all[word] = true, true
    end
    for _, word in ipairs(split_string_into_words(str2)) do
        words2[word], all[word] = true, true
    end

    -- Which words are in common?
    local num_in_common, num_words_in_total = 0, 0
    for word, _ in pairs(all) do
        num_words_in_total = num_words_in_total + 1
        if words1[word] and words2[word] then num_in_common = num_in_common + 1 end
    end

    -- Neither string contains any words (e.g. '123' and '456'). The strings
    -- are known to differ at this point, and dividing would produce 0/0
    -- (NaN), so report no similarity instead.
    if num_words_in_total == 0 then
        return 0, 1, 0
    end

    -- Return similarity
    return num_in_common/num_words_in_total, 1, 0
end
|
|
|
|
-- The metrics that `strings_with_highest_similarity` walks through with
-- `ipairs` and sums up, after normalizing each one.
local SIMILARITY_METRICS = {}
SIMILARITY_METRICS[1] = levenshtein
SIMILARITY_METRICS[2] = longest_common_subsequence
SIMILARITY_METRICS[3] = jaccard_similarity_of_words
|
|
|
|
--------------------------------------------------------------------------------
|
|
|
|
local function strings_with_highest_similarity (str, list_of_other_str)
    -- strings_with_highest_similarity(str, list)
    --
    -- Returns a new list containing the strings of `list`, sorted by how
    -- similar they are to `str`, in descending order, eg. the first element
    -- in the output list is the most similar. Strings with equal similarity
    -- keep their relative order from the input list. The input list is not
    -- modified.
    --
    -- The similarity of a string is the sum of every metric in
    -- `SIMILARITY_METRICS`, each normalized to the range 0..1.
    --
    -- Raises an error if `str` is not a string, or if `list` is not a
    -- sequence of strings.

    -- Error checking
    if type(str) ~= 'string' then error(('[errors/internal]: Bad argument #1, expected string, got %s (%s)'):format(tostring(str), type(str))) end
    if type(list_of_other_str) ~= 'table' then error(('[errors/internal]: Bad argument #2, expected table, got %s (%s)'):format(tostring(list_of_other_str), type(list_of_other_str))) end
    for i = 1, #list_of_other_str do
        if type(list_of_other_str[i]) ~= 'string' then error(('[errors/internal]: Bad argument #2, expected sequence of strings, but got %s (%s) on index %i'):format(tostring(list_of_other_str[i]), type(list_of_other_str[i]), i)) end
    end

    -- Do work
    local possible = {}

    -- Calculate similarity metrics
    for index, other_str in ipairs(list_of_other_str) do
        local total_sim = 0
        for _, similarity_func in ipairs(SIMILARITY_METRICS) do
            local sim, max_sim, min_sim = similarity_func(str, other_str)
            -- A metric whose best and worst score coincide (this happens when
            -- one of the strings is empty) cannot tell the strings apart, so
            -- it contributes nothing instead of dividing by zero.
            if max_sim ~= min_sim then
                local normalized = (sim-min_sim)/(max_sim-min_sim)
                -- Skip NaN, it would make the comparison below inconsistent.
                if normalized == normalized then
                    total_sim = total_sim + normalized
                end
            end
        end
        possible[#possible+1] = {other_str, total_sim, index}
    end

    -- Sort and flatten. `table.sort` is not stable, so ties are broken on the
    -- original index to keep the result deterministic.
    table.sort(possible, function(a, b)
        if a[2] ~= b[2] then return a[2] > b[2] end
        return a[3] < b[3]
    end)
    for i = 1, #possible do possible[i] = possible[i][1] end

    -- Return the sorted list
    return possible
end
|
|
|
|
--------------------------------------------------------------------------------
|
|
|
|
-- Public interface of the module: the three similarity metrics, and the
-- utility for ranking strings with them.
local exports = {}
exports.levenshtein = levenshtein
exports.longest_common_subsequence = longest_common_subsequence
exports.jaccard_similarity_of_words = jaccard_similarity_of_words
exports.strings_with_highest_similarity = strings_with_highest_similarity
return exports
|