-- 2017-06-09 13:22:25 +00:00
---- String Distance.lua ----
-- A submodule of the `errors` library, with various string distance functions.
-- Utilities for using these distance functions are also present.
--
-- Each distance function returns 3 values:
-- * How similar the strings were.
-- * The value of maximum similarity.
-- * The value of least similarity.
-- By using these values, it's possible to normalize.
--
-- The `levenshtein` function is based on: https://gist.github.com/james2doyle/e406180e143da3bdd102
--------------------------------------------------------------------------------
-- Utility functions
local function split_string_into_words(str)
  -- split_string_into_words(str)
  --
  -- Splits `str` into a sequence of lowercase words. A word is an optional
  -- ASCII uppercase letter followed by any number of ASCII lowercase letters,
  -- so 'helloWorld' becomes {'hello', 'world'} and every other character
  -- (digits, punctuation, whitespace) acts as a separator.
  --
  -- Raises an error when `str` is not a string.
  --
  -- TODO: Add unicode support
  if type(str) ~= 'string' then
    error(('[errors/internal]: Bad argument #1 to split_string_into_words, expected string, got %s (%s)'):format(tostring(str), type(str)))
  end

  local words = {}
  -- The pattern must not contain literal spaces: with them only words that
  -- are surrounded by spaces on both sides would match, spaces included.
  for word in str:gmatch('[A-Z]?[a-z]*') do
    -- The pattern can match the empty string; those matches are not words.
    if #word > 0 then words[#words + 1] = word:lower() end
  end
  return words
end
--------------------------------------------------------------------------------
-- Similarity metrics
local function levenshtein(str1, str2)
  -- levenshtein(str1, str2)
  --
  -- Calculates the amount of 'inserts', 'removals' or 'substitutions'
  -- required to transform `str1` into `str2`, and vice versa.
  -- Note that the strings are automatically converted to lowercase, and that
  -- the comparison is done byte by byte.
  --
  -- Lower numbers denote more similar strings.
  --
  -- Returns 3 values:
  -- * The edit distance.
  -- * The value of maximum similarity: the difference in length, which is the
  --   smallest distance two strings of these lengths can have.
  -- * The value of least similarity: the length of the longest string.
  --
  -- Raises an error when either argument is not a string.
  --
  -- Adapted from the C version given at: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance

  -- Error handling. `tostring` keeps `%s` working for nil on Lua 5.1.
  if type(str1) ~= 'string' then
    error(('[errors/internal]: Bad argument #1 to levenshtein, expected string, got %s (%s)'):format(tostring(str1), type(str1)))
  end
  if type(str2) ~= 'string' then
    error(('[errors/internal]: Bad argument #2 to levenshtein, expected string, got %s (%s)'):format(tostring(str2), type(str2)))
  end

  -- Do work
  str1, str2 = str1:lower(), str2:lower()
  local len1, len2 = #str1, #str2
  local best, worst = math.abs(len1 - len2), math.max(len1, len2)

  -- Quick cut-offs to save time
  if len1 == 0 then
    return len2, best, worst
  elseif len2 == 0 then
    return len1, best, worst
  elseif str1 == str2 then
    return 0, best, worst
  end

  -- Init column: the distance from each prefix of `str1` to the empty string.
  local column = {}
  for y = 1, len1 do column[y] = y end

  -- Algorithm: single-column dynamic programming, one pass per byte of `str2`.
  for x = 1, len2 do
    column[0] = x
    local lastdiag = x - 1
    local byte2 = str2:byte(x)
    for y = 1, len1 do
      local olddiag = column[y]
      -- Lua strings are 1-indexed, so position y/x is byte y/x (the C
      -- original used y - 1 / x - 1 on 0-indexed arrays).
      local cost = (str1:byte(y) == byte2) and 0 or 1
      column[y] = math.min(column[y] + 1, column[y - 1] + 1, lastdiag + cost)
      lastdiag = olddiag
    end
  end

  -- Return the last value - this is the Levenshtein distance
  return column[len1], best, worst
end
local function longest_common_subsequence(str1, str2)
  -- longest_common_subsequence(str1, str2)
  --
  -- Calculates the length of the longest common subsequence of the two input
  -- strings. That is the maximum amount of characters that appear in the
  -- same order in both strings, with other characters possibly in between.
  -- This is not the same as longest common substring.
  -- Note that the strings are automatically converted to lowercase, and that
  -- the comparison is done byte by byte.
  --
  -- Higher numbers denote more similar strings.
  --
  -- Returns 3 values:
  -- * The length of the longest common subsequence.
  -- * The value of maximum similarity: the length of the longest string.
  -- * The value of least similarity: always 0.
  --
  -- Raises an error when either argument is not a string.

  -- Error handling. `tostring` keeps `%s` working for nil on Lua 5.1.
  if type(str1) ~= 'string' then
    error(('[errors/internal]: Bad argument #1 to longest_common_subsequence, expected string, got %s (%s)'):format(tostring(str1), type(str1)))
  end
  if type(str2) ~= 'string' then
    error(('[errors/internal]: Bad argument #2 to longest_common_subsequence, expected string, got %s (%s)'):format(tostring(str2), type(str2)))
  end

  -- Do work
  str1, str2 = str1:lower(), str2:lower()
  local len1, len2 = #str1, #str2

  -- Quick cut-offs to save time
  if str1 == str2 then
    return len1, len1, 0
  elseif len1 == 0 or len2 == 0 then
    return 0, math.max(len1, len2), 0
  end

  -- Only the previous row of the table is ever read, so two rows are kept
  -- instead of the full len1 x len2 matrix.
  local prev = {}
  for j = 0, len2 do prev[j] = 0 end

  -- Fill up the table row by row
  for i = 1, len1 do
    local cur = { [0] = 0 }
    local byte1 = str1:byte(i)
    for j = 1, len2 do
      if byte1 == str2:byte(j) then
        cur[j] = prev[j - 1] + 1
      else
        cur[j] = math.max(cur[j - 1], prev[j])
      end
    end
    prev = cur
  end

  -- Return
  return prev[len2], math.max(len1, len2), 0
end
local function jaccard_similarity_of_words(str1, str2)
  -- jaccard_similarity_of_words(str1, str2)
  --
  -- Calculates the jaccard similarity of the words in the strings: the number
  -- of distinct words found in both strings, divided by the number of
  -- distinct words found in either. Words are extracted with
  -- `split_string_into_words`.
  --
  -- Higher numbers denote more similar strings. At 1 the strings contain
  -- exactly the same words.
  --
  -- Returns 3 values:
  -- * The similarity, between 0 and 1.
  -- * The value of maximum similarity: always 1.
  -- * The value of least similarity: always 0.
  --
  -- Raises an error when either argument is not a string.

  -- Error handling.
  if type(str1) ~= 'string' then
    error(('[errors/internal]: Bad argument #1 to jaccard_similarity_of_words, expected string, got %s (%s)'):format(tostring(str1), type(str1)))
  end
  if type(str2) ~= 'string' then
    error(('[errors/internal]: Bad argument #2 to jaccard_similarity_of_words, expected string, got %s (%s)'):format(tostring(str2), type(str2)))
  end

  -- Quick cut-offs to save time. Two empty strings are equal, so that case
  -- is covered by the first branch.
  if str1:lower() == str2:lower() then
    return 1, 1, 0
  elseif str1 == '' or str2 == '' then
    return 0, 1, 0
  end

  -- Collect the distinct words of each string, and of both combined.
  local words1, words2, all = {}, {}, {}
  for _, word in ipairs(split_string_into_words(str1)) do
    words1[word], all[word] = true, true
  end
  for _, word in ipairs(split_string_into_words(str2)) do
    words2[word], all[word] = true, true
  end

  -- Which words are in common?
  local num_in_common, num_words_in_total = 0, 0
  for word in pairs(all) do
    num_words_in_total = num_words_in_total + 1
    if words1[word] and words2[word] then num_in_common = num_in_common + 1 end
  end

  -- Neither string contained a word (e.g. only digits or punctuation). Both
  -- word sets are empty and therefore identical; returning here also avoids
  -- dividing zero by zero.
  if num_words_in_total == 0 then
    return 1, 1, 0
  end

  -- Return similarity
  return num_in_common / num_words_in_total, 1, 0
end
-- The similarity metrics that `strings_with_highest_similarity` iterates over.
-- Each entry is called with two strings, and its three return values are used
-- to normalize the result before the metrics are summed.
local SIMILARITY_METRICS = {
levenshtein ,
longest_common_subsequence ,
jaccard_similarity_of_words ,
}
--------------------------------------------------------------------------------
local function strings_with_highest_similarity(str, list_of_other_str)
  -- strings_with_highest_similarity(str, list)
  --
  -- Returns a new list containing the elements of `list` (converted with
  -- `tostring`), sorted by how similar they are to `str`, in descending
  -- order, eg. the first element in the output list is the most similar.
  -- Elements with equal similarity keep their relative input order.
  --
  -- The similarity is the sum of every metric in `SIMILARITY_METRICS`, each
  -- normalized using the maximum and least similarity values it returns.
  --
  -- The input list is not modified.
  --
  -- Raises an error when `str` is not a string, or `list` is not a table.

  -- Error checking. `tostring` keeps `%s` working for non-strings on Lua 5.1.
  if type(str) ~= 'string' then
    error(('[errors/internal]: Bad argument #1, expected string, got %s (%s)'):format(tostring(str), type(str)))
  end
  if type(list_of_other_str) ~= 'table' then
    error(('[errors/internal]: Bad argument #2, expected table, got %s (%s)'):format(tostring(list_of_other_str), type(list_of_other_str)))
  end

  -- Calculate similarity metrics
  local possible = {}
  for i = 1, #list_of_other_str do
    -- Convert into a local instead of writing back into the caller's table.
    local other_str = tostring(list_of_other_str[i])
    local total_sim = 0
    for _, similarity_func in ipairs(SIMILARITY_METRICS) do
      local sim, max_sim, min_sim = similarity_func(str, other_str)
      local range = max_sim - min_sim
      -- A metric whose best and worst values coincide (e.g. when one of the
      -- strings is empty) cannot tell candidates apart. Skipping it avoids a
      -- 0/0 division, whose NaN result would break the sort below.
      if range ~= 0 then
        total_sim = total_sim + (sim - min_sim) / range
      end
    end
    possible[i] = { other_str, total_sim, i }
  end

  -- Sort by score, falling back to input order so ties are deterministic
  -- (`table.sort` is not stable).
  table.sort(possible, function(a, b)
    if a[2] ~= b[2] then return a[2] > b[2] end
    return a[3] < b[3]
  end)

  -- Flatten
  for i = 1, #possible do possible[i] = possible[i][1] end

  -- Return the sorted list
  return possible
end
--------------------------------------------------------------------------------
-- Public interface of the module: the three similarity metrics, and the
-- ranking utility built on top of them.
local exports = {}

exports.levenshtein = levenshtein
exports.longest_common_subsequence = longest_common_subsequence
exports.jaccard_similarity_of_words = jaccard_similarity_of_words
exports.strings_with_highest_similarity = strings_with_highest_similarity

return exports
-- 2018-07-21 12:15:21 +00:00