From a75a33b85fa2106c5702198921a041ab1d3c1ba3 Mon Sep 17 00:00:00 2001
From: Jon Michael Aanes
Date: Fri, 9 Jun 2017 15:22:25 +0200
Subject: [PATCH] Initial implementation of errors library.

---
 errors.lua                    |  82 ++++++++++++
 init.lua                      |   2 +
 string_distance.lua           | 234 ++++++++++++++++++++++++++++++++++
 test/test_errors.lua          |  71 +++++++++++
 test/test_string_distance.lua | 136 ++++++++++++++++++++
 test/tests.lua                |   7 +
 6 files changed, 532 insertions(+)
 create mode 100644 errors.lua
 create mode 100644 init.lua
 create mode 100644 string_distance.lua
 create mode 100644 test/test_errors.lua
 create mode 100644 test/test_string_distance.lua
 create mode 100644 test/tests.lua

diff --git a/errors.lua b/errors.lua
new file mode 100644
index 0000000..4498b88
--- /dev/null
+++ b/errors.lua
@@ -0,0 +1,82 @@
+
+local string_dist = require 'string_distance'
+
+assert(debug and debug.getinfo)
+
+--------------------------------------------------------------------------------
+-- Error handler
+
+local ErrorHandler = setmetatable({}, {__call = function (c, ...) return c.new(...) end})
+    ErrorHandler.__index = ErrorHandler
+
+function ErrorHandler.new (module_name)
+    -- ErrorHandler.new(module_name)
+    --
+    -- Creates a new error handler. `module_name` is the prefix placed in every
+    -- error message produced by this handler's error functions.
+
+    return setmetatable({module_name = module_name, registered = {}}, ErrorHandler)
+end
+
+function ErrorHandler:register (f)
+    -- ErrorHandler:register(f)
+    --
+    -- Marks the function `f` as belonging to the module. 'external' error
+    -- functions skip registered functions when picking the stack level to
+    -- blame.
+
+    assert(type(self) == 'table')
+    assert(type(f) == 'function')
+    self.registered[f] = true
+end
+
+local ERROR_MODES = { ['internal'] = true, ['external'] = true }
+
+function ErrorHandler:getErrorFunc (mode, name)
+    -- ErrorHandler:getErrorFunc(mode, name)
+    --
+    -- Returns a function `f(format_msg, ...)`, which raises an error whose
+    -- message is '[<module_name>/<name>]: ' followed by the formatted message.
+    --
+    -- `mode` is 'internal' (the default) or 'external':
+    --  * 'internal' blames the function that called `f`.
+    --  * 'external' blames the first function up the call stack that has not
+    --    been registered with `ErrorHandler:register`.
+    -- `name` defaults to the mode for 'internal', and is left out of the
+    -- message (together with the slash) for 'external'.
+
+    -- Parameter fixing
+    mode = mode or 'internal'
+    name = name or (mode == 'external' and '') or mode
+    if name ~= '' then name = '/'..name end
+
+    -- Error checking
+    assert(ERROR_MODES[mode])
+    assert(type(name) == 'string')
+
+    -- Create error funcs.
+    if mode == 'internal' then
+        return function (format_msg, ...)
+            error(('[%s%s]: '..format_msg):format(self.module_name, name, ...), 2)
+        end
+    end
+
+    -- Only 'external' is left, as the mode was validated above.
+    return function (format_msg, ...)
+        local level = 2
+        while true do
+            -- `debug.getinfo` returns nil past the top of the call stack,
+            -- which happens when every caller is registered.
+            local info = debug.getinfo(level, 'f')
+            if not (info and self.registered[info.func]) then break end
+            level = level + 1
+        end
+        error(('[%s%s]: '..format_msg):format(self.module_name, name, ...), level)
+    end
+end
+
+--------------------------------------------------------------------------------
+
+return {
+    ErrorHandler = ErrorHandler
+}
diff --git a/init.lua b/init.lua
new file mode 100644
index 0000000..57d1a64
--- /dev/null
+++ b/init.lua
@@ -0,0 +1,2 @@
+
+return require 'errors.errors'
diff --git a/string_distance.lua b/string_distance.lua
new file mode 100644
index 0000000..3091744
--- /dev/null
+++ b/string_distance.lua
@@ -0,0 +1,234 @@
+---- String Distance.lua ----
+-- A submodule of the `errors` library, with various string distance functions.
+-- Utilities for using these distance functions are also present.
+--
+-- Each distance function returns 3 values:
+--  * How similar the strings were.
+--  * The value of maximum similarity.
+--  * The value of least similarity.
+-- By using these values, it's possible to normalize.
+--
+-- The `levenshtein` function is based on: https://gist.github.com/james2doyle/e406180e143da3bdd102
+
+--------------------------------------------------------------------------------
+-- Utility functions
+
+
+local function split_string_into_words (str)
+    -- split_string_into_words(str)
+    --
+    -- Returns a list of the lowercased words in `str`. A word is an optional
+    -- uppercase ASCII letter followed by lowercase ASCII letters, so
+    -- 'helloWorld' is split into {'hello', 'world'}. All other characters
+    -- only separate words.
+    --
+    -- TODO: Add unicode support
+
+    assert(str)
+
+    local words = {}
+    for word in str:gmatch('[A-Z]?[a-z]*') do
+        if #word > 0 then words[#words+1] = word:lower() end
+    end
+    return words
+end
+
+--------------------------------------------------------------------------------
+-- Similarity metrics
+
+local function levenshtein (str1, str2)
+    -- levenshtein(str1, str2)
+    --
+    -- Calculates the amount of 'inserts', 'removals' or 'substitutions'
+    -- required to transform `str1` into `str2`, and vice versa.
+    -- Note that the strings are automatically converted to lowercase.
+    --
+    -- Lower numbers denote more similar strings.
+    --
+    -- Returns the distance, the lowest distance possible for strings of these
+    -- lengths, and the highest distance possible for strings of these lengths.
+    --
+    -- Adapted from the C version given at: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance
+
+    -- Error handling.
+    assert(type(str1) == 'string')
+    assert(type(str2) == 'string')
+
+    -- Do work
+    str1, str2 = str1:lower(), str2:lower()
+    local len1, len2 = #str1, #str2
+    local best, worst = math.abs(len1 - len2), math.max(len1, len2)
+
+    -- Quick cut-offs to save time
+    if len1 == 0 then
+        return len2, best, worst
+    elseif len2 == 0 then
+        return len1, best, worst
+    elseif str1 == str2 then
+        return 0, best, worst
+    end
+
+    -- Init column
+    local column = {}
+    for y = 1, len1 do column[y] = y end
+
+    -- Algorithm
+    for x = 1, len2 do
+        column[0] = x
+        local lastdiag, olddiag = x - 1, nil
+        local byte2 = str2:byte(x)
+        for y = 1, len1 do
+            olddiag = column[y]
+            -- Lua strings are 1-indexed, so the characters belonging to
+            -- this cell are at `y` and `x`, not `y-1` and `x-1` as in the
+            -- 0-indexed C version.
+            column[y] = math.min(column[y] + 1, column[y-1] + 1, lastdiag + (str1:byte(y) == byte2 and 0 or 1))
+            lastdiag = olddiag
+        end
+    end
+
+    -- Return the last value - this is the Levenshtein distance
+    return column[len1], best, worst
+end
+
+local function longest_common_subsequence (str1, str2)
+    -- longest_common_subsequence(str1, str2)
+    --
+    -- Calculates the length of the longest common subsequence of the two input
+    -- strings. That is the maximum amount of characters that appear in the
+    -- same order in both strings, with possible characters in between. This
+    -- is not the same as longest common substring.
+    -- Note that the strings are automatically converted to lowercase.
+    --
+    -- Higher numbers denote more similar strings.
+
+    -- Error handling.
+    assert(type(str1) == 'string')
+    assert(type(str2) == 'string')
+
+    -- Do work
+    str1, str2 = str1:lower(), str2:lower()
+    local len1, len2 = #str1, #str2
+
+    -- Quick cut-offs to save time
+    if str1 == str2 then
+        return len1, len1, 0
+    elseif len1 == 0 or len2 == 0 then
+        return 0, math.max(len1, len2), 0
+    end
+
+    -- Init matrix: row 0 and column 0 are the empty-prefix cases.
+    local matrix = {}
+    for i = 0, len1 do matrix[i] = {[0] = 0} end
+    for j = 0, len2 do matrix[0][j] = 0 end
+
+    -- Fill up table
+    for i = 1, len1 do
+        for j = 1, len2 do
+            matrix[i][j] = (str1:byte(i) == str2:byte(j)) and (matrix[i-1][j-1] + 1) or math.max(matrix[i][j-1], matrix[i-1][j])
+        end
+    end
+
+    -- Return
+    return matrix[len1][len2], math.max(len1, len2), 0
+end
+
+local function jaccard_similarity_of_words (str1, str2)
+    -- jaccard_similarity_of_words(str1, str2)
+    --
+    -- Calculates the jaccard similarity of the words in the strings.
+    --
+    -- Higher numbers denote more similar strings. At 1 the strings contain
+    -- exactly the same words.
+
+    -- Error handling.
+    assert(type(str1) == 'string')
+    assert(type(str2) == 'string')
+
+    -- Quick cut-offs to save time
+    if str1:lower() == str2:lower() or str1 == '' and str2 == '' then
+        return 1, 1, 0
+    elseif str1 == '' or str2 == '' then
+        return 0, 1, 0
+    end
+
+    -- Work work
+    local words1, words2, all = {}, {}, {}
+    for _, word in ipairs(split_string_into_words(str1)) do
+        words1[word], all[word] = true, true
+    end
+    for _, word in ipairs(split_string_into_words(str2)) do
+        words2[word], all[word] = true, true
+    end
+
+    -- Which words are in common?
+    local num_in_common, num_words_in_total = 0, 0
+    for word, _ in pairs(all) do
+        num_words_in_total = num_words_in_total + 1
+        if words1[word] and words2[word] then num_in_common = num_in_common + 1 end
+    end
+
+    -- Neither string contains any words, e.g. '123' and '456'. Dividing would
+    -- give 0/0, which is NaN, so report no similarity instead.
+    if num_words_in_total == 0 then
+        return 0, 1, 0
+    end
+
+    -- Return similarity
+    return num_in_common/num_words_in_total, 1, 0
+end
+
+local SIMILARITY_METRICS = {
+    levenshtein,
+    longest_common_subsequence,
+    jaccard_similarity_of_words,
+}
+
+--------------------------------------------------------------------------------
+
+local function strings_with_highest_similarity (str, list_of_other_str)
+    -- strings_with_highest_similarity(str, list)
+    --
+    -- Returns a new list with the strings of `list`, sorted by how similar
+    -- they are to `str`, in descending order, i.e. the first element of the
+    -- output list is the most similar.
+
+    -- Error checking
+    assert(type(str) == 'string')
+    assert(type(list_of_other_str) == 'table')
+    for i = 1, #list_of_other_str do assert(type(list_of_other_str[i]) == 'string') end
+
+    -- Do work
+    local possible = {}
+
+    -- Calculate similarity metrics
+    for _, other_str in ipairs(list_of_other_str) do
+        local total_sim = 0
+        for _, similarity_func in ipairs(SIMILARITY_METRICS) do
+            local sim, max_sim, min_sim = similarity_func(str, other_str)
+            -- A metric whose best and worst values coincide cannot tell
+            -- strings apart (this happens for empty strings), so it is
+            -- skipped instead of dividing by zero.
+            if max_sim ~= min_sim then
+                total_sim = total_sim + (sim-min_sim)/(max_sim-min_sim)
+            end
+        end
+        possible[#possible+1] = {other_str, total_sim}
+    end
+
+    -- Sort and flatten
+    table.sort(possible, function(a, b) return a[2] > b[2] end)
+    for i = 1, #possible do possible[i] = possible[i][1] end
+
+    -- Return the sorted list
+    return possible
+end
+
+--------------------------------------------------------------------------------
+
+return {
+    levenshtein = levenshtein,
+    longest_common_subsequence = longest_common_subsequence,
+    jaccard_similarity_of_words = jaccard_similarity_of_words,
+    strings_with_highest_similarity = strings_with_highest_similarity,
+}
diff --git a/test/test_errors.lua b/test/test_errors.lua
new file mode 100644
index 0000000..c42a5c0
--- /dev/null
+++ b/test/test_errors.lua
@@ -0,0 +1,71 @@
+
+local error_handler = require('errors').ErrorHandler 'errors_test'
+
+local SUITE = require('TestSuite').new('errors')
+SUITE:setEnviroment {
+    error_handler = error_handler,
+    external_error = error_handler:getErrorFunc 'external',
+    internal_error = error_handler:getErrorFunc 'internal',
+}
+
+--------------------------------------------------------------------------------
+-- Basic errors functionality
+
+SUITE:addTest('errors basic functionallity', function()
+    local func_a = function () external_error 'Hello World' end
+    local func_b, func_b_line = function () func_a() end, curline()
+    error_handler:register(func_a)
+
+    local status, error_msg = pcall(func_b)
+    assert_equal(false, status)
+    local expected_error = './test/test_errors.lua:'..func_b_line..': [errors_test]: Hello World'
+    assert_equal(expected_error, error_msg)
+end)
+
+SUITE:addTest('errors deep nesting works', function()
+    local func_a = function () external_error 'Hello World' end
+    local func_b, func_b_line = function () func_a() end, curline()
+    local func_c, func_c_line = function () func_b() end, curline()
+    error_handler:register(func_a)
+    error_handler:register(func_b)
+
+    local status, error_msg = pcall(func_c)
+    assert_equal(false, status)
+    local expected_error = './test/test_errors.lua:'..func_c_line..': [errors_test]: Hello World'
+    assert_equal(expected_error, error_msg)
+end)
+
+SUITE:addTest('internal errors also works', function()
+    local func_a, func_a_line = function () internal_error 'Hello World' end, curline()
+    local func_b, func_b_line = function () func_a() end, curline()
+    local func_c, func_c_line = function () func_b() end, curline()
+    error_handler:register(func_a)
+    error_handler:register(func_b)
+
+    local status, error_msg = pcall(func_c)
+    assert_equal(false, status)
+    local expected_error = './test/test_errors.lua:'..func_a_line..': [errors_test/internal]: Hello World'
+    assert_equal(expected_error, error_msg)
+end)
+
+SUITE:addTest('errors modes can be overwritten', function()
+    local extra_error = error_handler:getErrorFunc('external', 'extra')
+    local func, func_line = function () extra_error 'Hi' end, curline()
+
+    local status, error_msg = pcall(func)
+    local expected_error = './test/test_errors.lua:'..func_line..': [errors_test/extra]: Hi'
+    assert_equal(expected_error, error_msg)
+end)
+
+SUITE:addTest('errors modes can be overwritten, including internal', function()
+    local extra_error = error_handler:getErrorFunc('internal', 'extra')
+    local func, func_line = function () extra_error 'Hi' end, curline()
+
+    local status, error_msg = pcall(func)
+    local expected_error = './test/test_errors.lua:'..func_line..': [errors_test/extra]: Hi'
+    assert_equal(expected_error, error_msg)
+end)
+
+--------------------------------------------------------------------------------
+
+return SUITE
diff --git a/test/test_string_distance.lua b/test/test_string_distance.lua
new file mode 100644
index 0000000..0a1dfe9
--- /dev/null
+++ b/test/test_string_distance.lua
@@ -0,0 +1,136 @@
+
+local SUITE = require('TestSuite').new('string_distance')
+SUITE:setEnviroment {
+    levenshtein = require('string_distance').levenshtein,
+    longest_common_subsequence = require('string_distance').longest_common_subsequence,
+    jaccard_similarity_of_words = require('string_distance').jaccard_similarity_of_words,
+    strings_with_highest_similarity = require('string_distance').strings_with_highest_similarity,
+}
+
+--------------------------------------------------------------------------------
+-- Levenshtein
+
+SUITE:addTest('levenshtein example 1', function()
+    -- From: https://en.wikipedia.org/wiki/Levenshtein_distance
+    assert_equal( 3, levenshtein('kitten', 'sitting') )
+end)
+
+SUITE:addTest('levenshtein example 2', function()
+    -- From: https://people.cs.pitt.edu/~kirk/cs1501/Pruhs/Spring2006/assignments/editdistance/Levenshtein%20Distance.htm
+    assert_equal( 2, levenshtein('gumbo', 'gambol') )
+end)
+
+SUITE:addTest('levenshtein example 3', function()
+    -- From: https://secweb.cs.odu.edu/~zeil/cs361/web/website/Lectures/styles/pages/editdistance.html
+    assert_equal( 1, levenshtein('hello', 'jello') )
+end)
+
+SUITE:addTest('levenshtein example 4', function()
+    -- From: https://secweb.cs.odu.edu/~zeil/cs361/web/website/Lectures/styles/pages/editdistance.html
+    assert_equal( 3, levenshtein('good', 'goodbye') )
+end)
+
+SUITE:addTest('levenshtein identical strings have distance 0', function()
+    for _, word in pairs {'hello', 'kitten', 'sitting', 'jello', 'good'} do
+        assert_equal( 0, levenshtein(word, word) )
+    end
+end)
+
+SUITE:addTest('levenshtein normalizes to lowercase', function()
+    assert_equal( 3, levenshtein('kItten', 'sitTiNg') )
+end)
+
+SUITE:addTest('levenshtein compares every character', function()
+    assert_equal( 1, levenshtein('a', 'b') )
+    assert_equal( 1, levenshtein('cat', 'car') )
+end)
+
+SUITE:addTest('levenshtein not defined for non-strings', function()
+    bad_call( levenshtein, 5, 'hi' )
+    bad_call( levenshtein, 'derp', {} )
+end)
+
+--------------------------------------------------------------------------------
+-- Longest common subsequence
+
+SUITE:addTest('subsequence example 1', function()
+    -- From: http://www.geeksforgeeks.org/dynamic-programming-set-4-longest-common-subsequence/
+    assert_equal( 4, longest_common_subsequence('AGGTAB', 'GXTXAYB') )
+end)
+
+SUITE:addTest('subsequence example 2', function()
+    -- From: http://www.cs.cmu.edu/afs/cs/academic/class/15451-s15/LectureNotes/lecture04.pdf
+    assert_equal( 4, longest_common_subsequence('ABAZDC', 'BACBAD') )
+end)
+
+SUITE:addTest('subsequence normalizes to lowercase', function()
+    assert_equal( 4, longest_common_subsequence('AGGtAB', 'GXTXAYb') )
+end)
+
+SUITE:addTest('subsequence not defined for non-strings', function()
+    bad_call( longest_common_subsequence, 5, 'hi' )
+    bad_call( longest_common_subsequence, 'derp', {} )
+end)
+
+--------------------------------------------------------------------------------
+-- Jaccard Similarity
+
+SUITE:addTest('jaccard example 1', function()
+    local input_1 = 'hello world'
+    local input_2 = 'hello planet'
+    assert_equal( 1/3, jaccard_similarity_of_words(input_1, input_2) )
+end)
+
+SUITE:addTest('jaccard identical strings', function()
+    local input_1 = 'hello world'
+    local input_2 = 'hello world'
+    assert_equal( 1, jaccard_similarity_of_words(input_1, input_2) )
+end)
+
+SUITE:addTest('jaccard identical words', function()
+    local input_1 = 'hello world'
+    local input_2 = 'world hello'
+    assert_equal( 1, jaccard_similarity_of_words(input_1, input_2) )
+end)
+
+SUITE:addTest('jaccard CamelCase works', function()
+    local input_1 = 'HelloWorld'
+    local input_2 = 'hello world'
+    assert_equal( 1, jaccard_similarity_of_words(input_1, input_2) )
+end)
+
+SUITE:addTest('jaccard snake_case works', function()
+    local input_1 = 'hello_world'
+    local input_2 = 'worldHello'
+    assert_equal( 1, jaccard_similarity_of_words(input_1, input_2) )
+end)
+
+SUITE:addTest('jaccard singlewords', function()
+    local input_1 = 'hello'
+    local input_2 = 'world'
+    assert_equal( 0, jaccard_similarity_of_words(input_1, input_2) )
+end)
+
+SUITE:addTest('jaccard strings without words', function()
+    assert_equal( 0, jaccard_similarity_of_words('123', '456') )
+end)
+
+--------------------------------------------------------------------------------
+-- strings_with_highest_similarity
+
+SUITE:addTest('strings_with_highest_similarity example 1', function()
+    local strings = { 'Ada Lovelace', 'Charles Babbage ', 'Allan Turing', 'Grace Hopper' }
+    local output = strings_with_highest_similarity('turning', strings)
+    assert_equal( 'Allan Turing', output[1] )
+end)
+
+SUITE:addTest('strings_with_highest_similarity handles empty strings', function()
+    local output = strings_with_highest_similarity('', { 'hello', '' })
+    assert_equal( '', output[1] )
+    assert_equal( 'hello', output[2] )
+end)
+
+--------------------------------------------------------------------------------
+
+
+return SUITE
diff --git a/test/tests.lua b/test/tests.lua
new file mode 100644
index 0000000..d309267
--- /dev/null
+++ b/test/tests.lua
@@ -0,0 +1,7 @@
+
+package.path = package.path .. ';./test/?.lua;./src/?.lua'
+
+local TEST_SUITE = require("TestSuite").new('errors')
+    TEST_SUITE:addModules('test/test_*')
+    TEST_SUITE:setOptions(...)
+    TEST_SUITE:runTests()