-- Heuristic guess at how a byte string is encoded (ascii / utf8 / utf16 / binary).
-- The module returns a single function; see the bottom of the file.

-- Calls the value returned by require 'fun'.
-- NOTE(review): presumably the functional library's idiom for exporting its
-- functions as globals; kept so load-time side effects are unchanged, although
-- nothing below relies on those globals any more.
require 'fun' ()
-- NOTE(review): must be a 'utf8' module that provides `iterate`; confirm which
-- library this resolves to in the deployment environment.
local utf8 = require 'utf8'

-- One printable ASCII byte (space through '~'), or tab, line feed, carriage return.
local ASCII_CHAR_PATTERN = '[\32-\126\009\010\013]'
-- A byte in 1-127 or 192-255 followed by any number of bytes in 128-191.
-- Not referenced anywhere else in this file.
local UNICODE_CHAR_PATTERN = '[\01-\127\192-\255][\128-\191]*'

--- Collect every maximal run of legible ASCII bytes in `str`.
-- Iterates string.gmatch directly: a capture-less gmatch yields exactly one
-- value per step, which generic generator-consuming helpers may interpret as
-- iteration state rather than as the element.
-- @param str string to scan
-- @return array of the matched runs, in order of appearance
local function legible_subsequences_in (str)
    local sub_seqs = {}
    for sub_seq in str:gmatch(ASCII_CHAR_PATTERN..'+') do
        sub_seqs[#sub_seqs + 1] = sub_seq
    end
    return sub_seqs
end

--- Score how much `str` looks like plain ASCII text.
-- The score is the mean of two ratios: (longest legible run / all legible
-- bytes) and (all legible bytes / total bytes).
-- @param str string to score
-- @return number in [0, 1]; 0 when `str` is empty or has no legible bytes
local function probability_of_ascii_string (str)
    local nr_characters = #str
    local nr_legible_characters, len_of_longest_subseq = 0, 0
    for _, sub_seq in ipairs(legible_subsequences_in(str)) do
        nr_legible_characters = nr_legible_characters + #sub_seq
        len_of_longest_subseq = math.max(len_of_longest_subseq, #sub_seq)
    end
    -- Both ratios would be 0/0 (NaN) here; this also covers the empty string.
    if nr_legible_characters == 0 then
        return 0
    end
    return ((len_of_longest_subseq / nr_legible_characters)
          + (nr_legible_characters / nr_characters)) / 2
end

--- Score how much `str` looks like UTF-8: the fraction of its bytes that
-- belong to characters which utf8.iterate flags as valid.
-- @param str string to score
-- @return number; 0 for the empty string (avoids 0/0)
local function probability_of_utf8_string (str)
    if #str == 0 then
        return 0
    end
    local valid_bytes = 0
    for char, valid in utf8.iterate(str) do
        if valid then
            valid_bytes = valid_bytes + #char
        end
    end
    return valid_bytes / #str
end

--- UTF-16 detection is not implemented; always scores 0.
local function probability_of_utf16_string (str)
    return 0
end

--- Constant score for "arbitrary binary data"; a representation must score
-- strictly above 2/3 to be preferred over it (given the priority order below).
local function probability_of_binary_data (str)
    return 2/3
end

local str_representations = {
    ascii  = probability_of_ascii_string,
    utf8   = probability_of_utf8_string,
    utf16  = probability_of_utf16_string,
    binary = probability_of_binary_data,
}

-- Evaluation order, which doubles as tie-break priority: earlier names win
-- ties. Iterating str_representations with pairs() would make ties (e.g. pure
-- ASCII text scoring 1 as both ascii and utf8) resolve in an unspecified order.
local REPRESENTATION_PRIORITY = { 'ascii', 'utf8', 'utf16', 'binary' }

--- Score `str` against every known representation.
-- @param str string to classify
-- @return table with one `<name>_prob` number per representation
--   (ascii_prob, utf8_prob, utf16_prob, binary_prob) and `most_likely`, the
--   name of the highest-scoring representation
return function (str)
    local str_info, most_likely, most_likely_prob = {}, 'ascii', 0
    for _, repr_name in ipairs(REPRESENTATION_PRIORITY) do
        local prob = str_representations[repr_name](str)
        str_info[repr_name..'_prob'] = prob
        -- Strict comparison so that an earlier (higher-priority) name keeps a tie.
        if prob > most_likely_prob then
            most_likely, most_likely_prob = repr_name, prob
        end
    end
    str_info.most_likely = most_likely
    return str_info
end