--------------------------------------------------------------------------------
-- Heuristic guessing of how a byte string is encoded.
--
-- The module returns a single function which scores a string against every
-- known representation and reports the most likely one.

-- Printable ASCII plus tab, line feed and carriage return.
local ASCII_CHAR_PATTERN   = '[\32-\126\009\010\013]'
-- One non-NUL, non-continuation byte followed by any number of bytes in the
-- continuation range 128-191.
local UNICODE_CHAR_PATTERN = '[\01-\127\192-\255][\128-\191]*'

--------------------------------------------------------------------------------
-- Unicode stuff

--- Iterate over the substrings of `str` that are shaped like UTF-8 characters.
-- @tparam string str
-- @treturn function iterator yielding one matched substring per step
local function iterate_utf8_chars (str)
    -- TODO: Detect invalid codepoints.
    return str:gmatch(UNICODE_CHAR_PATTERN)
end

--------------------------------------------------------------------------------

--- Score how much `str` looks like plain ASCII text.
-- @tparam string str
-- @treturn number score in the range [0, 1]; 1 for the empty string
local function probability_of_ascii_string (str)
    assert(type(str) == 'string')

    -- An empty string contains no non-ascii bytes. Answering directly also
    -- avoids the 0/0 (NaN) the formula below would produce.
    if #str == 0 then  return 1  end

    -- Find ascii subsequences of the string.
    -- Then find the total number of ascii characters,
    -- and the length of the longest subsequence.
    local len_of_longest_subseq, nr_ascii_chars = 0, 0
    for subseq in str:gmatch(ASCII_CHAR_PATTERN..'+') do
        len_of_longest_subseq = math.max(#subseq, len_of_longest_subseq)
        nr_ascii_chars        = nr_ascii_chars + #subseq
    end

    -- Perform probability calculation
    -- This heuristic is based on the observation that large numbers of
    -- ascii characters, and long subsequences are the primary indicators
    -- of ascii strings.
    return (len_of_longest_subseq + nr_ascii_chars) / (2 * #str)
end

--- Score how much `str` looks like UTF-8 encoded text.
-- @tparam string str
-- @treturn number ratio of bytes covered by UTF-8 shaped characters to the
--   total number of bytes; 1 for the empty string
local function probability_of_utf8_string (str)
    assert(type(str) == 'string')

    -- No bytes means no invalid bytes; also avoids 0/0 (NaN).
    if #str == 0 then  return 1  end

    -- Find numbers of valid utf8 bytes
    local valid_bytes = 0
    for char in iterate_utf8_chars(str) do
        valid_bytes = valid_bytes + #char
    end

    -- Calculate ratio of valid bytes to total number of bytes.
    return valid_bytes / #str
end

--- Score how much `str` looks like UTF-16 encoded text.
-- Not implemented: always returns 0.
-- @tparam string str (unused)
-- @treturn number always 0
local function probability_of_utf16_string (str)
    return 0
end

--- Score how much `str` looks like binary data.
-- @tparam string str
-- @treturn number weighted count of bytes outside the range 32-126, divided
--   by the string length; 0 for the empty string. Because NUL bytes are
--   weighted by 1.5 the score can exceed 1.
local function probability_of_binary_data (str)
    -- Binary data is kinda weird. One assumption we can make is that the byte
    -- values 0x00 and 0xFF will be popular, and that the rest will be almost
    -- equally distributed. It will also disregard most boundaries between
    -- encodings.
    assert(type(str) == 'string')

    -- Nothing to count in an empty string; also avoids 0/0 (NaN).
    if #str == 0 then  return 0  end

    -- Histogram of byte values.
    local bytes = {}
    for value = 0, 255 do  bytes[value] = 0  end
    for i = 1, #str do
        local byte = str:byte(i)
        bytes[byte] = bytes[byte] + 1
    end

    -- NUL bytes count one and a half times. Bytes 32-126 do not count at all.
    -- Tab, line feed and carriage return are outside that range, so they do
    -- count towards the score here.
    bytes[0] = bytes[0] * 1.5
    for value = 32, 126 do  bytes[value] = 0  end

    -- Sum over the explicit 0..255 range rather than relying on `#bytes`.
    local bytes_outside_ascii = 0
    for value = 0, 255 do
        bytes_outside_ascii = bytes_outside_ascii + bytes[value]
    end
    return bytes_outside_ascii / #str
end

--------------------------------------------------------------------------------

-- Scoring function for each known representation, keyed by name.
local str_representations = {
    ascii  = probability_of_ascii_string,
    utf8   = probability_of_utf8_string,
    utf16  = probability_of_utf16_string,
    binary = probability_of_binary_data,
}

-- Order in which representations are evaluated. Earlier entries win ties, so
-- plain ascii text (which scores 1 both as ascii and as utf8) is reported as
-- 'ascii'. `pairs` has no specified order and would make ties unpredictable.
local REPRESENTATION_PRIORITY = { 'ascii', 'utf8', 'utf16', 'binary' }

--- Score `str` against every known representation.
-- @tparam string str
-- @treturn table with the numeric fields `ascii_prob`, `utf8_prob`,
--   `utf16_prob` and `binary_prob`, and the string field `most_likely` naming
--   the highest scoring representation ('ascii' when nothing scores above 0).
return function (str)
    local str_info, most_likely, most_likely_prob = {}, 'ascii', 0
    for _, repr_name in ipairs(REPRESENTATION_PRIORITY) do
        local prob = str_representations[repr_name](str)
        str_info[repr_name..'_prob'] = prob
        -- Strict comparison: a later representation must beat, not merely
        -- match, the current best.
        if prob > most_likely_prob then
            most_likely, most_likely_prob = repr_name, prob
        end
    end
    str_info.most_likely = most_likely
    return str_info
end