55 lines
1.5 KiB
Lua
55 lines
1.5 KiB
Lua
|
|
||
|
require 'fun' ()
|
||
|
local utf8 = require 'utf8'
|
||
|
|
||
|
-- Character class matching one "legible" byte: the printable ASCII range
-- (space, \32, through tilde, \126) plus tab (\009), line feed (\010) and
-- carriage return (\013).
local ASCII_CHAR_PATTERN   = '[\32-\126\009\010\013]'

-- One byte in \01-\127 or \192-\255 followed by any number of bytes in
-- \128-\191, i.e. a lead byte plus its continuation bytes.
-- NOTE(review): not referenced by any code visible in this file.
local UNICODE_CHAR_PATTERN = '[\01-\127\192-\255][\128-\191]*'
|
||
|
|
||
|
--- Collect every maximal run of legible bytes in `str`.
-- A run is one or more consecutive bytes matching ASCII_CHAR_PATTERN.
-- @tparam string str  byte string to scan
-- @treturn table  array of the matched substrings, in order of appearance
--                 (empty when nothing matches)
local function ledgible_subsequences_in (str)
    local sub_seqs = {}
    -- Plain loop on purpose: a gmatch iterator hands back each match as its
    -- first return value, which triplet-style (gen, param, state) consumers
    -- interpret as iterator state rather than as the element to collect.
    for sub_seq in str:gmatch(ASCII_CHAR_PATTERN..'+') do
        sub_seqs[#sub_seqs + 1] = sub_seq
    end
    return sub_seqs
end
|
||
|
|
||
|
--- Score how likely `str` is to be plain ASCII text.
-- The score is the mean of two ratios:
--   * length of the longest legible run / total number of legible bytes
--   * total number of legible bytes     / total number of bytes
-- where the legible runs are those returned by `ledgible_subsequences_in`.
-- @tparam string str  byte string to score
-- @treturn number  the score; 0 when `str` is empty or has no legible bytes
local function probability_of_ascii_string (str)
    local nr_characters = #str
    -- Nothing to measure; also avoids the 0/0 (NaN) the ratios would give.
    if nr_characters == 0 then return 0 end

    -- Single pass computing both the sum and the maximum of the run lengths,
    -- so an empty list of runs simply leaves both at 0.
    local nr_ledgible_characters, len_of_longest_subseq = 0, 0
    for _, sub_seq in ipairs(ledgible_subsequences_in(str)) do
        local len = #sub_seq
        nr_ledgible_characters = nr_ledgible_characters + len
        if len > len_of_longest_subseq then
            len_of_longest_subseq = len
        end
    end

    -- No legible bytes at all: the first ratio would be 0/0.
    if nr_ledgible_characters == 0 then return 0 end

    return ((len_of_longest_subseq / nr_ledgible_characters)
          + (nr_ledgible_characters / nr_characters)) / 2
end
|
||
|
|
||
|
--- Score how likely `str` is to be UTF-8 encoded text.
-- The score is the fraction of bytes belonging to pieces that `utf8.iterate`
-- flags as valid.
-- NOTE(review): `utf8` is the module loaded by this file's `require 'utf8'`;
-- `iterate` is not part of the standard Lua 5.3 utf8 library, so this is
-- presumably a project-local module yielding (substring, is_valid) pairs --
-- confirm against that module.
-- @tparam string str  byte string to score
-- @treturn number  valid bytes / total bytes; 0 for the empty string
local function probability_of_utf8_string (str)
    local nr_bytes = #str
    -- Empty input would otherwise evaluate 0/0 (NaN).
    if nr_bytes == 0 then return 0 end

    local valid_bytes = 0
    for char, valid in utf8.iterate(str) do
        if valid then
            valid_bytes = valid_bytes + #char
        end
    end
    return valid_bytes / nr_bytes
end
|
||
|
|
||
|
--- Score how likely `str` is to be UTF-16 encoded text.
-- No detection is implemented: the input is ignored and every string scores 0.
-- @tparam string str  byte string to score (unused)
-- @treturn number  always 0
local function probability_of_utf16_string (str)
    local score = 0
    return score
end
|
||
|
|
||
|
--- Score how likely `str` is to be opaque binary data.
-- The input is ignored; a fixed score of 2/3 is returned for every string.
-- @tparam string str  byte string to score (unused)
-- @treturn number  always 2/3
local function probability_of_binary_data (str)
    local fixed_score = 2 / 3
    return fixed_score
end
|
||
|
|
||
|
-- Registry of candidate representations: maps each representation name to
-- the function that scores a string for it.
local str_representations = {
    ['ascii']  = probability_of_ascii_string,
    ['utf8']   = probability_of_utf8_string,
    ['utf16']  = probability_of_utf16_string,
    ['binary'] = probability_of_binary_data,
}
|
||
|
|
||
|
-- Names of the registered representations in a fixed (alphabetical) order.
-- `pairs` gives no ordering guarantee, so choosing a winner while iterating
-- the registry directly would let ties be decided by hash layout.
local repr_names = {}
for repr_name in pairs(str_representations) do
    repr_names[#repr_names + 1] = repr_name
end
table.sort(repr_names)

--- Score `str` against every registered representation.
-- @tparam string str  byte string to analyse
-- @treturn table  one `<name>_prob` field per entry of `str_representations`
--   holding that representation's score, plus `most_likely`, the name with
--   the highest score. Ties go to the alphabetically first name, so a string
--   scoring equally as 'ascii' and 'utf8' is reported as 'ascii'. If no
--   score is greater than 0, `most_likely` stays at its default, 'ascii'.
return function (str)
    local str_info, most_likely, most_likely_prob = {}, 'ascii', 0
    for _, repr_name in ipairs(repr_names) do
        local prob = str_representations[repr_name](str)
        str_info[repr_name..'_prob'] = prob
        -- Strict comparison keeps the earlier name on a tie.
        if prob > most_likely_prob then
            most_likely, most_likely_prob = repr_name, prob
        end
    end
    str_info.most_likely = most_likely
    return str_info
end
|