--------------------------------------------------------------------------------
-- Printable ascii (0x20-0x7E) plus tab, line feed and carriage return.
local ASCII_CHAR_PATTERN = '[\32-\126\009\010\013]'
-- One utf8 sequence: an ascii byte or a lead byte (0xC0-0xFF) followed by any
-- run of continuation bytes (0x80-0xBF); sequence lengths are not validated.
local UNICODE_CHAR_PATTERN = '[\01-\127\192-\255][\128-\191]*'

--------------------------------------------------------------------------------
-- Unicode stuff

local function iterate_utf8_chars (str)
    -- TODO: Detect invalid codepoints.
    return str:gmatch(UNICODE_CHAR_PATTERN)
end
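
-- For illustration: iterating over the utf8 string 'h\195\169llo' ("héllo")
-- yields the five byte sequences 'h', '\195\169', 'l', 'l', 'o'.
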
--------------------------------------------------------------------------------

local function probability_of_ascii_string (str)
    assert(type(str) == 'string')

    -- Find ascii subsequences of the string.
    -- Then find the total number of ascii characters,
    -- and the length of the longest subsequence.
    local len_of_longest_subseq, nr_ascii_chars = 0, 0
    for subseq in str:gmatch(ASCII_CHAR_PATTERN..'+') do
        len_of_longest_subseq = math.max(#subseq, len_of_longest_subseq)
        nr_ascii_chars = nr_ascii_chars + #subseq
    end

    -- Perform the probability calculation.
    -- This heuristic is based on the observation that large numbers of ascii
    -- characters and long subsequences are the primary indicators of ascii
    -- strings.
    return (len_of_longest_subseq + nr_ascii_chars) / (2 * #str)
end
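
-- Worked example of the heuristic above: for the 8-byte string 'abc\200\201def'
-- the pattern finds the runs 'abc' and 'def', so nr_ascii_chars = 6 and
-- len_of_longest_subseq = 3, giving (3 + 6) / (2 * 8) = 0.5625. A string that
-- is entirely ascii always scores exactly 1.0.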

local function probability_of_utf8_string (str)
    assert(type(str) == 'string')

    -- Find the number of valid utf8 bytes.
    local valid_bytes = 0
    for char in iterate_utf8_chars(str) do
        valid_bytes = valid_bytes + #char
    end

    -- Calculate the ratio of valid bytes to the total number of bytes.
    return valid_bytes / #str
end
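
-- Note that plain ascii also scores 1.0 here (ascii is a subset of utf8) and
-- that sequence lengths are not validated, so e.g. 'h\195\169llo' scores 1.0
-- while a run of stray continuation bytes such as '\128\129\130' scores 0.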

local function probability_of_utf16_string (str)
    -- Not implemented yet; utf16 always reports a probability of zero.
    return 0
end

local function probability_of_binary_data (str)
    -- Binary data is kinda weird. One assumption we can make is that the byte
    -- values 0x00 and 0xFF will be popular, and that the rest will be almost
    -- equally distributed. This heuristic also disregards most boundaries
    -- between encodings.
    local bytes = {}
    for i = 0, 255 do bytes[i] = 0 end
    for i = 1, #str do
        local byte = str:byte(i)
        bytes[byte] = bytes[byte] + 1
    end
    -- Weight the NUL count and discard the printable ascii range.
    bytes[0] = bytes[0] * 1.5
    for i = 32, 126 do bytes[i] = 0 end
    -- Ratio of the remaining byte counts to the total length.
    local bytes_outside_ascii = 0
    for i = 0, #bytes do bytes_outside_ascii = bytes_outside_ascii + bytes[i] end
    return bytes_outside_ascii / #str
end
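
-- Worked example: for the 4-byte string '\0\0AB' the counts are two NUL bytes
-- (weighted to 3) and two printable bytes (zeroed out), giving 3 / 4 = 0.75,
-- whereas probability_of_ascii_string('\0\0AB') is only (2 + 2) / (2 * 4) = 0.5.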

local str_representations = {
    ascii  = probability_of_ascii_string,
    utf8   = probability_of_utf8_string,
    utf16  = probability_of_utf16_string,
    binary = probability_of_binary_data,
}

return function (str)
    local str_info, most_likely, most_likely_prob = {}, 'ascii', 0
    for repr_name, prob_func in pairs(str_representations) do
        local prob = prob_func(str)
        str_info[repr_name..'_prob'] = prob
        if prob >= most_likely_prob then
            most_likely, most_likely_prob = repr_name, prob
        end
    end
    str_info.most_likely = most_likely
    return str_info
end
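
-- Usage sketch (the module path 'string_repr' below is hypothetical; require
-- this file by whatever path it actually lives at):
--
--   local guess = require 'string_repr'
--   local info = guess('hello, world')
--   print(info.most_likely)    --> 'ascii' or 'utf8' (both score 1.0 for pure
--                              --  ascii input; ties go to whichever pairs()
--                              --  visits last)
--   print(info.binary_prob)    --> 0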