1
0
pretty/analyze_byte_string.lua

68 lines
1.9 KiB
Lua

--require 'fun' ()
local utf8 = require 'utf8'
-- Character class matching a single "plain text" ASCII byte: the printable
-- range 32-126 (space through '~') plus tab (9), line feed (10) and
-- carriage return (13).
local ASCII_CHAR_PATTERN = '[\32-\126\009\010\013]'
-- Matches one byte in 1-127 or 192-255 followed by any number of bytes in
-- 128-191, i.e. a lead byte plus trailing continuation bytes. It does not
-- check that the number of continuation bytes fits the lead byte.
-- NOTE(review): not referenced in the visible part of this file; confirm
-- whether it is still needed.
local UNICODE_CHAR_PATTERN = '[\01-\127\192-\255][\128-\191]*'
-- Estimate how likely `str` is plain ASCII text.
--
-- The score is the mean of two ratios, each relative to the byte length of
-- `str`:
--   * the number of bytes matched by ASCII_CHAR_PATTERN, and
--   * the length of the longest contiguous run of such bytes.
-- Both a high share of text bytes and long unbroken runs are treated as
-- indicators of ASCII text.
--
-- @param str  the byte string to analyze; anything but a string raises an error
-- @return     a number between 0 and 1; 1 for the empty string
local function probability_of_ascii_string (str)
  assert(type(str) == 'string', 'expected a string')

  -- An empty string holds no byte that could disqualify it. Returning early
  -- also avoids the 0/0 (NaN) the ratio below would otherwise produce.
  if #str == 0 then return 1 end

  -- Walk every maximal run of ASCII text bytes, tracking the total number of
  -- such bytes and the length of the longest run.
  local longest_run, nr_ascii_chars = 0, 0
  for run in str:gmatch(ASCII_CHAR_PATTERN..'+') do
    longest_run = math.max(#run, longest_run)
    nr_ascii_chars = nr_ascii_chars + #run
  end

  -- Each term is at most #str, so dividing by 2 * #str averages the two ratios.
  return (longest_run + nr_ascii_chars) / (2 * #str)
end
-- Estimate how likely `str` is UTF-8 encoded text.
--
-- The score is the fraction of the bytes of `str` that belong to characters
-- reported as valid by `utf8.iterate`.
-- NOTE(review): `utf8.iterate` comes from the module loaded with
-- require 'utf8'; the standard Lua `utf8` library has no such function. It is
-- assumed to yield each character's bytes together with a validity flag;
-- confirm against that module.
--
-- @param str  the byte string to analyze; anything but a string raises an error
-- @return     valid bytes divided by total bytes; 1 for the empty string
local function probability_of_utf8_string (str)
  assert(type(str) == 'string', 'expected a string')

  -- An empty string contains no invalid bytes. Returning early also avoids
  -- the 0/0 (NaN) the ratio below would otherwise produce.
  if #str == 0 then return 1 end

  -- Sum the byte lengths of all characters flagged as valid.
  local valid_bytes = 0
  for char, valid in utf8.iterate(str) do
    if valid then valid_bytes = valid_bytes + #char end
  end

  return valid_bytes / #str
end
-- Score for the UTF-16 interpretation of `str`.
-- The argument is not inspected; the result is always 0.
-- NOTE(review): presumably a stub until UTF-16 detection exists; confirm.
--
-- @param str  the byte string to analyze (ignored)
-- @return     0
local probability_of_utf16_string = function (str)
  local score = 0
  return score
end
-- Score for the "arbitrary binary data" interpretation of `str`.
-- The argument is not inspected; the result is always 2/3.
-- NOTE(review): presumably a fixed baseline the other scores are compared
-- against; confirm intent.
--
-- @param str  the byte string to analyze (ignored)
-- @return     2/3
local probability_of_binary_data = function (str)
  local score = 2/3
  return score
end
-- Scoring function for each candidate representation, keyed by the
-- representation's name. Every function takes the byte string and returns a
-- number; the name is also used to build the `<name>_prob` result fields.
local str_representations = {
ascii = probability_of_ascii_string,
utf8 = probability_of_utf8_string ,
utf16 = probability_of_utf16_string,
binary = probability_of_binary_data,
}
-- Rank used only to break ties between equally scored representations:
-- the lower rank wins. Names missing from this table rank last.
local REPR_PRIORITY = { ascii = 1, utf8 = 2, utf16 = 3, binary = 4 }

-- Decide whether `candidate` should replace `current` when both scored the same.
local function wins_tie (candidate, current)
  local candidate_rank = REPR_PRIORITY[candidate] or math.huge
  local current_rank = REPR_PRIORITY[current] or math.huge
  if candidate_rank ~= current_rank then
    return candidate_rank < current_rank
  end
  -- Unranked names fall back to name order so the outcome stays deterministic.
  return candidate < current
end

-- Score `str` against every entry of `str_representations` and report the
-- most likely one.
--
-- @param str  the byte string to analyze
-- @return     a table with one `<name>_prob` field per representation (the
--             value returned by its scoring function) and a `most_likely`
--             field naming the best-scoring representation; 'ascii' if no
--             representation scores above 0
return function (str)
  local str_info, most_likely, most_likely_prob = {}, 'ascii', 0
  for repr_name, prob_func in pairs(str_representations) do
    local prob = prob_func(str)
    str_info[repr_name..'_prob'] = prob
    -- `pairs` visits entries in an unspecified order, so ties must not be
    -- decided by whichever entry happens to come last. A NaN score fails both
    -- comparisons and therefore can never be selected.
    if prob > most_likely_prob
    or (prob == most_likely_prob and wins_tie(repr_name, most_likely)) then
      most_likely, most_likely_prob = repr_name, prob
    end
  end
  str_info.most_likely = most_likely
  return str_info
end