--------------------------------------------------------------------------------
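
-- ASCII_CHAR_PATTERN matches printable ascii (0x20-0x7E) plus tab, LF and CR.
-- UNICODE_CHAR_PATTERN matches a non-NUL ascii byte or a utf8 lead byte
-- followed by any run of continuation bytes (0x80-0xBF); it does not check
-- that the number of continuation bytes is valid for the lead byte.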
local ASCII_CHAR_PATTERN = '[\32-\126\009\010\013]'
local UNICODE_CHAR_PATTERN = '[\01-\127\192-\255][\128-\191]*'

--------------------------------------------------------------------------------
-- Unicode stuff
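
-- Iterate over the characters of str as matched by UNICODE_CHAR_PATTERN:
-- ascii bytes are yielded one at a time, lead bytes are yielded together with
-- the continuation bytes that follow them, and unmatched bytes are skipped.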
local function iterate_utf8_chars (str)
    -- TODO: Detect invalid codepoints.
    return str:gmatch(UNICODE_CHAR_PATTERN)
end

--------------------------------------------------------------------------------
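
-- Example: for the utf8 string 'h\195\169llo' ('héllo', 6 bytes) the ascii
-- subsequences are 'h' and 'llo', giving (3 + 4) / (2 * 6) = 7/12.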
local function probability_of_ascii_string (str)
    assert(type(str) == 'string')

    -- Find the ascii subsequences of the string, then the total number of
    -- ascii characters and the length of the longest subsequence.
    local len_of_longest_subseq, nr_ascii_chars = 0, 0
    for subseq in str:gmatch(ASCII_CHAR_PATTERN..'+') do
        len_of_longest_subseq = math.max(#subseq, len_of_longest_subseq)
        nr_ascii_chars = nr_ascii_chars + #subseq
    end

    -- Perform the probability calculation. This heuristic is based on the
    -- observation that a high ascii character count and long ascii
    -- subsequences are the primary indicators of an ascii string.
    return (len_of_longest_subseq + nr_ascii_chars) / (2 * #str)
end
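
-- Note: because ascii bytes also match UNICODE_CHAR_PATTERN, a string of
-- printable ascii scores 1.0 here as well as in probability_of_ascii_string.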
local function probability_of_utf8_string (str)
    assert(type(str) == 'string')

    -- Find the number of valid utf8 bytes.
    local valid_bytes = 0
    for char in iterate_utf8_chars(str) do
        valid_bytes = valid_bytes + #char
    end

    -- Calculate the ratio of valid bytes to the total number of bytes.
    return valid_bytes / #str
end

local function probability_of_utf16_string (str)
    -- TODO: utf16 detection is not implemented yet, so this always reports 0.
    return 0
end
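
-- In effect this returns the fraction of bytes outside the printable ascii
-- range (0x20-0x7E), with NUL bytes weighted 1.5x; long printable runs
-- therefore push the score toward 0.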
local function probability_of_binary_data (str)
    -- Binary data is kinda weird. One assumption we can make is that the byte
    -- values 0x00 and 0xFF will be common, that the remaining byte values will
    -- be roughly evenly distributed, and that binary data largely ignores the
    -- boundaries between text encodings.
    local bytes = {}
    for i = 0, 255 do bytes[i] = 0 end
    for i = 1, #str do
        local byte = str:byte(i)
        bytes[byte] = bytes[byte] + 1
    end

    -- Weight NUL bytes more heavily and discard the printable ascii range.
    bytes[0] = bytes[0] * 1.5
    for i = 32, 126 do bytes[i] = 0 end

    -- Sum what is left and normalise by the string length.
    local bytes_outside_ascii = 0
    for i = 0, 255 do bytes_outside_ascii = bytes_outside_ascii + bytes[i] end
    return bytes_outside_ascii / #str
end

local str_representations = {
    ascii  = probability_of_ascii_string,
    utf8   = probability_of_utf8_string,
    utf16  = probability_of_utf16_string,
    binary = probability_of_binary_data,
}
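
-- Exported entry point: given a string, returns a table with one
-- '<name>_prob' field per representation above, plus a 'most_likely' field
-- naming the representation with the highest score.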
return function (str)
    local str_info, most_likely, most_likely_prob = {}, 'ascii', 0
    for repr_name, prob_func in pairs(str_representations) do
        -- Guard against 0/0 (NaN) probabilities for the empty string.
        local prob = #str > 0 and prob_func(str) or 0
        str_info[repr_name..'_prob'] = prob
        if prob >= most_likely_prob then
            most_likely, most_likely_prob = repr_name, prob
        end
    end
    str_info.most_likely = most_likely
    return str_info
end
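
-- Usage sketch, assuming this file is loaded as a module (the module name
-- below is illustrative, not part of this file):
--
--   local detect_string_repr = require 'detect_string_repr'
--   local info = detect_string_repr('h\195\169llo')
--   print(info.most_likely)    --> 'utf8'
--   print(info.ascii_prob)     --> 0.58333... (7/12, per the example above)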