--require 'fun' ()
local utf8 = require 'utf8'

-- Lua pattern matching one byte of "text-like" ASCII: the printable range
-- (decimal 32-126) plus tab (9), line feed (10) and carriage return (13).
local ASCII_CHAR_PATTERN   = '[\32-\126\009\010\013]'
-- Lua pattern matching one byte in 1-127 or 192-255 followed by zero or more
-- bytes in 128-191.
-- NOTE(review): presumably intended to match a single UTF-8 encoded character
-- (lead byte plus continuation bytes); it is not referenced in the visible
-- code -- confirm whether it is still needed.
local UNICODE_CHAR_PATTERN = '[\01-\127\192-\255][\128-\191]*'


--- Score how strongly `str` looks like plain ASCII text.
-- The score averages two ratios: the share of bytes that match
-- ASCII_CHAR_PATTERN, and the length of the longest unbroken run of such
-- bytes relative to the whole string. Both ratios are at most 1, so the
-- result lies in [0, 1].
-- Raises an assertion error when `str` is not a string.
-- @tparam string str  bytes to inspect
-- @treturn number score in [0, 1]; 0 for the empty string
local function probability_of_ascii_string (str)
	assert(type(str) == 'string')

	-- An empty string carries no evidence either way. Without this guard the
	-- final division would be 0/0, i.e. NaN, which compares false against
	-- every number and silently poisons any later ranking.
	if #str == 0 then  return 0  end

	-- Walk every maximal run of ASCII bytes, tracking both the total number
	-- of ASCII bytes and the length of the longest single run.
	local len_of_longest_subseq, nr_ascii_chars  =  0, 0
	for subseq in str:gmatch(ASCII_CHAR_PATTERN..'+') do
		len_of_longest_subseq  =  math.max(#subseq, len_of_longest_subseq)
		nr_ascii_chars         =  nr_ascii_chars + #subseq
	end

	-- Heuristic: many ASCII bytes and long unbroken runs are the primary
	-- indicators of an ASCII string, so the two ratios are weighted equally.
	return (len_of_longest_subseq + nr_ascii_chars) / (2 * #str)
end

--- Score how strongly `str` looks like UTF-8 encoded text.
-- The score is the fraction of the string's bytes that belong to sequences
-- reported as valid by `utf8.iterate`.
-- Raises an assertion error when `str` is not a string.
-- @tparam string str  bytes to inspect
-- @treturn number ratio of valid bytes to total bytes; 0 for the empty string
local function probability_of_utf8_string (str)
	assert(type(str) == 'string')

	-- An empty string carries no evidence either way. Without this guard the
	-- final division would be 0/0, i.e. NaN, which compares false against
	-- every number and silently poisons any later ranking.
	if #str == 0 then  return 0  end

	-- Count the bytes that belong to valid sequences.
	-- NOTE(review): assumes utf8.iterate yields (char, valid) pairs where
	-- `char` is the byte sequence just consumed -- confirm against the
	-- 'utf8' module this file requires.
	local valid_bytes  =  0
	for char, valid in utf8.iterate(str) do
		if valid then  valid_bytes = valid_bytes + #char  end
	end

	-- Ratio of valid bytes to the total number of bytes.
	return valid_bytes / #str
end

--- Score for the UTF-16 representation.
-- The argument is not inspected: the result is the constant 0 for every
-- input (presumably a placeholder until real UTF-16 detection exists).
-- @param str  candidate string (ignored)
-- @treturn number always 0
local function probability_of_utf16_string (str)
	local constant_score = 0
	return constant_score
end

--- Score for the "raw binary data" representation.
-- The argument is not inspected: the result is the fixed value 2/3 for every
-- input, so it acts as a constant baseline among the scores.
-- @param str  candidate string (ignored)
-- @treturn number always 2/3
local function probability_of_binary_data (str)
	local baseline_score = 2/3
	return baseline_score
end

-- Maps each candidate representation name to the function that scores it.
-- The names double as key prefixes in the module's result table
-- ('<name>_prob') and as the possible values of its `most_likely` field.
local str_representations = {
	ascii  = probability_of_ascii_string,
	utf8   = probability_of_utf8_string ,
	utf16  = probability_of_utf16_string,
	binary = probability_of_binary_data,
}

-- Tie-break ranking, most specific representation first. Scores can tie
-- exactly (a pure-ASCII string scores 1 as both ascii and utf8), and
-- `pairs` visits keys in an unspecified order, so the winner must not depend
-- on traversal order.
local REPR_RANK = { ascii = 1, utf8 = 2, utf16 = 3, binary = 4 }

-- Names of all entries in str_representations in a fixed evaluation order:
-- ranked names first, then any unranked names alphabetically. Built once at
-- load time.
local ordered_repr_names = {}
for repr_name in pairs(str_representations) do
	ordered_repr_names[#ordered_repr_names + 1] = repr_name
end
table.sort(ordered_repr_names, function (a, b)
	local rank_a = REPR_RANK[a] or math.huge
	local rank_b = REPR_RANK[b] or math.huge
	if rank_a ~= rank_b then  return rank_a < rank_b  end
	return a < b
end)

--- Score `str` against every known representation and pick the best one.
-- @tparam string str  bytes to classify
-- @treturn table  one '<name>_prob' field per entry of str_representations
--   holding that scorer's result, plus `most_likely`, the name with the
--   highest score. Ties go to the representation evaluated first (ascii,
--   utf8, utf16, binary). If no score exceeds 0, `most_likely` stays 'ascii'.
return function (str)
	local str_info = {}
	local most_likely, most_likely_prob = 'ascii', 0
	for _, repr_name in ipairs(ordered_repr_names) do
		local prob = str_representations[repr_name](str)
		str_info[repr_name..'_prob'] = prob
		-- Strict comparison: on a tie the earlier, more specific entry wins.
		-- A NaN score compares false and therefore never wins.
		if prob > most_likely_prob then
			most_likely, most_likely_prob  =  repr_name, prob
		end
	end
	str_info.most_likely = most_likely
	return str_info
end