diff --git a/analyze_byte_string.lua b/analyze_byte_string.lua index 8327ae5..61c202c 100644 --- a/analyze_byte_string.lua +++ b/analyze_byte_string.lua @@ -1,10 +1,18 @@ ---require 'fun' () -local utf8 = require 'utf8' +-------------------------------------------------------------------------------- local ASCII_CHAR_PATTERN = '[\32-\126\009\010\013]' local UNICODE_CHAR_PATTERN = '[\01-\127\192-\255][\128-\191]*' +-------------------------------------------------------------------------------- +-- Unicode stuff + +local function iterate_utf8_chars (str) + -- TODO: Detect invalid codepoints. + return str:gmatch(UNICODE_CHAR_PATTERN) +end + +-------------------------------------------------------------------------------- local function probability_of_ascii_string (str) assert(type(str) == 'string') @@ -30,8 +38,8 @@ local function probability_of_utf8_string (str) -- Find numbers of valid utf8 bytes local valid_bytes = 0 - for char, valid in utf8.iterate(str) do - if valid then valid_bytes = valid_bytes + #char end + for char in iterate_utf8_chars(str) do + valid_bytes = valid_bytes + #char end -- Calculate ratio of valid bytes to total number of bytes. @@ -43,7 +51,23 @@ local function probability_of_utf16_string (str) end local function probability_of_binary_data (str) - return 2/3 + -- Binary data is kinda weird. One assumption we can make is that the byte + -- values 0x00 and 0xFF will be popular, and that the rest will be almost + -- equally distributed. It will also disregard most boundaries between + -- encodings. + local bytes = {} + for i = 0, 255 do bytes[i] = 0 end + for i = 1, #str do + local byte = str:byte(i) + bytes[byte] = bytes[byte] + 1 + end + -- + bytes[0] = bytes[0] * 1.5 + for i = 32, 126 do bytes[i] = 0 end + -- + local bytes_outside_ascii = 0 + for i = 0, #bytes do bytes_outside_ascii = bytes_outside_ascii + bytes[i] end + return bytes_outside_ascii/#str end local str_representations = { diff --git a/common.lua b/common.lua index dee6d04..3f96154 100644 --- a/common.lua +++ b/common.lua @@ -18,9 +18,35 @@ local function enum (t) return e end +-------------------------------------------------------------------------------- +-- Unicode + +local UNICODE_CHAR_PATTERN = '[\01-\127\192-\255][\128-\191]*' + +local UNICODE_ZERO_WIDTH_CHARACTERS = {} +for i = 128, 191 do UNICODE_ZERO_WIDTH_CHARACTERS['\204'..string.char(i)] = true end +for i = 128, 175 do UNICODE_ZERO_WIDTH_CHARACTERS['\205'..string.char(i)] = true end + + +local function iterate_utf8_chars (str) + -- TODO: Detect invalid codepoints. + return str:gmatch(UNICODE_CHAR_PATTERN) +end + +local function utf8_string_length (str) + local len = 0 + for char in iterate_utf8_chars(str) do + if not UNICODE_ZERO_WIDTH_CHARACTERS[char] then + len = len + 1 + end + end + return len +end + -------------------------------------------------------------------------------- return { TABLE_TYPE = enum { 'EMPTY', 'SEQUENCE', 'STRING_MAP', 'PURE_MAP', 'MIXED', 'SET' }, DISPLAY = { HIDE = 1, SMALL = 2, INLINE = 3, EXPAND = 4 }, + utf8_string_length = utf8_string_length, } diff --git a/function.lua b/function.lua index 8d19f87..c5b3449 100644 --- a/function.lua +++ b/function.lua @@ -60,6 +60,7 @@ simplest, and move towards abstraction. local LIBRARY = require((... and select('1', ...):match('.+%.') or '')..'library') or {} local DISPLAY = assert(require((... and select('1', ...):match('.+%.') or '')..'common'), '[pretty]: Could not load vital library: common') . DISPLAY +local utf8_string_length = assert(require((... and select('1', ...):match('.+%.') or '')..'common'), '[pretty]: Could not load vital library: common') . utf8_string_length -- Constants @@ -129,7 +130,7 @@ local function get_line_index (str, line_nr) local index = 0 for _ = 2, line_nr do index = str:find('\n', index, true) - if not index then return #str end + if not index then return utf8_string_length(str) end index = index + 1 end return index @@ -238,7 +239,7 @@ local function width_of_strings_in_l (l, start_i, end_i) -- FIXME: Copy of the one in pretty.lua local width = 0 for i = start_i or 1, (end_i or #l) do - width = width + #l[i] + width = width + utf8_string_length(l[i]) end return width end @@ -319,7 +320,7 @@ return function (value, display, l, format_value) if display ~= DISPLAY.EXPAND then l[#l+1] = (function_body:sub(1,1) == '\n') and '' or ' ' l[#l+1] = function_body - l[#l+1] = { 'align', 'func_end', #function_body } + l[#l+1] = { 'align', 'func_end', utf8_string_length(function_body) } l[#l+1] = (function_body:sub(-1) == '\n' or function_body == '') and '' or ' ' return l 'end' end diff --git a/pretty.lua b/pretty.lua index c2a244f..96ee85f 100644 --- a/pretty.lua +++ b/pretty.lua @@ -201,6 +201,8 @@ end -------------------------------------------------------------------------------- -- Formatting Util +local length_of_utf8_string = import 'common' . utf8_string_length + local function width_of_strings_in_l (l, start_i, stop_i) -- Argument fixing and Error Checking @@ -214,7 +216,7 @@ local function width_of_strings_in_l (l, start_i, stop_i) -- Do stuff local width = 0 for i = start_i, stop_i do - width = width + ((type(l[i]) ~= 'string') and 1 or #l[i]) + width = width + ((type(l[i]) ~= 'string') and 1 or length_of_utf8_string(l[i])) end return width end @@ -417,7 +419,7 @@ local DISPLAY = import 'common' . DISPLAY local function format_key_and_value_string_map (key, value, display, l, format_value) l[#l+1] = key - l[#l+1] = { 'align', 'key', #key } + l[#l+1] = { 'align', 'key', length_of_utf8_string(key) } l[#l+1] = ' = ' return format_value(value, display, l) end diff --git a/test/test_cdata.lua b/test/test_cdata.lua index 1165db5..824726c 100644 --- a/test/test_cdata.lua +++ b/test/test_cdata.lua @@ -2,7 +2,7 @@ local SUITE = require 'TestSuite' 'cdata' -- Only relevant in LUAJIT. -if type(jit) ~= 'table' or true then return SUITE end +if type(jit) ~= 'table' then return SUITE end SUITE:setEnvironment{ format = require 'pretty', @@ -58,6 +58,14 @@ SUITE:addTest('More binary', function () assert_equal('binary', info.most_likely) end) +SUITE:addTest('Classify an actual binary as binary', function () + local f = io.open('/usr/bin/ln', 'r') + local str = f:read '*all' + f:close() + local info = analyze_byte_string(str) + assert_equal('binary', info.most_likely) +end) + -------------------------------------------------------------------------------- format_test { @@ -109,7 +117,7 @@ do } end -SUITE:addTest('a_very_small_part_of_math', function () +SUITE:addTest('a very small amount of math ruins everything', function () local p = ffi.new('char[1]') p[0] = 27 local actual_result = format(p + 0, {}) diff --git a/test/test_function.lua b/test/test_function.lua index a6aebf4..4b94ca4 100644 --- a/test/test_function.lua +++ b/test/test_function.lua @@ -463,7 +463,7 @@ if HAS_UNICODE_IDEN then name = 'Align functions with unicode-named parameters nicely', adv_getlocal = true, input = loadstring 'return {\nfunction (ψ) return ψ end,\nfunction (b) return b end\n}' (), - expect = '{\n function (ψ) return ψ end\n function (b) return b end\n}', + expect = '{\n function (ψ) return ψ end,\n function (b) return b end\n}', } end diff --git a/test/test_pretty.lua b/test/test_pretty.lua index 4de8ad5..c9e187b 100644 --- a/test/test_pretty.lua +++ b/test/test_pretty.lua @@ -395,6 +395,66 @@ SUITE:addTest('UseCase: Can load function from file that is shortly deleted', fu assert(true) end) +local BIG_EXAMPLE_TABLE = [[ +return { + [0] = 21082, [1] = 696, + [2] = 463, [3] = 235, + [4] = 315, [5] = 312, + [6] = 204, [7] = 124, + [8] = 692, [9] = 84, + [10] = 248, [11] = 148, + [12] = 108, [13] = 109, + [14] = 1019, [15] = 1211, + [16] = 470, [17] = 73, + [18] = 121, [19] = 36, + [20] = 149, [21] = 514, + [22] = 38, [23] = 45, + [24] = 353, [25] = 27, + [26] = 27, [27] = 51, + [28] = 84, [29] = 61, + [30] = 29, [31] = 448, + [32] = 2064, [33] = 65, + [34] = 34, [35] = 20, + [36] = 859, [37] = 239, + [38] = 24, [39] = 41, + [40] = 297, [41] = 95, + [42] = 43, [43] = 30, + [44] = 202, [45] = 123, + [46] = 243, [47] = 98, + [48] = 207, [49] = 484, + [50] = 31, [51] = 59, + [52] = 51, [53] = 118, + [54] = 27, [55] = 22, + [56] = 227, [57] = 168, + [58] = 55, [59] = 38, + [60] = 74, [61] = 106, + [62] = 62, [63] = 40, + [64] = 170, [65] = 857, + [66] = 412, [67] = 136, + [68] = 737, [69] = 238, + [70] = 64, [71] = 119, + [72] = 2567, [73] = 481, + [74] = 50, [75] = 55, + [76] = 714, [77] = 189, + [78] = 61, [79] = 55, + [80] = 114, [81] = 26, + [82] = 69, [83] = 150, + [84] = 238, [85] = 172, + [86] = 65, [87] = 81, + [88] = 102, [89] = 39, + [90] = 30, [91] = 154, + [92] = 155, [93] = 191, + [94] = 75, [95] = 185, + [96] = 62, [97] = 334, + [98] = 119, [99] = 217, + [100] = 261 +}]] + +SUITE:addTest('UseCase: Big Example Table', function () + assert_equal(BIG_EXAMPLE_TABLE, 'return '..format(loadstring(BIG_EXAMPLE_TABLE)())) +end) + + -------------------------------------------------------------------------------- return SUITE