Cdata is now being tested again, and some Unicode handling has been implemented.

2017-10-22 14:26:19 +02:00 · 2017-10-22 14:26:19 +02:00 · 856d9df690
commit 856d9df690
parent 94dd6acb0c
7 changed files with 134 additions and 13 deletions
--- a/analyze_byte_string.lua
+++ b/analyze_byte_string.lua
@ -1,10 +1,18 @@

--require 'fun' ()
-local utf8 = require 'utf8'
+--------------------------------------------------------------------------------

 local ASCII_CHAR_PATTERN   = '[\32-\126\009\010\013]'
 local UNICODE_CHAR_PATTERN = '[\01-\127\192-\255][\128-\191]*'

+--------------------------------------------------------------------------------
+-- Unicode stuff
+
+-- Returns a gmatch iterator yielding each substring of `str` that matches
+-- UNICODE_CHAR_PATTERN: one lead byte (1-127 or 192-255) followed by any
+-- number of continuation bytes (128-191).
+-- Bytes the pattern cannot start on (NUL, or a continuation byte that is not
+-- attached to a preceding lead byte) are skipped silently, not reported.
+local function iterate_utf8_chars (str)
+	-- TODO: Detect invalid codepoints.
+	return str:gmatch(UNICODE_CHAR_PATTERN)
+end
+
+--------------------------------------------------------------------------------

 local function probability_of_ascii_string (str)
 	assert(type(str) == 'string')
@ -30,8 +38,8 @@ local function probability_of_utf8_string (str)

 	-- Find numbers of valid utf8 bytes
 	local valid_bytes  =  0
-	for char, valid in utf8.iterate(str) do
-		if valid then  valid_bytes = valid_bytes + #char  end
+	for char in iterate_utf8_chars(str) do
+		valid_bytes = valid_bytes + #char
 	end

 	-- Calculate ratio of valid bytes to total number of bytes.
@ -43,7 +51,23 @@ local function probability_of_utf16_string (str)
 end

 local function probability_of_binary_data (str)
-	return 2/3
+	-- Scores how much `str` looks like raw binary data.
+	--
+	-- Binary data is kinda weird. One assumption we can make is that the byte
+	-- values 0x00 and 0xFF will be popular, and that the rest will be almost
+	-- equally distributed. It will also disregard most boundaries between
+	-- encodings.
+	--
+	-- Returns the weighted number of bytes outside printable ASCII divided by
+	-- the string length. Only 0x00 is currently given extra weight (x1.5), so
+	-- the result can exceed 1 for NUL-heavy input.
+
+	-- An empty string carries no evidence; return 0 instead of computing 0/0.
+	if #str == 0 then  return 0  end
+
+	-- Histogram of byte values, indexed 0..255.
+	local bytes = {}
+	for i = 0, 255 do  bytes[i] = 0  end
+	for i = 1, #str do
+		local byte = str:byte(i)
+		bytes[byte] = bytes[byte] + 1
+	end
+	-- Boost NUL bytes and discard everything in the printable ASCII range.
+	bytes[0] = bytes[0] * 1.5
+	for i = 32, 126 do  bytes[i] = 0  end
+	-- Sum what is left. The upper bound is spelled out because `#bytes` on a
+	-- table that also has a 0 index relies on border behaviour.
+	local bytes_outside_ascii = 0
+	for i = 0, 255 do  bytes_outside_ascii = bytes_outside_ascii + bytes[i]  end
+	return bytes_outside_ascii/#str
 end

 local str_representations = {
--- a/common.lua
+++ b/common.lua
@ -18,9 +18,35 @@ local function enum (t)
 	return e
 end

+--------------------------------------------------------------------------------
+-- Unicode
+
+-- Lua pattern for one character as split by this module: a lead byte (1-127
+-- or 192-255) followed by any number of continuation bytes (128-191). It does
+-- not check that the number of continuation bytes fits the lead byte.
+local UNICODE_CHAR_PATTERN = '[\01-\127\192-\255][\128-\191]*'
+
+-- Set (byte sequence -> true) of characters that utf8_string_length leaves
+-- out of its count. '\204' followed by 128..191 encodes U+0300..U+033F and
+-- '\205' followed by 128..175 encodes U+0340..U+036F, which together make up
+-- the Combining Diacritical Marks block.
+local UNICODE_ZERO_WIDTH_CHARACTERS = {}
+for i = 128, 191 do  UNICODE_ZERO_WIDTH_CHARACTERS['\204'..string.char(i)] = true  end
+for i = 128, 175 do  UNICODE_ZERO_WIDTH_CHARACTERS['\205'..string.char(i)] = true  end
+
+
+-- Returns a gmatch iterator yielding each substring of `str` that matches
+-- UNICODE_CHAR_PATTERN: one lead byte followed by its trailing continuation
+-- bytes. Bytes the pattern cannot start on (NUL, or a continuation byte that
+-- is not attached to a preceding lead byte) are skipped silently.
+local function iterate_utf8_chars (str)
+	-- TODO: Detect invalid codepoints.
+	return str:gmatch(UNICODE_CHAR_PATTERN)
+end
+
+local function utf8_string_length (str)
+	-- Counts the characters of `str` as split by iterate_utf8_chars, leaving
+	-- out every one listed in UNICODE_ZERO_WIDTH_CHARACTERS.
+	local visible_chars = 0
+	for char in iterate_utf8_chars(str) do
+		local is_zero_width = UNICODE_ZERO_WIDTH_CHARACTERS[char]
+		-- `0` is truthy in Lua, so the and/or form is safe here.
+		visible_chars = visible_chars + (is_zero_width and 0 or 1)
+	end
+	return visible_chars
+end
+
 --------------------------------------------------------------------------------

 return {
 	TABLE_TYPE = enum { 'EMPTY', 'SEQUENCE', 'STRING_MAP', 'PURE_MAP', 'MIXED', 'SET' },
 	DISPLAY = { HIDE = 1, SMALL = 2, INLINE = 3, EXPAND = 4 },
+	utf8_string_length = utf8_string_length,
 }
--- a/function.lua
+++ b/function.lua
@ -60,6 +60,7 @@ simplest, and move towards abstraction.

 local LIBRARY =  require((... and select('1', ...):match('.+%.') or '')..'library') or {}
 local DISPLAY =  assert(require((... and select('1', ...):match('.+%.') or '')..'common'), '[pretty]: Could not load vital library: common') . DISPLAY
+local utf8_string_length =  assert(require((... and select('1', ...):match('.+%.') or '')..'common'), '[pretty]: Could not load vital library: common') . utf8_string_length

 -- Constants

@ -129,7 +130,7 @@ local function get_line_index (str, line_nr)
    local index = 0
    for _ = 2, line_nr do
        index = str:find('\n', index, true)
-        if not index then  return #str  end
+        if not index then  return utf8_string_length(str)  end
        index = index + 1
    end
    return index
@ -238,7 +239,7 @@ local function width_of_strings_in_l (l, start_i, end_i)
 	-- FIXME: Copy of the one in pretty.lua
    local  width = 0
    for i = start_i or 1, (end_i or #l) do
-        width = width + #l[i]
+        width = width + utf8_string_length(l[i])
    end
    return width
 end
@ -319,7 +320,7 @@ return function (value, display, l, format_value)
    if display ~= DISPLAY.EXPAND  then
 		l[#l+1] = (function_body:sub(1,1) == '\n') and '' or ' '
 		l[#l+1] = function_body
-		l[#l+1] = { 'align', 'func_end', #function_body }
+		l[#l+1] = { 'align', 'func_end', utf8_string_length(function_body) }
 		l[#l+1] = (function_body:sub(-1) == '\n' or function_body == '') and '' or ' '
 		return l 'end'
    end
--- a/pretty.lua
+++ b/pretty.lua
@ -201,6 +201,8 @@ end
 --------------------------------------------------------------------------------
 -- Formatting Util

+local length_of_utf8_string  =  import 'common' . utf8_string_length
+
 local function width_of_strings_in_l (l, start_i, stop_i)

    -- Argument fixing and Error Checking
@ -214,7 +216,7 @@ local function width_of_strings_in_l (l, start_i, stop_i)
    -- Do stuff
    local  width = 0
    for i = start_i, stop_i do
-        width = width + ((type(l[i]) ~= 'string') and 1 or #l[i])
+        width = width + ((type(l[i]) ~= 'string') and 1 or length_of_utf8_string(l[i]))
    end
    return width
 end
@ -417,7 +419,7 @@ local DISPLAY          	=  import 'common' . DISPLAY

 local function format_key_and_value_string_map (key, value, display, l, format_value)
+    -- Appends `key`, an { 'align', 'key', width } marker and ' = ' to the
+    -- list `l`, then returns whatever format_value(value, display, l) returns.
     l[#l+1] = key
-    l[#l+1] = { 'align', 'key', #key }
+    -- The marker's width comes from length_of_utf8_string, not the byte count.
+    l[#l+1] = { 'align', 'key', length_of_utf8_string(key) }
     l[#l+1] = ' = '
     return format_value(value, display, l)
 end
--- a/test/test_cdata.lua
+++ b/test/test_cdata.lua
@ -2,7 +2,7 @@
 local SUITE = require 'TestSuite' 'cdata'

 -- Only relevant in LUAJIT.
-if type(jit) ~= 'table' or true then  return SUITE  end
+if type(jit) ~= 'table' then  return SUITE  end

 SUITE:setEnvironment{
    format  = require 'pretty',
@ -58,6 +58,14 @@ SUITE:addTest('More binary', function ()
    assert_equal('binary', info.most_likely)
 end)

+SUITE:addTest('Classify an actual binary as binary', function ()
+    -- Uses an executable from the host as a real-world binary sample.
+    -- Open it in binary mode, and let assert report io.open's own message
+    -- when the file is missing instead of failing later on a nil index.
+    local f = assert(io.open('/usr/bin/ln', 'rb'))
+    local str = f:read '*all'
+    f:close()
+    local info  =  analyze_byte_string(str)
+    assert_equal('binary', info.most_likely)
+end)
+
 --------------------------------------------------------------------------------

 format_test {
@ -109,7 +117,7 @@ do
    }
 end

-SUITE:addTest('a_very_small_part_of_math', function ()
+SUITE:addTest('a very small amount of math ruins everything', function ()
    local p = ffi.new('char[1]')
          p[0] = 27
    local actual_result  =  format(p + 0, {})
--- a/test/test_function.lua
+++ b/test/test_function.lua
@ -463,7 +463,7 @@ if HAS_UNICODE_IDEN then
        name = 'Align functions with unicode-named parameters nicely',
        adv_getlocal = true,
        input  = loadstring 'return {\nfunction (ψ) return ψ end,\nfunction (b) return b end\n}' (),
-        expect = '{\n    function (ψ) return ψ end\n    function (b) return b end\n}',
+        expect = '{\n    function (ψ) return ψ end,\n    function (b) return b end\n}',
    }
 end

--- a/test/test_pretty.lua
+++ b/test/test_pretty.lua
@ -395,6 +395,66 @@ SUITE:addTest('UseCase: Can load function from file that is shortly deleted', fu
    assert(true)
 end)

+local BIG_EXAMPLE_TABLE = [[
+return {
+    [0]   = 21082, [1]  = 696,
+    [2]   = 463,   [3]  = 235,
+    [4]   = 315,   [5]  = 312,
+    [6]   = 204,   [7]  = 124,
+    [8]   = 692,   [9]  = 84,
+    [10]  = 248,   [11] = 148,
+    [12]  = 108,   [13] = 109,
+    [14]  = 1019,  [15] = 1211,
+    [16]  = 470,   [17] = 73,
+    [18]  = 121,   [19] = 36,
+    [20]  = 149,   [21] = 514,
+    [22]  = 38,    [23] = 45,
+    [24]  = 353,   [25] = 27,
+    [26]  = 27,    [27] = 51,
+    [28]  = 84,    [29] = 61,
+    [30]  = 29,    [31] = 448,
+    [32]  = 2064,  [33] = 65,
+    [34]  = 34,    [35] = 20,
+    [36]  = 859,   [37] = 239,
+    [38]  = 24,    [39] = 41,
+    [40]  = 297,   [41] = 95,
+    [42]  = 43,    [43] = 30,
+    [44]  = 202,   [45] = 123,
+    [46]  = 243,   [47] = 98,
+    [48]  = 207,   [49] = 484,
+    [50]  = 31,    [51] = 59,
+    [52]  = 51,    [53] = 118,
+    [54]  = 27,    [55] = 22,
+    [56]  = 227,   [57] = 168,
+    [58]  = 55,    [59] = 38,
+    [60]  = 74,    [61] = 106,
+    [62]  = 62,    [63] = 40,
+    [64]  = 170,   [65] = 857,
+    [66]  = 412,   [67] = 136,
+    [68]  = 737,   [69] = 238,
+    [70]  = 64,    [71] = 119,
+    [72]  = 2567,  [73] = 481,
+    [74]  = 50,    [75] = 55,
+    [76]  = 714,   [77] = 189,
+    [78]  = 61,    [79] = 55,
+    [80]  = 114,   [81] = 26,
+    [82]  = 69,    [83] = 150,
+    [84]  = 238,   [85] = 172,
+    [86]  = 65,    [87] = 81,
+    [88]  = 102,   [89] = 39,
+    [90]  = 30,    [91] = 154,
+    [92]  = 155,   [93] = 191,
+    [94]  = 75,    [95] = 185,
+    [96]  = 62,    [97] = 334,
+    [98]  = 119,   [99] = 217,
+    [100] = 261
+}]]
+
+SUITE:addTest('UseCase: Big Example Table', function ()
+    -- Round trip: load the literal, format the resulting table, and expect
+    -- the output (with 'return ' put back in front) to equal the literal.
+    local loaded_table = loadstring(BIG_EXAMPLE_TABLE)()
+    local rendered     = 'return '..format(loaded_table)
+    assert_equal(BIG_EXAMPLE_TABLE, rendered)
+end)
+
+
 --------------------------------------------------------------------------------

 return SUITE