Cdata is now being tested against again, and some unicode stuff has been implemented.
This commit is contained in:
parent
94dd6acb0c
commit
856d9df690
|
@ -1,10 +1,18 @@
|
|||
|
||||
--require 'fun' ()
|
||||
local utf8 = require 'utf8'
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
local ASCII_CHAR_PATTERN = '[\32-\126\009\010\013]'
|
||||
local UNICODE_CHAR_PATTERN = '[\01-\127\192-\255][\128-\191]*'
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
-- Unicode stuff
|
||||
|
||||
-- Returns a `gmatch` iterator over `str` that yields one substring per match
-- of UNICODE_CHAR_PATTERN, i.e. a lead byte together with any continuation
-- bytes that directly follow it.
--
-- No validation is performed: the number of continuation bytes is not checked
-- against the lead byte, and bytes that cannot start a match are skipped.
local function iterate_utf8_chars (str)
    -- TODO: Detect invalid codepoints.
    local char_pattern = UNICODE_CHAR_PATTERN
    return str:gmatch(char_pattern)
end
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
local function probability_of_ascii_string (str)
|
||||
assert(type(str) == 'string')
|
||||
|
@ -30,8 +38,8 @@ local function probability_of_utf8_string (str)
|
|||
|
||||
-- Find numbers of valid utf8 bytes
|
||||
local valid_bytes = 0
|
||||
for char, valid in utf8.iterate(str) do
|
||||
if valid then valid_bytes = valid_bytes + #char end
|
||||
for char in iterate_utf8_chars(str) do
|
||||
valid_bytes = valid_bytes + #char
|
||||
end
|
||||
|
||||
-- Calculate ratio of valid bytes to total number of bytes.
|
||||
|
@ -43,7 +51,23 @@ local function probability_of_utf16_string (str)
|
|||
end
|
||||
|
||||
-- Scores how much `str` looks like raw binary data.
--
-- Binary data is kinda weird. The working assumption is that NUL bytes are
-- especially common in it and that it disregards the boundaries of printable
-- text. Every byte outside the printable ASCII range 32..126 therefore counts
-- once, and every NUL byte (0x00) counts one and a half times.
--
-- @param str  byte string to analyse
-- @return     weighted count divided by #str; 0 for the empty string.
--             Because of the NUL weighting the result can exceed 1.
local function probability_of_binary_data (str)
    local total = #str
    -- An empty string carries no evidence; also avoids 0/0 (NaN).
    if total == 0 then return 0 end

    local nul_bytes, other_bytes = 0, 0
    for i = 1, total do
        local byte = str:byte(i)
        if byte == 0 then
            nul_bytes = nul_bytes + 1
        elseif byte < 32 or byte > 126 then
            other_bytes = other_bytes + 1
        end
    end

    return (nul_bytes * 1.5 + other_bytes) / total
end
|
||||
|
||||
local str_representations = {
|
||||
|
|
26
common.lua
26
common.lua
|
@ -18,9 +18,35 @@ local function enum (t)
|
|||
return e
|
||||
end
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
-- Unicode
|
||||
|
||||
-- Lua pattern matching one UTF-8 encoded character: a lead byte (any non-NUL
-- ASCII byte, or a byte in the range 192-255) followed by zero or more
-- continuation bytes (128-191). The number of continuation bytes is not
-- checked against the lead byte.
local UNICODE_CHAR_PATTERN = '[\01-\127\192-\255][\128-\191]*'
|
||||
|
||||
-- Set (keyed by the encoded two-byte string) of characters that
-- utf8_string_length does not count. The two loops below cover U+0300..U+036F,
-- the Combining Diacritical Marks block.
local UNICODE_ZERO_WIDTH_CHARACTERS = {}
|
||||
-- Lead byte \204 (0xCC) with continuation bytes 128..191: U+0300..U+033F.
for i = 128, 191 do UNICODE_ZERO_WIDTH_CHARACTERS['\204'..string.char(i)] = true end
|
||||
-- Lead byte \205 (0xCD) with continuation bytes 128..175: U+0340..U+036F.
for i = 128, 175 do UNICODE_ZERO_WIDTH_CHARACTERS['\205'..string.char(i)] = true end
|
||||
|
||||
|
||||
-- Produces an iterator that walks `str` one UNICODE_CHAR_PATTERN match at a
-- time: each step yields a lead byte plus the continuation bytes immediately
-- after it.
--
-- The input is not validated; bytes that cannot begin a match are skipped
-- over by `gmatch` rather than reported.
local function iterate_utf8_chars (str)
    -- TODO: Detect invalid codepoints.
    local char_pattern = UNICODE_CHAR_PATTERN
    return str:gmatch(char_pattern)
end
|
||||
|
||||
-- Counts the characters in `str` as produced by iterate_utf8_chars, leaving
-- out every character listed in UNICODE_ZERO_WIDTH_CHARACTERS.
--
-- @param str  string to measure
-- @return     number of counted characters
local function utf8_string_length (str)
    local visible_count = 0
    for glyph in iterate_utf8_chars(str) do
        local is_zero_width = UNICODE_ZERO_WIDTH_CHARACTERS[glyph]
        visible_count = visible_count + (is_zero_width and 0 or 1)
    end
    return visible_count
end
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
-- Public interface of this module.
return {
|
||||
-- Table classification names, passed through the local `enum` helper.
TABLE_TYPE = enum { 'EMPTY', 'SEQUENCE', 'STRING_MAP', 'PURE_MAP', 'MIXED', 'SET' },
|
||||
-- Display levels as ascending integers, from HIDE (1) to EXPAND (4).
DISPLAY = { HIDE = 1, SMALL = 2, INLINE = 3, EXPAND = 4 },
|
||||
-- Character count of a UTF-8 string that skips the listed zero-width characters.
utf8_string_length = utf8_string_length,
|
||||
}
|
||||
|
|
|
@ -60,6 +60,7 @@ simplest, and move towards abstraction.
|
|||
|
||||
local LIBRARY = require((... and select('1', ...):match('.+%.') or '')..'library') or {}
|
||||
local DISPLAY = assert(require((... and select('1', ...):match('.+%.') or '')..'common'), '[pretty]: Could not load vital library: common') . DISPLAY
|
||||
local utf8_string_length = assert(require((... and select('1', ...):match('.+%.') or '')..'common'), '[pretty]: Could not load vital library: common') . utf8_string_length
|
||||
|
||||
-- Constants
|
||||
|
||||
|
@ -129,7 +130,7 @@ local function get_line_index (str, line_nr)
|
|||
local index = 0
|
||||
for _ = 2, line_nr do
|
||||
index = str:find('\n', index, true)
|
||||
if not index then return #str end
|
||||
if not index then return utf8_string_length(str) end
|
||||
index = index + 1
|
||||
end
|
||||
return index
|
||||
|
@ -238,7 +239,7 @@ local function width_of_strings_in_l (l, start_i, end_i)
|
|||
-- FIXME: Copy of the one in pretty.lua
|
||||
local width = 0
|
||||
for i = start_i or 1, (end_i or #l) do
|
||||
width = width + #l[i]
|
||||
width = width + utf8_string_length(l[i])
|
||||
end
|
||||
return width
|
||||
end
|
||||
|
@ -319,7 +320,7 @@ return function (value, display, l, format_value)
|
|||
if display ~= DISPLAY.EXPAND then
|
||||
l[#l+1] = (function_body:sub(1,1) == '\n') and '' or ' '
|
||||
l[#l+1] = function_body
|
||||
l[#l+1] = { 'align', 'func_end', #function_body }
|
||||
l[#l+1] = { 'align', 'func_end', utf8_string_length(function_body) }
|
||||
l[#l+1] = (function_body:sub(-1) == '\n' or function_body == '') and '' or ' '
|
||||
return l 'end'
|
||||
end
|
||||
|
|
|
@ -201,6 +201,8 @@ end
|
|||
--------------------------------------------------------------------------------
|
||||
-- Formatting Util
|
||||
|
||||
local length_of_utf8_string = import 'common' . utf8_string_length
|
||||
|
||||
local function width_of_strings_in_l (l, start_i, stop_i)
|
||||
|
||||
-- Argument fixing and Error Checking
|
||||
|
@ -214,7 +216,7 @@ local function width_of_strings_in_l (l, start_i, stop_i)
|
|||
-- Do stuff
|
||||
local width = 0
|
||||
for i = start_i, stop_i do
|
||||
width = width + ((type(l[i]) ~= 'string') and 1 or #l[i])
|
||||
width = width + ((type(l[i]) ~= 'string') and 1 or length_of_utf8_string(l[i]))
|
||||
end
|
||||
return width
|
||||
end
|
||||
|
@ -417,7 +419,7 @@ local DISPLAY = import 'common' . DISPLAY
|
|||
|
||||
-- Appends one `key = value` entry of a string-keyed map to the output list.
--
-- Pushes the key, an `{ 'align', 'key', width }` marker, and the ' = '
-- separator onto `l`, then hands the value over to `format_value`.
-- The width is measured with length_of_utf8_string rather than `#key`, so a
-- multi-byte key is not measured by its byte count. Exactly one align marker
-- is emitted per key.
--
-- @param key           key text, already formatted as a string
-- @param value         value to format
-- @param display       display setting forwarded to `format_value`
-- @param l             output list being appended to
-- @param format_value  callback invoked as format_value(value, display, l)
-- @return              whatever `format_value` returns
local function format_key_and_value_string_map (key, value, display, l, format_value)
    l[#l+1] = key
    l[#l+1] = { 'align', 'key', length_of_utf8_string(key) }
    l[#l+1] = ' = '
    return format_value(value, display, l)
end
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
local SUITE = require 'TestSuite' 'cdata'
|
||||
|
||||
-- Only relevant in LUAJIT.
|
||||
if type(jit) ~= 'table' or true then return SUITE end
|
||||
if type(jit) ~= 'table' then return SUITE end
|
||||
|
||||
SUITE:setEnvironment{
|
||||
format = require 'pretty',
|
||||
|
@ -58,6 +58,14 @@ SUITE:addTest('More binary', function ()
|
|||
assert_equal('binary', info.most_likely)
|
||||
end)
|
||||
|
||||
-- Feeds the contents of a real executable to analyze_byte_string and expects
-- it to be classified as binary.
-- NOTE(review): the path is system specific; the test fails with the message
-- from io.open on machines that have no /usr/bin/ln.
SUITE:addTest('Classify an actual binary as binary', function ()
    -- 'rb' keeps the bytes untouched on platforms that translate line endings
    -- in text mode; assert surfaces the io.open error instead of a nil index.
    local f = assert(io.open('/usr/bin/ln', 'rb'))
    local str = f:read '*all'
    f:close()
    local info = analyze_byte_string(str)
    assert_equal('binary', info.most_likely)
end)
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
format_test {
|
||||
|
@ -109,7 +117,7 @@ do
|
|||
}
|
||||
end
|
||||
|
||||
SUITE:addTest('a_very_small_part_of_math', function ()
|
||||
SUITE:addTest('a very small amount of math ruins everything', function ()
|
||||
local p = ffi.new('char[1]')
|
||||
p[0] = 27
|
||||
local actual_result = format(p + 0, {})
|
||||
|
|
|
@ -463,7 +463,7 @@ if HAS_UNICODE_IDEN then
|
|||
name = 'Align functions with unicode-named parameters nicely',
|
||||
adv_getlocal = true,
|
||||
input = loadstring 'return {\nfunction (ψ) return ψ end,\nfunction (b) return b end\n}' (),
|
||||
expect = '{\n function (ψ) return ψ end\n function (b) return b end\n}',
|
||||
expect = '{\n function (ψ) return ψ end,\n function (b) return b end\n}',
|
||||
}
|
||||
end
|
||||
|
||||
|
|
|
@ -395,6 +395,66 @@ SUITE:addTest('UseCase: Can load function from file that is shortly deleted', fu
|
|||
assert(true)
|
||||
end)
|
||||
|
||||
local BIG_EXAMPLE_TABLE = [[
|
||||
return {
|
||||
[0] = 21082, [1] = 696,
|
||||
[2] = 463, [3] = 235,
|
||||
[4] = 315, [5] = 312,
|
||||
[6] = 204, [7] = 124,
|
||||
[8] = 692, [9] = 84,
|
||||
[10] = 248, [11] = 148,
|
||||
[12] = 108, [13] = 109,
|
||||
[14] = 1019, [15] = 1211,
|
||||
[16] = 470, [17] = 73,
|
||||
[18] = 121, [19] = 36,
|
||||
[20] = 149, [21] = 514,
|
||||
[22] = 38, [23] = 45,
|
||||
[24] = 353, [25] = 27,
|
||||
[26] = 27, [27] = 51,
|
||||
[28] = 84, [29] = 61,
|
||||
[30] = 29, [31] = 448,
|
||||
[32] = 2064, [33] = 65,
|
||||
[34] = 34, [35] = 20,
|
||||
[36] = 859, [37] = 239,
|
||||
[38] = 24, [39] = 41,
|
||||
[40] = 297, [41] = 95,
|
||||
[42] = 43, [43] = 30,
|
||||
[44] = 202, [45] = 123,
|
||||
[46] = 243, [47] = 98,
|
||||
[48] = 207, [49] = 484,
|
||||
[50] = 31, [51] = 59,
|
||||
[52] = 51, [53] = 118,
|
||||
[54] = 27, [55] = 22,
|
||||
[56] = 227, [57] = 168,
|
||||
[58] = 55, [59] = 38,
|
||||
[60] = 74, [61] = 106,
|
||||
[62] = 62, [63] = 40,
|
||||
[64] = 170, [65] = 857,
|
||||
[66] = 412, [67] = 136,
|
||||
[68] = 737, [69] = 238,
|
||||
[70] = 64, [71] = 119,
|
||||
[72] = 2567, [73] = 481,
|
||||
[74] = 50, [75] = 55,
|
||||
[76] = 714, [77] = 189,
|
||||
[78] = 61, [79] = 55,
|
||||
[80] = 114, [81] = 26,
|
||||
[82] = 69, [83] = 150,
|
||||
[84] = 238, [85] = 172,
|
||||
[86] = 65, [87] = 81,
|
||||
[88] = 102, [89] = 39,
|
||||
[90] = 30, [91] = 154,
|
||||
[92] = 155, [93] = 191,
|
||||
[94] = 75, [95] = 185,
|
||||
[96] = 62, [97] = 334,
|
||||
[98] = 119, [99] = 217,
|
||||
[100] = 261
|
||||
}]]
|
||||
|
||||
-- Round-trip check: load BIG_EXAMPLE_TABLE as a chunk, format the value it
-- returns, and expect the output (prefixed with 'return ') to equal the
-- original source text.
SUITE:addTest('UseCase: Big Example Table', function ()
    local loaded_value = loadstring(BIG_EXAMPLE_TABLE)()
    local formatted_source = 'return '..format(loaded_value)
    assert_equal(BIG_EXAMPLE_TABLE, formatted_source)
end)
|
||||
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
return SUITE
|
||||
|
|
Loading…
Reference in New Issue
Block a user