1
0

The cdata test suite is being run again, and some basic Unicode (UTF-8) handling has been implemented.

This commit is contained in:
Jon Michael Aanes 2017-10-22 14:26:19 +02:00
parent 94dd6acb0c
commit 856d9df690
7 changed files with 134 additions and 13 deletions

View File

@ -1,10 +1,18 @@
--require 'fun' ()
local utf8 = require 'utf8'
--------------------------------------------------------------------------------
-- Matches a single byte that is printable ASCII (32-126), or one of tab (9),
-- line feed (10) or carriage return (13).
local ASCII_CHAR_PATTERN = '[\32-\126\009\010\013]'
-- Matches one UTF-8-shaped byte sequence: a lead byte in 1-127 or 192-255,
-- followed by any number of continuation bytes in 128-191. The NUL byte (0)
-- is not matched, and neither the sequence length nor the resulting codepoint
-- is validated.
local UNICODE_CHAR_PATTERN = '[\01-\127\192-\255][\128-\191]*'
--------------------------------------------------------------------------------
-- Unicode stuff
local function iterate_utf8_chars (str)
  -- Returns an iterator yielding, in order, every substring of `str` that
  -- matches UNICODE_CHAR_PATTERN (one lead byte plus its continuation bytes).
  -- TODO: Detect invalid codepoints.
  local next_char = str:gmatch(UNICODE_CHAR_PATTERN)
  return next_char
end
--------------------------------------------------------------------------------
local function probability_of_ascii_string (str)
assert(type(str) == 'string')
@ -30,8 +38,8 @@ local function probability_of_utf8_string (str)
-- Find numbers of valid utf8 bytes
local valid_bytes = 0
for char, valid in utf8.iterate(str) do
if valid then valid_bytes = valid_bytes + #char end
for char in iterate_utf8_chars(str) do
valid_bytes = valid_bytes + #char
end
-- Calculate ratio of valid bytes to total number of bytes.
@ -43,7 +51,23 @@ local function probability_of_utf16_string (str)
end
-- Scores how much `str` looks like raw binary data.
--
-- Binary data is kinda weird. One assumption we can make is that the byte
-- values 0x00 and 0xFF will be popular, and that the rest will be almost
-- equally distributed. It will also disregard most boundaries between
-- encodings.
--
-- The score is the number of bytes outside printable ASCII (32-126), with
-- NUL bytes weighted 1.5, divided by the length of the string. Because of
-- that weighting the result can exceed 1 for NUL-heavy input.
--
-- @param str  string to inspect.
-- @return     number >= 0; 0 for the empty string.
local function probability_of_binary_data (str)
  assert(type(str) == 'string')

  -- An empty string carries no evidence either way; returning early also
  -- avoids the 0/0 (NaN) the division below would otherwise produce.
  local length = #str
  if length == 0 then return 0 end

  -- Histogram of byte values, indexed 0..255.
  local bytes = {}
  for i = 0, 255 do bytes[i] = 0 end
  for i = 1, length do
    local byte = str:byte(i)
    bytes[byte] = bytes[byte] + 1
  end

  -- NUL bytes count extra; printable ASCII does not count at all.
  bytes[0] = bytes[0] * 1.5
  for i = 32, 126 do bytes[i] = 0 end

  -- Sum over the explicit 0..255 range instead of relying on `#bytes`, whose
  -- meaning is easy to misread for a table that also has an index 0.
  local bytes_outside_ascii = 0
  for i = 0, 255 do bytes_outside_ascii = bytes_outside_ascii + bytes[i] end
  return bytes_outside_ascii/length
end
local str_representations = {

View File

@ -18,9 +18,35 @@ local function enum (t)
return e
end
--------------------------------------------------------------------------------
-- Unicode
-- One UTF-8-shaped byte sequence: a lead byte (1-127 or 192-255) followed by
-- any number of continuation bytes (128-191).
local UNICODE_CHAR_PATTERN = '[\01-\127\192-\255][\128-\191]*'

-- Set of two-byte sequences treated as taking up no width. The entries are the
-- UTF-8 encodings of U+0300 through U+036F (lead byte 204 with trail bytes
-- 128-191, and lead byte 205 with trail bytes 128-175).
local UNICODE_ZERO_WIDTH_CHARACTERS = {}
for _, range in ipairs { { '\204', 128, 191 }, { '\205', 128, 175 } } do
  local lead, first_trail, last_trail = range[1], range[2], range[3]
  for trail = first_trail, last_trail do
    UNICODE_ZERO_WIDTH_CHARACTERS[lead..string.char(trail)] = true
  end
end
local function iterate_utf8_chars (str)
  -- Returns an iterator yielding, in order, every substring of `str` that
  -- matches UNICODE_CHAR_PATTERN (one lead byte plus its continuation bytes).
  -- TODO: Detect invalid codepoints.
  local next_char = str:gmatch(UNICODE_CHAR_PATTERN)
  return next_char
end
-- Counts the characters produced by iterate_utf8_chars(str), leaving out the
-- ones listed in UNICODE_ZERO_WIDTH_CHARACTERS.
local function utf8_string_length (str)
  local total, zero_width = 0, 0
  for char in iterate_utf8_chars(str) do
    total = total + 1
    if UNICODE_ZERO_WIDTH_CHARACTERS[char] then
      zero_width = zero_width + 1
    end
  end
  return total - zero_width
end
--------------------------------------------------------------------------------
-- Public interface of this module.
local exports = {}

-- Enumeration of table classifications, built with the local `enum` helper.
exports.TABLE_TYPE = enum { 'EMPTY', 'SEQUENCE', 'STRING_MAP', 'PURE_MAP', 'MIXED', 'SET' }

-- Display levels, ordered from HIDE (1) to EXPAND (4).
exports.DISPLAY = { HIDE = 1, SMALL = 2, INLINE = 3, EXPAND = 4 }

-- Character count of a string, skipping zero-width characters.
exports.utf8_string_length = utf8_string_length

return exports

View File

@ -60,6 +60,7 @@ simplest, and move towards abstraction.
local LIBRARY = require((... and select('1', ...):match('.+%.') or '')..'library') or {}
local DISPLAY = assert(require((... and select('1', ...):match('.+%.') or '')..'common'), '[pretty]: Could not load vital library: common') . DISPLAY
local utf8_string_length = assert(require((... and select('1', ...):match('.+%.') or '')..'common'), '[pretty]: Could not load vital library: common') . utf8_string_length
-- Constants
@ -129,7 +130,7 @@ local function get_line_index (str, line_nr)
local index = 0
for _ = 2, line_nr do
index = str:find('\n', index, true)
if not index then return #str end
if not index then return utf8_string_length(str) end
index = index + 1
end
return index
@ -238,7 +239,7 @@ local function width_of_strings_in_l (l, start_i, end_i)
-- FIXME: Copy of the one in pretty.lua
local width = 0
for i = start_i or 1, (end_i or #l) do
width = width + #l[i]
width = width + utf8_string_length(l[i])
end
return width
end
@ -319,7 +320,7 @@ return function (value, display, l, format_value)
if display ~= DISPLAY.EXPAND then
l[#l+1] = (function_body:sub(1,1) == '\n') and '' or ' '
l[#l+1] = function_body
l[#l+1] = { 'align', 'func_end', #function_body }
l[#l+1] = { 'align', 'func_end', utf8_string_length(function_body) }
l[#l+1] = (function_body:sub(-1) == '\n' or function_body == '') and '' or ' '
return l 'end'
end

View File

@ -201,6 +201,8 @@ end
--------------------------------------------------------------------------------
-- Formatting Util
local length_of_utf8_string = import 'common' . utf8_string_length
local function width_of_strings_in_l (l, start_i, stop_i)
-- Argument fixing and Error Checking
@ -214,7 +216,7 @@ local function width_of_strings_in_l (l, start_i, stop_i)
-- Do stuff
local width = 0
for i = start_i, stop_i do
width = width + ((type(l[i]) ~= 'string') and 1 or #l[i])
width = width + ((type(l[i]) ~= 'string') and 1 or length_of_utf8_string(l[i]))
end
return width
end
@ -417,7 +419,7 @@ local DISPLAY = import 'common' . DISPLAY
-- Appends `key = <value>` to the output list `l`.
--
-- Pushes the key, then an 'align' marker in the 'key' group carrying the
-- key's width, then ' = ', and finally hands the value over to
-- `format_value`. The width is measured with length_of_utf8_string rather
-- than `#key`, so multi-byte keys are counted by character and not by byte.
-- Only this one align marker is emitted per key; the byte-length marker that
-- preceded it has been dropped.
--
-- @param key           string already formatted for output.
-- @param value         value to format after the ' = '.
-- @param display       passed through unchanged to format_value.
-- @param l             output list being appended to.
-- @param format_value  function(value, display, l) formatting the value.
-- @return              whatever format_value returns.
local function format_key_and_value_string_map (key, value, display, l, format_value)
  l[#l+1] = key
  l[#l+1] = { 'align', 'key', length_of_utf8_string(key) }
  l[#l+1] = ' = '
  return format_value(value, display, l)
end

View File

@ -2,7 +2,7 @@
local SUITE = require 'TestSuite' 'cdata'
-- Only relevant in LUAJIT: when the global `jit` table is absent, hand back
-- the suite before any tests are registered. The guard no longer carries an
-- `or true`, which made it return unconditionally and left every test below
-- unregistered.
if type(jit) ~= 'table' then return SUITE end
SUITE:setEnvironment{
format = require 'pretty',
@ -58,6 +58,14 @@ SUITE:addTest('More binary', function ()
assert_equal('binary', info.most_likely)
end)
-- Feeds a real executable from the host system to analyze_byte_string and
-- expects it to be classified as 'binary'.
SUITE:addTest('Classify an actual binary as binary', function ()
  local path = '/usr/bin/ln'
  -- Open in binary mode so no newline translation is applied, and let assert
  -- surface io.open's own error message if the file cannot be opened instead
  -- of failing later with "attempt to index a nil value".
  local f = assert(io.open(path, 'rb'))
  local str = f:read '*all'
  f:close()
  local info = analyze_byte_string(str)
  assert_equal('binary', info.most_likely)
end)
--------------------------------------------------------------------------------
format_test {
@ -109,7 +117,7 @@ do
}
end
SUITE:addTest('a_very_small_part_of_math', function ()
SUITE:addTest('a very small amount of math ruins everything', function ()
local p = ffi.new('char[1]')
p[0] = 27
local actual_result = format(p + 0, {})

View File

@ -463,7 +463,7 @@ if HAS_UNICODE_IDEN then
name = 'Align functions with unicode-named parameters nicely',
adv_getlocal = true,
input = loadstring 'return {\nfunction (ψ) return ψ end,\nfunction (b) return b end\n}' (),
expect = '{\n function (ψ) return ψ end\n function (b) return b end\n}',
expect = '{\n function (ψ) return ψ end,\n function (b) return b end\n}',
}
end

View File

@ -395,6 +395,66 @@ SUITE:addTest('UseCase: Can load function from file that is shortly deleted', fu
assert(true)
end)
-- Lua source text for a table with integer keys 0 through 100 and integer
-- values. The 'UseCase: Big Example Table' test loads this chunk and expects
-- 'return '..format(...) of the resulting table to equal this exact text, so
-- every character of the literal is significant.
local BIG_EXAMPLE_TABLE = [[
return {
[0] = 21082, [1] = 696,
[2] = 463, [3] = 235,
[4] = 315, [5] = 312,
[6] = 204, [7] = 124,
[8] = 692, [9] = 84,
[10] = 248, [11] = 148,
[12] = 108, [13] = 109,
[14] = 1019, [15] = 1211,
[16] = 470, [17] = 73,
[18] = 121, [19] = 36,
[20] = 149, [21] = 514,
[22] = 38, [23] = 45,
[24] = 353, [25] = 27,
[26] = 27, [27] = 51,
[28] = 84, [29] = 61,
[30] = 29, [31] = 448,
[32] = 2064, [33] = 65,
[34] = 34, [35] = 20,
[36] = 859, [37] = 239,
[38] = 24, [39] = 41,
[40] = 297, [41] = 95,
[42] = 43, [43] = 30,
[44] = 202, [45] = 123,
[46] = 243, [47] = 98,
[48] = 207, [49] = 484,
[50] = 31, [51] = 59,
[52] = 51, [53] = 118,
[54] = 27, [55] = 22,
[56] = 227, [57] = 168,
[58] = 55, [59] = 38,
[60] = 74, [61] = 106,
[62] = 62, [63] = 40,
[64] = 170, [65] = 857,
[66] = 412, [67] = 136,
[68] = 737, [69] = 238,
[70] = 64, [71] = 119,
[72] = 2567, [73] = 481,
[74] = 50, [75] = 55,
[76] = 714, [77] = 189,
[78] = 61, [79] = 55,
[80] = 114, [81] = 26,
[82] = 69, [83] = 150,
[84] = 238, [85] = 172,
[86] = 65, [87] = 81,
[88] = 102, [89] = 39,
[90] = 30, [91] = 154,
[92] = 155, [93] = 191,
[94] = 75, [95] = 185,
[96] = 62, [97] = 334,
[98] = 119, [99] = 217,
[100] = 261
}]]
-- Round-trip check: loading BIG_EXAMPLE_TABLE and formatting the result must
-- reproduce the original source text once 'return ' is put back in front.
SUITE:addTest('UseCase: Big Example Table', function ()
  local load_example = loadstring(BIG_EXAMPLE_TABLE)
  local example_table = load_example()
  local formatted = 'return '..format(example_table)
  assert_equal(BIG_EXAMPLE_TABLE, formatted)
end)
--------------------------------------------------------------------------------
return SUITE