From 6f7e767b68063e82975258809506d75ac097651e Mon Sep 17 00:00:00 2001
From: Jon Michael Aanes
Date: Fri, 21 Jul 2017 16:44:46 +0200
Subject: [PATCH] Many bad unicode sequences are now properly escaped

escape_string now escapes stray UTF-8 continuation bytes and start bytes
that are not followed by a continuation byte, including at the very start
and very end of the string, and leaves well-formed sequences untouched.

---
 pstring.lua           | 27 +++++++++++++++++++++++----
 test/test_pstring.lua | 20 +++++++++++++++++---
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/pstring.lua b/pstring.lua
index 13a66a6..0600c55 100644
--- a/pstring.lua
+++ b/pstring.lua
@@ -53,10 +53,26 @@ local function escape_string (str)
 	-- Error checking
 	assert(type(str) == 'string')
 
-	-- Do stuff
-	local l = {}
-	for i = 1, #str do l[#l+1] = CHAR_TO_STR_REPR[str:byte(i)] end
-	return table.concat(l, '')
+	-- First escape the easy ones.
+	str = str:gsub('.', function (char) return CHAR_TO_STR_REPR[char:byte()] end)
+	-- Escape malformed continuation bytes: a byte in \128-\191 that opens the
+	-- string or follows an ASCII byte. gsub matches never overlap and every
+	-- escape ends in an ASCII digit, so repeat until nothing is left to escape.
+	str = str:gsub('^[\128-\191]', function (b) return '\\' .. b:byte() end)
+	repeat
+		local count
+		str, count = str:gsub('([^\128-\255])([\128-\191])', function (a, b) return a .. '\\' .. b:byte() end)
+	until count == 0
+	-- Escape malformed start bytes: a byte in \192-\255 that closes the string
+	-- or is not followed by a continuation byte. \191 is itself a continuation
+	-- byte, so the range starts at \192.
+	str = str:gsub('[\192-\255]$', function (a) return '\\' .. a:byte() end)
+	repeat
+		local count
+		str, count = str:gsub('([\192-\255])([^\128-\191])', function (a, b) return '\\' .. a:byte() .. b end)
+	until count == 0
+	-- return
+	return str
 end
 
 local function smallest_secure_longform_string_level (str)
@@ -166,6 +182,9 @@ end
 return function (str, depth, l)
 	-- pretty.format_string
 
+	-- TODO: Prefer \ddd style escaping to shorter (\n, \t), when many of the
+	-- \ddd already exist in the text.
+
 	-- Error checking
 	assert( type(str) == 'string' )
 	assert(type(depth) == 'number' and type(l) == 'table')
diff --git a/test/test_pstring.lua b/test/test_pstring.lua
index 3c2b8d8..933fc3e 100644
--- a/test/test_pstring.lua
+++ b/test/test_pstring.lua
@@ -224,11 +224,25 @@ format_test {
 }
 
 format_test {
-	name = 'Malformed Unicode is escaped',
-	input = '\000\001\003\012\169\003\000\030',
-	expect = '\'\\000\\000\\001\\003\\012\\169\\003\\000\\030\'',
+	name = 'Single utf8 continuation byte is escaped',
+	input = 'abc\169def',
+	expect = '\'abc\\169def\'',
 }
 
+format_test {
+	name = 'Multiple utf8 continuation bytes are escaped',
+	input = 'abc\169\190\169\169def',
+	expect = '\'abc\\169\\190\\169\\169def\'',
+}
+
+format_test {
+	name = 'Single start byte utf8 chars is escaped',
+	input = 'abc\255def',
+	expect = '\'abc\\255def\'',
+}
+
+-- TODO: Add more malformed unicode tests: https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+
 --------------------------------------------------------------------------------
 
 return SUITE