Many bad unicode sequences are now properly escaped

2017-07-21 16:44:46 +02:00 · 2017-07-21 16:44:46 +02:00 · 6f7e767b68
commit 6f7e767b68
parent bfbfe4de56
2 changed files with 34 additions and 7 deletions
--- a/pstring.lua
+++ b/pstring.lua
@ -53,10 +53,20 @@ local function escape_string (str)
    -- Error checking
    assert(type(str) == 'string')
-    -- Do stuff
+    -- First escape the easy ones.
-    local l = {}
+	local str  =  str:gsub('.', function (char)  return CHAR_TO_STR_REPR[char:byte()]  end)
-    for i = 1, #str do  l[#l+1] = CHAR_TO_STR_REPR[str:byte(i)]  end
+	-- Escape malformed continuation characters
-    return table.concat(l, '')
+	repeat
 		local count
 		-- A continuation byte (\128-\191) that directly follows a non-high byte
 		-- has no start byte in front of it, so rewrite it as a decimal \ddd
 		-- escape. gsub matches do not overlap and each escape ends in an ASCII
 		-- digit, so a run of stray continuation bytes is consumed one byte per
 		-- pass; loop until a pass makes no replacement.
 		str, count = str:gsub('([^\128-\255])([\128-\191])', function(a, b)  return a..'\\' .. b:byte()  end)
 	until count == 0
 	-- Escape malformed start characters
 	repeat
 		local count
 		-- A start byte (\192-\255) that is not followed by a continuation byte
 		-- (\128-\191) is rewritten as a decimal \ddd escape. The class must
 		-- begin at \192: \191 is itself a continuation byte, and including it
 		-- would escape valid sequences that end in \191 (e.g. '\194\191').
 		-- Loop until a pass makes no replacement, because escaping one byte can
 		-- leave the start byte before it without a continuation.
 		str, count = str:gsub('([\192-\255])([^\128-\191])', function(a, b)  return '\\'..a:byte() .. b  end)
 	until count == 0
 	-- return
 	return str
 end
 local function smallest_secure_longform_string_level (str)
@ -166,6 +176,9 @@ end
 return function (str, depth, l)
 	-- pretty.format_string
 	-- TODO: Prefer \ddd style escaping to shorter (\n, \t), when many of the
 	-- \ddd already exist in the text.
    -- Error checking
    assert( type(str) == 'string' )
    assert(type(depth) == 'number' and type(l) == 'table')
--- a/test/test_pstring.lua
+++ b/test/test_pstring.lua
@ -224,11 +224,25 @@ format_test {
 }
 format_test {
-    name   = 'Malformed Unicode is escaped',
+    name   = 'Single utf8 continuation byte is escaped',
-    input  = '\000\001\003\012\169\003\000\030',
+    input  = 'abc\169def',
-    expect = '\'\\000\\000\\001\\003\\012\\169\\003\\000\\030\'',
+    expect = '\'abc\\169def\'',
 }
 format_test {
    name   = 'Multiple utf8 continuation bytes are escaped',
    input  = 'abc\169\190\169\169def',
    expect = '\'abc\\169\\190\\169\\169def\'',
 }
 format_test {
    name   = 'Single start byte utf8 chars is escaped',
    input  = 'abc\255def',
    expect = '\'abc\\255def\'',
 }
 -- TODO: Add more malformed unicode tests: https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
 --------------------------------------------------------------------------------
 return SUITE