From 6f7e767b68063e82975258809506d75ac097651e Mon Sep 17 00:00:00 2001
From: Jon Michael Aanes <jonjmaa@gmail.com>
Date: Fri, 21 Jul 2017 16:44:46 +0200
Subject: [PATCH] Many bad unicode sequences are now properly escaped

---
 pstring.lua           | 21 +++++++++++++++++----
 test/test_pstring.lua | 20 +++++++++++++++++---
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/pstring.lua b/pstring.lua
index 13a66a6..0600c55 100644
--- a/pstring.lua
+++ b/pstring.lua
@@ -53,10 +53,20 @@ local function escape_string (str)
     -- Error checking
     assert(type(str) == 'string')
 
-    -- Do stuff
-    local l = {}
-    for i = 1, #str do  l[#l+1] = CHAR_TO_STR_REPR[str:byte(i)]  end
-    return table.concat(l, '')
+    -- First escape the easy ones: every byte is looked up in CHAR_TO_STR_REPR.
+	str  =  str:gsub('.', function (char)  return CHAR_TO_STR_REPR[char:byte()]  end)
+	-- Escape continuation bytes (128-191) that directly follow a non-high byte; repeated because each inserted \ddd ends in a digit that exposes the next one.
+	repeat
+		local count
+		str, count = str:gsub('([^\128-\255])([\128-\191])', function(a, b)  return a..'\\' .. b:byte()  end)
+	until count == 0
+	-- Escape start bytes (192-255; 191 is itself a continuation byte) that are not followed by a continuation byte.
+	repeat
+		local count
+		str, count = str:gsub('([\192-\255])([^\128-\191])', function(a, b)  return '\\'..a:byte() .. b  end)
+	until count == 0
+	-- NOTE(review): a continuation byte at the very start of str, or a start byte at the very end, has no neighbour to capture and is not touched by either loop -- confirm.
+	return str
 end
 
 local function smallest_secure_longform_string_level (str)
@@ -166,6 +176,9 @@ end
 return function (str, depth, l)
 	-- pretty.format_string
 
+	-- TODO: Prefer \ddd style escaping to shorter (\n, \t), when many of the
+	-- \ddd already exist in the text.
+
     -- Error checking
     assert( type(str) == 'string' )
     assert(type(depth) == 'number' and type(l) == 'table')
diff --git a/test/test_pstring.lua b/test/test_pstring.lua
index 3c2b8d8..933fc3e 100644
--- a/test/test_pstring.lua
+++ b/test/test_pstring.lua
@@ -224,11 +224,25 @@ format_test {
 }
 
 format_test {
-    name   = 'Malformed Unicode is escaped',
-    input  = '\000\001\003\012\169\003\000\030',
-    expect = '\'\\000\\000\\001\\003\\012\\169\\003\\000\\030\'',
+    name   = 'Single utf8 continuation byte is escaped',
+    input  = 'abc\169def',
+    expect = '\'abc\\169def\'',
 }
 
+format_test {
+    name   = 'Multiple utf8 continuation bytes are escaped',
+    input  = 'abc\169\190\169\169def',
+    expect = '\'abc\\169\\190\\169\\169def\'',
+}
+
+format_test {
+    name   = 'Single start byte utf8 chars is escaped',
+    input  = 'abc\255def',
+    expect = '\'abc\\255def\'',
+}
+
+-- TODO: Add more malformed unicode tests: https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+
 --------------------------------------------------------------------------------
 
 return SUITE