Add better string support.

2017-10-29 01:44:58 +02:00 · 2017-10-29 01:44:58 +02:00 · ffef3dd201
commit ffef3dd201
parent 61abaf22bd
1 changed file with 62 additions and 7 deletions
--- a/parser.py
+++ b/parser.py
@ -3,6 +3,13 @@
 import ply.lex as lex

 class LLVMLexer(object):
+    current_string = ""
+    line_begin = 0
+
+    states = [
+        ('string', 'exclusive')
+    ]
+
    reserved = {
        'if': 'IF',
        'then': 'THEN',
@ -57,6 +64,7 @@ class LLVMLexer(object):
    ]

    t_ignore = ' \t'
+    t_string_ignore = ''
    t_ASTERIX = r'\*'
    t_LPAREN = r'\('
    t_RPAREN = r'\)'
@ -81,20 +89,63 @@ class LLVMLexer(object):
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
+        self.line_begin = t.lexpos + len(t.value) - 1  # last '\n' of the run, not the first
+
+
+    def t_string_newline(self, t):
+        r'\n'
+        print("{}:{}: Newline is not allowed inside strings."
+              .format(t.lineno, t.lexpos - self.line_begin))
+        t.lexer.lineno += 1
+        self.line_begin = t.lexpos
+

    def t_INT(self, t):
        r'\d+'
        t.value = int(t.value)
        return t

+    def t_begin_string(self, t):
+        r'c\"'
+        t.lexer.begin('string')
+        self.current_string = ""

-    def t_STRING(self, t):
-        r'c"[^"]*"'
-        value = t.value[2:-1]
-        t.value = value
+    def t_string_doublequote(self, t):
+        r'\\"'
+        self.current_string += '"'
+
+
+    def t_string_backslash(self, t):
+        r'\\\\'
+        self.current_string += '\\'
+
+
+    def t_string_hex(self, t):
+        r'\\[0-9a-fA-F][0-9a-fA-F]'
+        code = int(t.value[1:], 16)  # skip the backslash; int() accepts either hex case
+        self.current_string += chr(code)
+
+
+    def t_string_singlebackslash(self, t):
+        r'\\'
+        print("{}:{}: Single backslash is not allowed inside strings."
+              .format(t.lineno, t.lexpos - self.line_begin))
+
+
+    def t_string_end(self, t):
+        r'"'
+        t.value = self.current_string
+        self.current_string = ""
+        t.type = "STRING"
+        t.lexer.begin('INITIAL')
        return t


+    def t_string_meat(self, t):
+        r'.'
+        self.current_string += t.value
+
+
    def t_ID(self, t):
        r'[a-zA-Z0-9_-]+'
        t.type = self.reserved.get(t.value, 'ID')
@ -112,13 +163,16 @@ class LLVMLexer(object):
        return t


-    def t_error(self, t):
-        print("{}:{}: Illegal character '{}'".format(t.lineno, t.lexpos, t.value[0]))
+    def t_ANY_error(self, t):
+        print("{}:{}: Illegal character '{}'"
+              .format(t.lineno, t.lexpos - self.line_begin, t.value[0]))
        t.lexer.skip(1)

+
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

+
    def test(self, data):
        self.lexer.input(data)
        for tok in self.lexer:
@ -128,6 +182,7 @@ class LLVMLexer(object):
 if __name__ == '__main__':
    m = LLVMLexer()
    m.build()
-    data = '''123 456 c"abc" def add sdiv ; some comment
+    data = r'''123 456 c"abc _\2c_ \" \\ and so on" def add sdiv ; some comment
+    qqq
    '''
    m.test(data)