From ffef3dd20194c06406ff0fd565113018a7596576 Mon Sep 17 00:00:00 2001 From: cfreksen Date: Sun, 29 Oct 2017 01:44:58 +0200 Subject: [PATCH] Add better string support. --- parser.py | 69 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 7 deletions(-) diff --git a/parser.py b/parser.py index fead75d..1e6b5f1 100644 --- a/parser.py +++ b/parser.py @@ -3,6 +3,13 @@ import ply.lex as lex class LLVMLexer(object): + current_string = "" + line_begin = 0 + + states = [ + ('string', 'exclusive') + ] + reserved = { 'if': 'IF', 'then': 'THEN', @@ -57,6 +64,7 @@ class LLVMLexer(object): ] t_ignore = ' \t' + t_string_ignore = '' t_ASTERIX = r'\*' t_LPAREN = r'\(' t_RPAREN = r'\)' @@ -81,20 +89,63 @@ class LLVMLexer(object): def t_newline(self, t): r'\n+' t.lexer.lineno += len(t.value) + self.line_begin = t.lexpos + + + def t_string_newline(self, t): + r'\n' + print("{}:{}: Newline is not allowed inside strings." + .format(t.lineno, t.lexpos - self.line_begin)) + t.lexer.lineno += 1 + self.line_begin = t.lexpos + def t_INT(self, t): r'\d+' t.value = int(t.value) return t + def t_begin_string(self, t): + r'c\"' + t.lexer.begin('string') + self.current_string = "" - def t_STRING(self, t): - r'c"[^"]*"' - value = t.value[2:-1] - t.value = value + def t_string_doublequote(self, t): + r'\\"' + self.current_string += '"' + + + def t_string_backslash(self, t): + r'\\\\' + self.current_string += '\\' + + + def t_string_hex(self, t): + r'\\[0-9a-f][0-9a-f]' + code = int(t.value[1:], 16) + self.current_string += chr(code) + + + def t_string_singlebackslash(self, t): + r'\\' + print("{}:{}: Single backslash is not allowed inside strings." + .format(t.lineno, t.lexpos - self.line_begin)) + + + def t_string_end(self, t): + r'"' + t.value = self.current_string + self.current_string = "" + t.type = "STRING" + t.lexer.begin('INITIAL') return t + def t_string_meat(self, t): + r'.' + self.current_string += t.value + + def t_ID(self, t): r'[a-zA-Z0-9_-]+' t.type = self.reserved.get(t.value, 'ID') @@ -112,13 +163,16 @@ class LLVMLexer(object): return t - def t_error(self, t): - print("{}:{}: Illegal character '{}'".format(t.lineno, t.lexpos, t.value[0])) + def t_ANY_error(self, t): + print("{}:{}: Illegal character '{}'" + .format(t.lineno, t.lexpos - self.line_begin, t.value[0])) t.lexer.skip(1) + def build(self, **kwargs): self.lexer = lex.lex(module=self, **kwargs) + def test(self, data): self.lexer.input(data) for tok in self.lexer: @@ -128,6 +182,7 @@ class LLVMLexer(object): if __name__ == '__main__': m = LLVMLexer() m.build() - data = '''123 456 c"abc" def add sdiv ; some comment + data = r'''123 456 c"abc _\2c_ \" \\ and so on" def add sdiv ; some comment + qqq ''' m.test(data)