Add better string support.

This commit is contained in:
cfreksen 2017-10-29 01:44:58 +02:00
parent 61abaf22bd
commit ffef3dd201
No known key found for this signature in database
GPG Key ID: EAC13EE101008978

View File

@@ -3,6 +3,13 @@
import ply.lex as lex import ply.lex as lex
class LLVMLexer(object): class LLVMLexer(object):
current_string = ""
line_begin = 0
states = [
('string', 'exclusive')
]
reserved = { reserved = {
'if': 'IF', 'if': 'IF',
'then': 'THEN', 'then': 'THEN',
@@ -57,6 +64,7 @@ class LLVMLexer(object):
] ]
t_ignore = ' \t' t_ignore = ' \t'
t_string_ignore = ''
t_ASTERIX = r'\*' t_ASTERIX = r'\*'
t_LPAREN = r'\(' t_LPAREN = r'\('
t_RPAREN = r'\)' t_RPAREN = r'\)'
@@ -81,20 +89,63 @@ class LLVMLexer(object):
def t_newline(self, t): def t_newline(self, t):
r'\n+' r'\n+'
t.lexer.lineno += len(t.value) t.lexer.lineno += len(t.value)
self.line_begin = t.lexpos
def t_string_newline(self, t):
r'\n'
print("{}:{}: Newline is not allowed inside strings."
.format(t.lineno, t.lexpos - self.line_begin))
t.lexer.lineno += 1
self.line_begin = t.lexpos
def t_INT(self, t): def t_INT(self, t):
r'\d+' r'\d+'
t.value = int(t.value) t.value = int(t.value)
return t return t
def t_begin_string(self, t):
r'c\"'
t.lexer.begin('string')
self.current_string = ""
def t_STRING(self, t): def t_string_doublequote(self, t):
r'c"[^"]*"' r'\\"'
value = t.value[2:-1] self.current_string += '"'
t.value = value
def t_string_backslash(self, t):
r'\\\\'
self.current_string += '\\'
def t_string_hex(self, t):
r'\\[0-9a-f][0-9a-f]'
code = int(t.value[1:], 16)
self.current_string += chr(code)
def t_string_singlebackslash(self, t):
r'\\'
print("{}:{}: Single backslash is not allowed inside strings."
.format(t.lineno, t.lexpos - self.line_begin))
def t_string_end(self, t):
r'"'
t.value = self.current_string
self.current_string = ""
t.type = "STRING"
t.lexer.begin('INITIAL')
return t return t
def t_string_meat(self, t):
r'.'
self.current_string += t.value
def t_ID(self, t): def t_ID(self, t):
r'[a-zA-Z0-9_-]+' r'[a-zA-Z0-9_-]+'
t.type = self.reserved.get(t.value, 'ID') t.type = self.reserved.get(t.value, 'ID')
@@ -112,13 +163,16 @@ class LLVMLexer(object):
return t return t
def t_error(self, t): def t_ANY_error(self, t):
print("{}:{}: Illegal character '{}'".format(t.lineno, t.lexpos, t.value[0])) print("{}:{}: Illegal character '{}'"
.format(t.lineno, t.lexpos - self.line_begin, t.value[0]))
t.lexer.skip(1) t.lexer.skip(1)
def build(self, **kwargs): def build(self, **kwargs):
self.lexer = lex.lex(module=self, **kwargs) self.lexer = lex.lex(module=self, **kwargs)
def test(self, data): def test(self, data):
self.lexer.input(data) self.lexer.input(data)
for tok in self.lexer: for tok in self.lexer:
@@ -128,6 +182,7 @@ class LLVMLexer(object):
if __name__ == '__main__': if __name__ == '__main__':
m = LLVMLexer() m = LLVMLexer()
m.build() m.build()
data = '''123 456 c"abc" def add sdiv ; some comment data = r'''123 456 c"abc _\2c_ \" \\ and so on" def add sdiv ; some comment
qqq
''' '''
m.test(data) m.test(data)