"""Parser for LLVM--.""" import ply.lex as lex class LLVMLexer(object): current_string = "" line_begin = 0 states = [ ('string', 'exclusive') ] reserved = { 'if': 'IF', 'then': 'THEN', 'else': 'ELSE', 'while': 'WHILE', 'add': 'ADD', 'sub': 'SUB', 'mul': 'MUL', 'shl': 'SHL', 'lshr': 'LSHR', 'ashr': 'ASHR', 'and': 'AND', 'or': 'OR', 'xor': 'XOR', 'sdiv': 'SDIV', 'eq': 'EQ', 'ne': 'NE', 'slt': 'SLT', 'sle': 'SLE', 'sgt': 'SGT', 'sge': 'SGE', 'alloca': 'ALLOCA', 'load': 'LOAD', 'store': 'STORE', 'icmp': 'ICMP', 'call': 'CALL', 'bitcast': 'BITCAST', 'getelementptr': 'GETELEMENTPTR', 'zext': 'ZEXT', 'ptrtoint': 'PTRTOINT', 'ret': 'RET', 'br': 'BR', 'label': 'LABEL', 'define': 'DEFINE', 'null': 'NULL', 'global': 'GLOBAL', 'type': 'TYPE', 'to': 'TO', 'void': 'VOID', 'i1': 'I1', 'i8': 'I8', 'i32': 'I32', 'i64': 'I64', } tokens = [ 'INT', 'STRING', 'ASTERIX', 'LPAREN', 'RPAREN', 'LBRACK', 'RBRACK', 'LBRACE', 'RBRACE', 'ASSIGN', 'COLON', 'COMMA', 'PercentID', 'AtID', 'ID', 'COMMENT' ] t_ignore = ' \t' t_string_ignore = '' t_ASTERIX = r'\*' t_LPAREN = r'\(' t_RPAREN = r'\)' t_LBRACK = r'\[' t_RBRACK = r'\]' t_LBRACE = r'{' t_RBRACE = r'}' t_ASSIGN = r'=' t_COLON = r':' t_COMMA = r',' def __init__(self): self.tokens += self.reserved.values() def t_COMMENT(self, t): r'(;|declare|target).*' pass def t_newline(self, t): r'\n+' t.lexer.lineno += len(t.value) self.line_begin = t.lexpos def t_string_newline(self, t): r'\n' print("{}:{}: Newline is not allowed inside strings." .format(t.lineno, t.lexpos - self.line_begin)) t.lexer.lineno += 1 self.line_begin = t.lexpos def t_INT(self, t): r'\d+' t.value = int(t.value) return t def t_begin_string(self, t): r'c\"' t.lexer.begin('string') self.current_string = "" def t_string_doublequote(self, t): r'\\"' self.current_string += '"' def t_string_backslash(self, t): r'\\\\' self.current_string += '\\' def t_string_hex(self, t): r'\\[0-9a-f][0-9a-f]' code = int(t.value[1:], 16) self.current_string += chr(code) def t_string_singlebackslash(self, t): r'\\' print("{}:{}: Single backslash is not allowed inside strings." .format(t.lineno, t.lexpos - self.line_begin)) def t_string_end(self, t): r'"' t.value = self.current_string self.current_string = "" t.type = "STRING" t.lexer.begin('INITIAL') return t def t_string_meat(self, t): r'.' self.current_string += t.value def t_ID(self, t): r'[a-zA-Z0-9_-]+' t.type = self.reserved.get(t.value, 'ID') return t def t_PercentID(self, t): r'%[a-zA-Z0-9_-]+' t.value = t.value[1:] return t def t_AtID(self, t): r'@[a-zA-Z0-9_-]+' t.value = t.value[1:] return t def t_ANY_error(self, t): print("{}:{}: Illegal character '{}'" .format(t.lineno, t.lexpos - self.line_begin, t.value[0])) t.lexer.skip(1) def build(self, **kwargs): self.lexer = lex.lex(module=self, **kwargs) def test(self, data): self.lexer.input(data) for tok in self.lexer: print(tok) if __name__ == '__main__': m = LLVMLexer() m.build() data = r'''123 456 c"abc _\2c_ \" \\ and so on" def add sdiv ; some comment qqq ''' m.test(data)