"""Parser for LLVM--."""

import ply.lex as lex
import ply.yacc as yacc

import ll


class LLVMParser(object):
    """PLY lexer and parser that turns LLVM-- text into ``ll`` AST nodes.

    Usage: create an instance, call :meth:`build` once, then call
    :meth:`parse` with the program text.

    NOTE: PLY reads the docstring of every ``t_*`` method as the token's
    regular expression and the docstring of every ``p_*`` method as its
    grammar production.  Those docstrings are therefore code; explanatory
    text for such methods lives in ``#`` comments.  PLY also uses the
    definition order of these methods (regex priority, start symbol), so
    do not reorder them casually.
    """

    # ------------------------------------------------------------------
    # Lexer
    # ------------------------------------------------------------------

    # Accumulates the decoded contents of the string literal being lexed.
    current_string = ""
    # Offset (lexpos) of the newline that precedes the current line.  The
    # column of a token is ``t.lexpos - line_begin``; starting at -1 makes
    # columns on the first line 1-based, like on every later line.
    line_begin = -1

    # The 'string' state is exclusive: while inside c"..." only the
    # t_string_* rules (and t_ANY_error) apply.
    states = [
        ('string', 'exclusive')
    ]

    # Keywords: lexed by t_ID and then re-tagged with these token types.
    reserved = {
        'add': 'ADD',
        'sub': 'SUB',
        'mul': 'MUL',
        'shl': 'SHL',
        'lshr': 'LSHR',
        'ashr': 'ASHR',
        'and': 'AND',
        'or': 'OR',
        'xor': 'XOR',
        'sdiv': 'SDIV',
        'eq': 'EQ',
        'ne': 'NE',
        'slt': 'SLT',
        'sle': 'SLE',
        'sgt': 'SGT',
        'sge': 'SGE',
        'alloca': 'ALLOCA',
        'load': 'LOAD',
        'store': 'STORE',
        'icmp': 'ICMP',
        'call': 'CALL',
        'bitcast': 'BITCAST',
        'getelementptr': 'GETELEMENTPTR',
        'zext': 'ZEXT',
        'ptrtoint': 'PTRTOINT',
        'ret': 'RET',
        'br': 'BR',
        'label': 'LABEL',
        'define': 'DEFINE',
        'null': 'NULL',
        'global': 'GLOBAL',
        'type': 'TYPE',
        'to': 'TO',
        'void': 'VOID',
        'i1': 'I1',
        'i8': 'I8',
        'i32': 'I32',
        'i64': 'I64',
    }

    # Non-keyword tokens.  __init__ combines these with the keyword token
    # types on a per-instance copy; this class-level list is never mutated.
    tokens = [
        'INT', 'STRING',
        'ASTERIX',
        'LPAREN', 'RPAREN', 'LBRACK', 'RBRACK', 'LBRACE', 'RBRACE',
        'ASSIGN', 'COLON', 'COMMA',
        'PercentID', 'AtID', 'ID'
    ]

    t_ignore = ' \t'
    t_string_ignore = ''

    t_ASTERIX = r'\*'
    t_LPAREN = r'\('
    t_RPAREN = r'\)'
    t_LBRACK = r'\['
    t_RBRACK = r'\]'
    t_LBRACE = r'{'
    t_RBRACE = r'}'
    t_ASSIGN = r'='
    t_COLON = r':'
    t_COMMA = r','

    def t_COMMENT(self, t):
        r'(;|declare|target).*'
        # Discards the rest of the line: ';' comments as well as lines
        # starting with 'declare' or 'target', which the grammar ignores.
        pass

    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        # Anchor columns at the LAST newline of the run, so that blank
        # lines do not inflate the column of the next reported error.
        self.line_begin = t.lexpos + len(t.value) - 1

    def t_string_newline(self, t):
        r'\n'
        # A raw newline inside c"..." is reported but lexing continues in
        # the string state.
        print("{}:{}: Newline is not allowed inside strings."
              .format(t.lineno, t.lexpos - self.line_begin))
        t.lexer.lineno += 1
        self.line_begin = t.lexpos

    def t_INT(self, t):
        r'\d+'
        t.value = int(t.value)
        return t

    def t_begin_string(self, t):
        r'c\"'
        # c" opens a string literal; switch to the exclusive string state.
        t.lexer.begin('string')
        self.current_string = ""

    def t_string_doublequote(self, t):
        r'\\"'
        self.current_string += '"'

    def t_string_backslash(self, t):
        r'\\\\'
        self.current_string += '\\'

    def t_string_hex(self, t):
        r'\\[0-9a-fA-F][0-9a-fA-F]'
        # Two-digit hex escape.  Upper-case digits are accepted as well
        # because LLVM writes escapes such as \0A in upper case.
        code = int(t.value[1:], 16)
        self.current_string += chr(code)

    def t_string_singlebackslash(self, t):
        r'\\'
        # Reached only when none of the escape rules above matched; the
        # backslash is dropped after reporting it.
        print("{}:{}: Single backslash is not allowed inside strings."
              .format(t.lineno, t.lexpos - self.line_begin))

    def t_string_end(self, t):
        r'"'
        # Closing quote: emit the accumulated text as one STRING token and
        # return to the normal lexer state.
        t.value = self.current_string
        self.current_string = ""
        t.type = "STRING"
        t.lexer.begin('INITIAL')
        return t

    def t_string_meat(self, t):
        r'.'
        self.current_string += t.value

    def t_ID(self, t):
        r'[a-zA-Z0-9_-]+'
        # Keywords share the identifier syntax; re-tag them here.
        t.type = self.reserved.get(t.value, 'ID')
        return t

    def t_PercentID(self, t):
        r'%[a-zA-Z0-9_-]+'
        # Strip the leading '%' sigil from the token value.
        t.value = t.value[1:]
        return t

    def t_AtID(self, t):
        r'@[a-zA-Z0-9_-]+'
        # Strip the leading '@' sigil from the token value.
        t.value = t.value[1:]
        return t

    def t_ANY_error(self, t):
        # Error rule shared by all lexer states: report the offending
        # character and skip it.
        print("{}:{}: Illegal character '{}'"
              .format(t.lineno, t.lexpos - self.line_begin, t.value[0]))
        t.lexer.skip(1)

    # ------------------------------------------------------------------
    # Parser
    # ------------------------------------------------------------------

    def handle_top_decs(self, smth):
        """Sort top-level declarations into three name-indexed dicts.

        ``smth`` is an iterable of declaration objects.  Each one is
        filed under its ``name`` according to whether it is an
        ``ll.TypeDec``, ``ll.GlobalDec`` or ``ll.FunctionDec``.  A
        redeclared name is reported on stdout and the later declaration
        replaces the earlier one.  Objects of any other type are reported
        and skipped.

        Returns the tuple ``(tdecs, gdecs, fdecs)``.
        """
        tdecs = {}
        gdecs = {}
        fdecs = {}
        for top_dec in smth:
            if isinstance(top_dec, ll.FunctionDec):
                kind, table = 'Function', fdecs
            elif isinstance(top_dec, ll.TypeDec):
                kind, table = 'Type', tdecs
            elif isinstance(top_dec, ll.GlobalDec):
                kind, table = 'Global', gdecs
            else:
                print('Parser Error: Unknown top level declaration {}'
                      .format(top_dec))
                continue
            # Read the name only after the type check: an unknown object
            # is not guaranteed to have a ``name`` attribute.
            name = top_dec.name
            if name in table:
                print('ERROR: {} {} is declared more than once'
                      .format(kind, name))
            table[name] = top_dec
        return tdecs, gdecs, fdecs

    # p_program is the first production defined, which makes 'program'
    # the start symbol of the grammar.
    def p_program(self, p):
        'program : top_decs'
        tdecls, gdecls, fdecls = self.handle_top_decs(p[1])
        p[0] = ll.Program(tdecls, gdecls, fdecls)

    def p_topdecs_some(self, p):
        '''top_decs : tdec top_decs
                    | gdec top_decs
                    | fdec top_decs'''
        p[0] = [p[1]] + p[2]

    def p_topdecs_empty(self, p):
        'top_decs : '
        p[0] = []

    # --- Type declarations and types ---

    def p_tdec(self, p):
        'tdec : PercentID ASSIGN TYPE ty'
        p[0] = ll.TypeDec(p[1], p[4])

    def p_ty_void(self, p):
        'ty : VOID'
        p[0] = ll.SimpleType.Void

    def p_ty_i1(self, p):
        'ty : I1'
        p[0] = ll.SimpleType.I1

    def p_ty_i8(self, p):
        'ty : I8'
        p[0] = ll.SimpleType.I8

    def p_ty_i32(self, p):
        'ty : I32'
        p[0] = ll.SimpleType.I32

    def p_ty_i64(self, p):
        'ty : I64'
        p[0] = ll.SimpleType.I64

    def p_ty_ptr(self, p):
        'ty : ty ASTERIX'
        p[0] = ll.PointerType(p[1])

    def p_ty_struct(self, p):
        'ty : LBRACE ty_list RBRACE'
        p[0] = ll.StructType(p[2])

    def p_ty_array(self, p):
        'ty : LBRACK INT ID ty RBRACK'
        # The 'x' in "[N x ty]" is lexed as a plain ID, so it has to be
        # checked here rather than in the grammar.
        if p[3] == 'x':
            p[0] = ll.ArrayType(p[2], p[4])
        else:
            print('Invalid name in array definition: {}\n It should have been an x.'
                  .format(p[3]))
            raise SyntaxError

    def p_ty_fun(self, p):
        'ty : ty LPAREN ty_list RPAREN'
        p[0] = ll.FunctionType(p[1], p[3])

    def p_ty_id(self, p):
        'ty : PercentID'
        p[0] = ll.NamedType(p[1])

    def p_ty_list_single(self, p):
        'ty_list : ty'
        p[0] = [p[1]]

    def p_ty_list_multiple(self, p):
        'ty_list : ty COMMA ty_list'
        p[0] = [p[1]] + p[3]

    def p_ty_list_empty(self, p):
        'ty_list : '
        p[0] = []

    # --- Global declarations and initializers ---

    def p_gdec(self, p):
        'gdec : AtID ASSIGN GLOBAL ty ginit'
        p[0] = ll.GlobalDec(p[1], p[4], p[5])

    def p_ginit_null(self, p):
        'ginit : NULL'
        p[0] = ll.GNull()

    def p_ginit_id(self, p):
        'ginit : AtID'
        p[0] = ll.GGid(p[1])

    def p_ginit_int(self, p):
        'ginit : INT'
        p[0] = ll.GInt(p[1])

    def p_ginit_string(self, p):
        'ginit : STRING'
        p[0] = ll.GString(p[1])

    def p_ginit_array(self, p):
        'ginit : LBRACK ty_ginit_list RBRACK'
        # TODO This syntax seems weird
        p[0] = ll.GArray(p[2])

    def p_ginit_struct(self, p):
        'ginit : LBRACE ty_ginit_list RBRACE'
        p[0] = ll.GStruct(p[2])

    def p_ty_ginit_list_single(self, p):
        'ty_ginit_list : ty ginit'
        p[0] = [(p[1], p[2])]

    def p_ty_ginit_list_multiple(self, p):
        'ty_ginit_list : ty ginit COMMA ty_ginit_list'
        p[0] = [(p[1], p[2])] + p[4]

    def p_ty_ginit_list_empty(self, p):
        'ty_ginit_list : '
        p[0] = []

    # --- Function declarations ---

    def p_fdec(self, p):
        'fdec : DEFINE ty AtID LPAREN ty_id_list RPAREN LBRACE fbody RBRACE'
        p[0] = ll.FunctionDec(p[2], p[3], p[5], p[8])

    def p_ty_id_list_single(self, p):
        'ty_id_list : ty PercentID'
        p[0] = [(p[1], p[2])]

    def p_ty_id_list_multiple(self, p):
        'ty_id_list : ty PercentID COMMA ty_id_list'
        p[0] = [(p[1], p[2])] + p[4]

    def p_ty_id_list_empty(self, p):
        'ty_id_list : '
        p[0] = []

    def p_fbody_multiple_blocks(self, p):
        'fbody : block named_block_list'
        # Index the labelled blocks by label.  A repeated label is
        # reported instead of silently discarding the earlier block; the
        # later block still wins.
        named_blocks = {}
        for label, block in p[2]:
            if label in named_blocks:
                print('ERROR: Label {} is declared more than once'
                      .format(label))
            named_blocks[label] = block
        p[0] = ll.FunctionBody(p[1], named_blocks)

    def p_fbody_one_block(self, p):
        'fbody : block'
        # An empty dict keeps the second argument the same container type
        # as in p_fbody_multiple_blocks.
        p[0] = ll.FunctionBody(p[1], {})

    def p_block_insns_terminator(self, p):
        'block : insns terminator'
        p[0] = ll.Block(p[1], p[2])

    def p_block_terminator(self, p):
        'block : terminator'
        p[0] = ll.Block([], p[1])

    # --- Instructions ---

    def p_insns_single(self, p):
        'insns : optionally_named_insn'
        p[0] = [p[1]]

    def p_insns_multiple(self, p):
        'insns : optionally_named_insn insns'
        p[0] = [p[1]] + p[2]

    def p_optionally_named_insn_some(self, p):
        'optionally_named_insn : PercentID ASSIGN insn'
        p[0] = (p[1], p[3])

    def p_optionally_named_insn_none(self, p):
        'optionally_named_insn : insn'
        # Instructions without a result name are paired with None.
        p[0] = (None, p[1])

    def p_insn_bop(self, p):
        'insn : bop ty operand COMMA operand'
        p[0] = ll.Binop(p[1], p[2], p[3], p[5])

    def p_insn_alloca(self, p):
        'insn : ALLOCA ty'
        p[0] = ll.Alloca(p[2])

    def p_insn_load(self, p):
        'insn : LOAD ty COMMA ty operand'
        # The second type (p[4]) is parsed but not forwarded.
        p[0] = ll.Load(p[2], p[5])

    def p_insn_store(self, p):
        'insn : STORE ty operand COMMA ty operand'
        # The second type (p[5]) is parsed but not forwarded.
        p[0] = ll.Store(p[2], p[3], p[6])

    def p_insn_icmp(self, p):
        'insn : ICMP cnd ty operand COMMA operand'
        p[0] = ll.Icmp(p[2], p[3], p[4], p[6])

    def p_insn_call(self, p):
        'insn : CALL ty operand LPAREN ty_operand_list RPAREN'
        p[0] = ll.Call(p[2], p[3], p[5])

    def p_insn_call_empty(self, p):
        'insn : CALL ty operand LPAREN RPAREN'
        p[0] = ll.Call(p[2], p[3], [])

    def p_insn_bitcast(self, p):
        'insn : BITCAST ty operand TO ty'
        p[0] = ll.Bitcast(p[2], p[3], p[5])

    def p_insn_gep(self, p):
        'insn : GETELEMENTPTR ty COMMA ty operand COMMA ty_operand_list'
        p[0] = ll.Gep(p[2], p[4], p[5], p[7])

    def p_insn_gep_empty(self, p):
        'insn : GETELEMENTPTR ty COMMA ty operand'
        p[0] = ll.Gep(p[2], p[4], p[5], [])

    def p_insn_zext(self, p):
        'insn : ZEXT ty operand TO ty'
        # p[4] is the TO keyword; the target type is p[5], exactly as in
        # p_insn_bitcast.
        p[0] = ll.Zext(p[2], p[3], p[5])

    def p_insn_ptrtoint(self, p):
        'insn : PTRTOINT ty ASTERIX operand TO ty'
        # The '*' is matched explicitly, so p[2] is the type written
        # before the '*'.
        p[0] = ll.Ptrtoint(p[2], p[4], p[6])

    def p_bop(self, p):
        '''bop : ADD
               | SUB
               | MUL
               | SHL
               | LSHR
               | ASHR
               | AND
               | OR
               | XOR
               | SDIV'''
        p[0] = p[1]

    def p_cnd(self, p):
        '''cnd : EQ
               | NE
               | SLT
               | SLE
               | SGT
               | SGE'''
        p[0] = p[1]

    def p_ty_operand_list_single(self, p):
        'ty_operand_list : ty operand'
        p[0] = [(p[1], p[2])]

    def p_ty_operand_list_multiple(self, p):
        'ty_operand_list : ty operand COMMA ty_operand_list'
        p[0] = [(p[1], p[2])] + p[4]

    # --- Terminators ---

    def p_terminator_ret_void(self, p):
        'terminator : RET VOID'
        p[0] = ll.Ret(ll.SimpleType.Void, None)

    def p_terminator_ret_oper(self, p):
        'terminator : RET ty operand'
        p[0] = ll.Ret(p[2], p[3])

    def p_terminator_branch(self, p):
        'terminator : BR LABEL PercentID'
        p[0] = ll.Br(p[3])

    def p_terminator_conditional_branch(self, p):
        'terminator : BR ty operand COMMA LABEL PercentID COMMA LABEL PercentID'
        p[0] = ll.Cbr(p[2], p[3], p[6], p[9])

    # --- Operands ---

    def p_operand_null(self, p):
        'operand : NULL'
        # NOTE(review): unlike the other operand nodes, ll.Null is passed
        # on without being called -- confirm against the ll module that
        # this is intended.
        p[0] = ll.Null

    def p_operand_const(self, p):
        'operand : INT'
        p[0] = ll.Const(p[1])

    def p_operand_gid(self, p):
        'operand : AtID'
        p[0] = ll.Gid(p[1])

    def p_operand_id(self, p):
        'operand : PercentID'
        p[0] = ll.Id(p[1])

    # --- Labelled blocks ---

    def p_named_block_list_single(self, p):
        'named_block_list : ID COLON block'
        p[0] = [(p[1], p[3])]

    def p_named_block_list_multiple(self, p):
        'named_block_list : ID COLON block named_block_list'
        p[0] = [(p[1], p[3])] + p[4]

    def p_error(self, t):
        # Syntax error hook called by the parser; ``t`` is None when the
        # error is at end of input.
        if t is None:
            print('Syntax error at end of file')
        else:
            print('Syntax error at token {}'
                  .format(t))

    def __init__(self):
        """Give this instance its full token list (tokens plus keywords).

        The combined list is stored on the instance.  The class-level
        ``tokens`` list is deliberately left untouched: extending it in
        place would add the keyword tokens again for every new instance.
        """
        combined = list(self.tokens)
        for token_name in self.reserved.values():
            if token_name not in combined:
                combined.append(token_name)
        self.tokens = combined

    def build(self, **kwargs):
        """
        Build the parser.

        This method builds the parsing tables and initializes the internal
        lexer and parser variables.
        This method must be called prior to parsing.

        The same keyword arguments are forwarded to both ``lex.lex`` and
        ``yacc.yacc``.
        """
        self.lexer = lex.lex(module=self, **kwargs)
        self.parser = yacc.yacc(module=self, **kwargs)

    def parse(self, text):
        """
        Generate AST from text.

        Return a AST representation of the llvm program from text.

        Position tracking and string-literal state are reset first, so
        that error positions are correct when the same parser is used for
        more than one input.
        """
        self.current_string = ""
        self.line_begin = -1
        self.lexer.lineno = 1
        self.lexer.begin('INITIAL')
        return self.parser.parse(text, lexer=self.lexer)