From 34964c186b1c028628d0fafa744a698c52e9c397 Mon Sep 17 00:00:00 2001 From: cfreksen Date: Sun, 29 Oct 2017 13:46:12 +0100 Subject: [PATCH] Update parser. --- parser.py | 341 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 308 insertions(+), 33 deletions(-) diff --git a/parser.py b/parser.py index e0936a4..d3adbe2 100644 --- a/parser.py +++ b/parser.py @@ -5,12 +5,55 @@ from collections import namedtuple import ply.lex as lex import ply.yacc as yacc +from enum import Enum + + +class SimpleType(Enum): + Void = 1 + I1 = 2 + I8 = 3 + I32 = 4 + I64 = 5 + + # Namedtuples for storing AST Program = namedtuple('Program', ['tdecls', 'gdecls', 'fdecls']) + +TypeDec = namedtuple('TypeDec', ['name', 'body']) +PointerType = namedtuple('PointerType', ['inner_ty']) +StructType = namedtuple('StructType', ['fields']) +ArrayType = namedtuple('ArrayType', ['length', 'inner_ty']) +FunctionType = namedtuple('FunctionType', ['return_ty', 'parameters']) +NamedType = namedtuple('NamedType', ['other_name']) + +GlobalDec = namedtuple('GlobalDec', ['name', 'ty', 'body']) +GNull = namedtuple('GNull', []) +GGid = namedtuple('GGid', ['val']) +GInt = namedtuple('GInt', ['val']) +GString = namedtuple('GString', ['val']) +GArray = namedtuple('GArray', ['entries']) +GStruct = namedtuple('GStruct', ['fields']) + FunctionDec = namedtuple('FunctionDec', ['return_type', 'name', 'parameters', 'body']) FunctionBody = namedtuple('FunctionBody', ['first_block', 'named_blocks']) Block = namedtuple('Block', ['insns', 'terminator']) +Binop = namedtuple('Binop', ['bop', 'ty', 'left', 'right']) +Alloca = namedtuple('Alloca', ['ty']) +Load = namedtuple('Load', ['ty', 'oper']) +Store = namedtuple('Store', ['ty', 'value', 'location']) +Icmp = namedtuple('Icmp', ['cnd', 'ty', 'left', 'right']) +Call = namedtuple('Call', ['return_ty', 'callee', 'arguments']) +Bitcast = namedtuple('Bitcast', ['from_ty', 'oper', 'to_ty']) +Gep = namedtuple('Gep', ['base_ty', 'oper_ty', 'oper', 'steps']) +Zext = namedtuple('Zext', ['from_ty', 'oper', 'to_ty']) +Ptrtoint = namedtuple('Ptrtoint', ['pointer_ty', 'oper', 'to_ty']) + +Ret = namedtuple('Ret', ['ty', 'oper']) +Br = namedtuple('Br', ['label']) +Cbr = namedtuple('Cbr', ['ty', 'oper', 'then_label', 'else_label']) + + class LLVMParser(object): # Lexer @@ -22,10 +65,6 @@ class LLVMParser(object): ] reserved = { - 'if': 'IF', - 'then': 'THEN', - 'else': 'ELSE', - 'while': 'WHILE', 'add': 'ADD', 'sub': 'SUB', 'mul': 'MUL', @@ -70,8 +109,7 @@ class LLVMParser(object): 'INT', 'STRING', 'ASTERIX', 'LPAREN', 'RPAREN', 'LBRACK', 'RBRACK', 'LBRACE', 'RBRACE', 'ASSIGN', 'COLON', 'COMMA', - 'PercentID', 'AtID', 'ID', - 'COMMENT' + 'PercentID', 'AtID', 'ID' ] t_ignore = ' \t' @@ -87,18 +125,15 @@ class LLVMParser(object): t_COLON = r':' t_COMMA = r',' - def t_COMMENT(self, t): r'(;|declare|target).*' pass - def t_newline(self, t): r'\n+' t.lexer.lineno += len(t.value) self.line_begin = t.lexpos - def t_string_newline(self, t): r'\n' print("{}:{}: Newline is not allowed inside strings." @@ -106,7 +141,6 @@ class LLVMParser(object): t.lexer.lineno += 1 self.line_begin = t.lexpos - def t_INT(self, t): r'\d+' t.value = int(t.value) @@ -121,24 +155,20 @@ class LLVMParser(object): r'\\"' self.current_string += '"' - def t_string_backslash(self, t): r'\\\\' self.current_string += '\\' - def t_string_hex(self, t): r'\\[0-9a-f][0-9a-f]' code = int(t.value[1:], 16) self.current_string += chr(code) - def t_string_singlebackslash(self, t): r'\\' print("{}:{}: Single backslash is not allowed inside strings." .format(t.lineno, t.lexpos - self.line_begin)) - def t_string_end(self, t): r'"' t.value = self.current_string @@ -147,18 +177,15 @@ class LLVMParser(object): t.lexer.begin('INITIAL') return t - def t_string_meat(self, t): r'.' self.current_string += t.value - def t_ID(self, t): r'[a-zA-Z0-9_-]+' t.type = self.reserved.get(t.value, 'ID') return t - def t_PercentID(self, t): r'%[a-zA-Z0-9_-]+' t.value = t.value[1:] @@ -169,43 +196,159 @@ class LLVMParser(object): t.value = t.value[1:] return t - def t_ANY_error(self, t): print("{}:{}: Illegal character '{}'" .format(t.lineno, t.lexpos - self.line_begin, t.value[0])) t.lexer.skip(1) - # Parser def handle_top_decs(self, smth): - # TODO: Implement - return [], [], smth + tdecs = {} + gdecs = {} + fdecs = {} + for top_dec in smth: + name = top_dec.name + if isinstance(top_dec, FunctionDec): + if name in fdecs.keys(): + print('ERROR: Function {} is declared more than once' + .format(name)) + fdecs[name] = top_dec + elif isinstance(top_dec, TypeDec): + if name in tdecs.keys(): + print('ERROR: Type {} is declared more than once' + .format(name)) + tdecs[name] = top_dec + elif isinstance(top_dec, GlobalDec): + if name in gdecs.keys(): + print('ERROR: Global {} is declared more than once' + .format(name)) + gdecs[name] = top_dec + else: + print('Parser Error: Unknown top level declaration {}' + .format(top_dec)) + return tdecs, gdecs, fdecs def p_program(self, p): 'program : top_decs' tdecls, gdecls, fdecls = self.handle_top_decs(p[1]) p[0] = Program(tdecls, gdecls, fdecls) - def p_topdecs_fdec(self, p): - 'top_decs : fdec top_decs' + def p_topdecs_some(self, p): + '''top_decs : tdec top_decs + | gdec top_decs + | fdec top_decs''' p[0] = [p[1]] + p[2] def p_topdecs_empty(self, p): 'top_decs : ' p[0] = [] + def p_tdec(self, p): + 'tdec : PercentID ASSIGN TYPE ty' + p[0] = TypeDec(p[1], p[4]) + + def p_ty_void(self, p): + 'ty : VOID' + p[0] = SimpleType.Void + + def p_ty_i1(self, p): + 'ty : I1' + p[0] = SimpleType.I1 + + def p_ty_i8(self, p): + 'ty : I8' + p[0] = SimpleType.I8 + + def p_ty_i32(self, p): + 'ty : I32' + p[0] = SimpleType.I32 + + def p_ty_i64(self, p): + 'ty : I64' + p[0] = SimpleType.I64 + + def p_ty_ptr(self, p): + 'ty : ty ASTERIX' + p[0] = PointerType(p[1]) + + def p_ty_struct(self, p): + 'ty : LBRACE ty_list RBRACE' + p[0] = StructType(p[2]) + + def p_ty_array(self, p): + 'ty : LBRACK INT ID ty RBRACK' + if p[3] == 'x': + p[0] = ArrayType(p[2], p[4]) + else: + print('Invalid name in array definition: {}\n It should have been an x.' + .format(p[3])) + raise SyntaxError + + def p_ty_fun(self, p): + 'ty : ty LPAREN ty_list RPAREN' + p[0] = FunctionType(p[1], p[3]) + + def p_ty_id(self, p): + 'ty : PercentID' + p[0] = NamedType(p[1]) + + def p_ty_list_single(self, p): + 'ty_list : ty' + p[0] = [p[1]] + + def p_ty_list_multiple(self, p): + 'ty_list : ty COMMA ty_list' + p[0] = [p[1]] + p[3] + + def p_ty_list_empty(self, p): + 'ty_list : ' + p[0] = [] + + def p_gdec(self, p): + 'gdec : AtID ASSIGN GLOBAL ty ginit' + p[0] = GlobalDec(p[1], p[4], p[5]) + + def p_ginit_null(self, p): + 'ginit : NULL' + p[0] = GNull() + + def p_ginit_id(self, p): + 'ginit : AtID' + p[0] = GGid(p[1]) + + def p_ginit_int(self, p): + 'ginit : INT' + p[0] = GInt(p[1]) + + def p_ginit_string(self, p): + 'ginit : STRING' + p[0] = GString(p[1]) + + def p_ginit_array(self, p): + 'ginit : LBRACK ty_ginit_list RBRACK' + # TODO This syntax seems weird + p[0] = GArray(p[2]) + + def p_ginit_struct(self, p): + 'ginit : LBRACE ty_ginit_list RBRACE' + p[0] = GStruct(p[2]) + + def p_ty_ginit_list_single(self, p): + 'ty_ginit_list : ty ginit' + p[0] = [(p[1], p[2])] + + def p_ty_ginit_list_multiple(self, p): + 'ty_ginit_list : ty ginit COMMA ty_ginit_list' + p[0] = [(p[1], p[2])] + p[4] + + def p_ty_ginit_list_empty(self, p): + 'ty_ginit_list : ' + p[0] = [] + def p_fdec(self, p): 'fdec : DEFINE ty AtID LPAREN ty_id_list RPAREN LBRACE fbody RBRACE' p[0] = FunctionDec(p[2], p[3], p[5], p[8]) - def p_ty_simple(self, p): - '''ty : VOID - | I1 - | I8 - | I32 - | I64''' - p[0] = p[1] - def p_ty_id_list_single(self, p): 'ty_id_list : ty PercentID' p[0] = [(p[1], p[2])] @@ -218,25 +361,155 @@ class LLVMParser(object): 'ty_id_list : ' p[0] = [] + def p_fbody_multiple_blocks(self, p): + 'fbody : block named_block_list' + p[0] = FunctionBody(p[1], p[2]) + def p_fbody_one_block(self, p): 'fbody : block' p[0] = FunctionBody(p[1], []) + def p_block_insns_terminator(self, p): + 'block : insns terminator' + p[0] = Block(p[1], p[2]) + def p_block_terminator(self, p): 'block : terminator' p[0] = Block([], p[1]) + def p_insns_single(self, p): + 'insns : optionally_named_insn' + p[0] = [p[1]] + + def p_insns_multiple(self, p): + 'insns : optionally_named_insn insns' + p[0] = [p[1]] + p[2] + + def p_optionally_named_insn_some(self, p): + 'optionally_named_insn : PercentID ASSIGN insn' + p[0] = (p[1], p[3]) + + def p_optionally_named_insn_none(self, p): + 'optionally_named_insn : insn' + p[0] = (None, p[1]) + + def p_insn_bop(self, p): + 'insn : bop ty operand COMMA operand' + p[0] = Binop(p[1], p[2], p[3], p[5]) + + def p_insn_alloca(self, p): + 'insn : ALLOCA ty' + p[0] = Alloca(p[2]) + + def p_insn_load(self, p): + 'insn : LOAD ty COMMA ty operand' + p[0] = Load(p[2], p[5]) + + def p_insn_store(self, p): + 'insn : STORE ty operand COMMA ty operand' + p[0] = Store(p[2], p[3], p[6]) + + def p_insn_icmp(self, p): + 'insn : ICMP cnd ty operand COMMA operand' + p[0] = Icmp(p[2], p[3], p[4], p[6]) + + def p_insn_call(self, p): + 'insn : CALL ty operand LPAREN ty_operand_list RPAREN' + p[0] = Call(p[2], p[3], p[5]) + + def p_insn_call_empty(self, p): + 'insn : CALL ty operand LPAREN RPAREN' + p[0] = Call(p[2], p[3], []) + + def p_insn_bitcast(self, p): + 'insn : BITCAST ty operand TO ty' + p[0] = Bitcast(p[2], p[3], p[5]) + + def p_insn_gep(self, p): + 'insn : GETELEMENTPTR ty COMMA ty operand COMMA ty_operand_list' + p[0] = Gep(p[2], p[4], p[5], p[7]) + + def p_insn_gep_empty(self, p): + 'insn : GETELEMENTPTR ty COMMA ty operand' + p[0] = Gep(p[2], p[4], p[5], []) + + def p_insn_zext(self, p): + 'insn : ZEXT ty operand TO ty' + p[0] = Zext(p[2], p[3], p[4]) + + def p_insn_ptrtoint(self, p): + 'insn : PTRTOINT ty ASTERIX operand TO ty' + p[0] = Ptrtoint(p[2], p[4], p[6]) + + def p_bop(self, p): + '''bop : ADD + | SUB + | MUL + | SHL + | LSHR + | ASHR + | AND + | OR + | XOR + | SDIV''' + p[0] = p[1] + + def p_cnd(self, p): + '''cnd : EQ + | NE + | SLT + | SLE + | SGT + | SGE''' + p[0] = p[1] + + def p_ty_operand_list_single(self, p): + 'ty_operand_list : ty operand' + p[0] = [(p[1], p[2])] + + def p_ty_operand_list_multiple(self, p): + 'ty_operand_list : ty operand COMMA ty_operand_list' + p[0] = [(p[1], p[2])] + p[4] + + def p_terminator_ret_void(self, p): + 'terminator : RET VOID' + p[0] = Ret(SimpleType.Void, None) + def p_terminator_ret_oper(self, p): 'terminator : RET ty operand' - p[0] = (p[2], p[3]) + p[0] = Ret(p[2], p[3]) + + def p_terminator_branch(self, p): + 'terminator : BR LABEL PercentID' + p[0] = Br(p[3]) + + def p_terminator_conditional_branch(self, p): + 'terminator : BR ty operand COMMA LABEL PercentID COMMA LABEL PercentID' + p[0] = Cbr(p[2], p[3], p[6], p[9]) def p_operand(self, p): '''operand : NULL | INT | AtID | PercentID''' + # TODO: distinguish ids p[0] = p[1] + def p_named_block_list_single(self, p): + 'named_block_list : ID COLON block' + p[0] = [(p[1], p[3])] + + def p_named_block_list_multiple(self, p): + 'named_block_list : ID COLON block named_block_list' + p[0] = [(p[1], p[3])] + p[4] + + def p_error(self, t): + if t is None: + print('Syntax error at end of file') + else: + print('Syntax error at token {}' + .format(t)) + def __init__(self): self.tokens += self.reserved.values() @@ -248,12 +521,14 @@ class LLVMParser(object): result = self.parser.parse(data, lexer=self.lexer) print(result) + if __name__ == '__main__': p = LLVMParser() p.build() data = r''' define void @tigermain (i64 %U_mainSL_8, i64 %U_mainDummy_9) { - ret i64 8 + %a = add i64 3, 5 ; please be 8 + ret i64 %a } ''' p.test(data)