From 85ec8d8e4e61e840b38bfafe89c7bfb69ede633e Mon Sep 17 00:00:00 2001
From: Alexander Munch-Hansen
Date: Tue, 20 Mar 2018 17:29:29 +0100
Subject: [PATCH] Added tesauro + sigmoid

---
 board.py   | 36 ++++++++++++++++++++++++++++++++++--
 network.py | 35 +++++++++++++++++++++++++----------
 2 files changed, 59 insertions(+), 12 deletions(-)

diff --git a/board.py b/board.py
index bfa7998..cc500f3 100644
--- a/board.py
+++ b/board.py
@@ -34,8 +34,40 @@ class Board:
         board.append(15 - sum(positives))
         board.append(-15 - sum(negatives))
         return tuple(board)
-
-
+
+    # The original tesauro also takes in the player, so [1,0] for one of them and [0,1] for the other
+    # Not sure if this should be included
+    @staticmethod
+    def map_to_tesauro(board):
+        features = []
+        for i in range(1,25):
+            idx = list(board)[i]
+            place = [0]*8
+            if (idx != 0):
+                if idx > 0:
+                    for j in range(min(int(idx),3)):
+                        place[j] = 1.
+                    if idx > 3:
+                        place[3] += (idx-3)/2
+                else:
+                    for j in range(min(abs(int(idx)),3)):
+                        place[j+4] = 1.
+                    if idx < -3:
+                        place[3+4] += (abs(idx)-3)/2
+            features += place
+
+        nega_hits = list(board)[0]/2
+        posi_hits = list(board)[25]/2
+        positives = [x if x > 0 else 0 for x in board]
+        negatives = [x if x < 0 else 0 for x in board]
+        posi_home = ((15 - sum(positives))/15)
+        nega_home = ((-15 - sum(negatives))/15)
+        features.append(nega_hits)
+        features.append(posi_hits)
+        features.append(posi_home)
+        features.append(nega_home)
+#        print(features)
+        return features
 
 
     @staticmethod
diff --git a/network.py b/network.py
index 62b1d17..069826a 100644
--- a/network.py
+++ b/network.py
@@ -10,7 +10,7 @@ from eval import Eval
 
 class Network:
     hidden_size = 40
-    input_size = 26
+    input_size = 196
     output_size = 1
     # Can't remember the best learning_rate, look this up
     learning_rate = 0.1
@@ -43,17 +43,20 @@ class Network:
         b_2 = tf.get_variable("b_2", (Network.output_size,), initializer=tf.zeros_initializer)
 
 
-        value_after_input = self.custom_tanh(tf.matmul(self.x, W_1) + b_1, name='hidden_layer')
+        value_after_input = tf.sigmoid(tf.matmul(self.x, W_1) + b_1, name='hidden_layer')
 
-        self.value = self.custom_tanh(tf.matmul(value_after_input, W_2) + b_2, name='output_layer')
+
+
+
+        self.value = tf.sigmoid(tf.matmul(value_after_input, W_2) + b_2, name='output_layer')
 
         # tf.reduce_sum basically finds the sum of its input, so this gives the
         # difference between the two values, in case they should be lists, which
         # they might be if our input changes
         # TODO: Alexander thinks that self.value will be computed twice (instead of once)
-        difference_in_values = tf.reduce_sum(self.value_next - self.value, name='difference')
-
+        difference_in_values = tf.reduce_sum(tf.subtract(self.value_next, self.value, name='difference'))
+
 
         trainable_vars = tf.trainable_variables()
         gradients = tf.gradients(self.value, trainable_vars)
@@ -140,7 +143,7 @@ class Network:
     # Have a circular dependency, #fuck, need to rewrite something
     def adjust_weights(self, board, v_next):
         # print("lol")
-        board = np.array(board).reshape((1,26))
+        board = np.array(board).reshape((1,-1))
         self.session.run(self.training_op, feed_dict = { self.x: board,
                                                          self.value_next: v_next })
@@ -156,7 +159,7 @@ class Network:
     def make_move(self, board, roll):
         # print(Board.pretty(board))
         legal_moves = Board.calculate_legal_states(board, 1, roll)
-        moves_and_scores = [ (move, self.eval_state(np.array(move).reshape(1,26))) for move in legal_moves ]
+        moves_and_scores = [ (move, self.eval_state(np.array(Board.map_to_tesauro(move)).reshape(1,-1))) for move in legal_moves ]
         scores = [ x[1] for x in moves_and_scores ]
         best_score_index = np.array(scores).argmax()
         best_move_pair = moves_and_scores[best_score_index]
@@ -181,20 +184,31 @@ class Network:
         outcomes = []
 
         for episode in range(1, episodes + 1):
             sys.stderr.write("[TRAIN] Episode {}".format(episode + trained_eps))
 
+#            print("greerggeregr"*10000)
             # TODO decide which player should be here
             player = 1
             roll = (random.randrange(1,7), random.randrange(1,7))
+
+            def tesaurofi(board):
+                return Board.map_to_tesauro(board)
+
             prev_board, _ = self.make_move(Board.flip(Board.initial_state) if player == -1 else Board.initial_state, roll)
+
             if player == -1: prev_board = Board.flip(prev_board)
-
+
+            # print("board:",prev_board)
+            # print(len(prev_board))
+
             # find the best move here, make this move, then change turn as the
             # first thing inside of the while loop and then call
             # best_move_and_score to get V_t+1
 
             # i = 0
             while Board.outcome(prev_board) is None:
+                #print(prev_board)
+                # print("-"*30)
                 # print(i)
                 # print(roll)
 
@@ -206,10 +220,11 @@ class Network:
 
                 roll = (random.randrange(1,7), random.randrange(1,7))
 
                 cur_board, cur_board_value = self.make_move(Board.flip(prev_board) if player == -1 else prev_board, roll)
+                #print("pls",cur_board_value)
                 if player == -1: cur_board = Board.flip(cur_board)
 
-                self.adjust_weights(prev_board, cur_board_value)
+                self.adjust_weights(tesaurofi(prev_board), cur_board_value)
 
                 prev_board = cur_board
 
@@ -217,7 +232,7 @@ class Network:
             sys.stderr.write("\t outcome {}".format(Board.outcome(final_board)[1]))
             outcomes.append(Board.outcome(final_board)[1])
             final_score = np.array([ Board.outcome(final_board)[1] ])
-            self.adjust_weights(prev_board, final_score.reshape((1, 1)))
+            self.adjust_weights(tesaurofi(prev_board), final_score.reshape((1, 1)))
 
             sys.stderr.write("\n")
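
Note on the encoding above: a minimal standalone sketch of the Tesauro-style feature
mapping this patch introduces, assuming the 26-entry board tuple used by board.py
(signed checker counts on indices 1-24, hit checkers on indices 0 and 25). The helper
name tesauro_features and the example position are illustrative only, not part of the
patch.

    # Sketch only: mirrors Board.map_to_tesauro under the assumptions stated above.
    def tesauro_features(board):
        features = []
        for point in board[1:25]:
            place = [0.0] * 8                     # 4 units per player per point
            if point > 0:                         # player 1's checkers
                for j in range(min(int(point), 3)):
                    place[j] = 1.0
                if point > 3:
                    place[3] = (point - 3) / 2    # 4th unit encodes the excess over 3
            elif point < 0:                       # player -1's checkers
                for j in range(min(-int(point), 3)):
                    place[j + 4] = 1.0
                if point < -3:
                    place[7] = (-point - 3) / 2
            features += place

        # bar and borne-off checkers, scaled as in the patch
        features.append(board[0] / 2)
        features.append(board[25] / 2)
        features.append((15 - sum(x for x in board if x > 0)) / 15)
        features.append((-15 - sum(x for x in board if x < 0)) / 15)
        return features                           # 24*8 + 4 = 196 values, matching input_size

    # Example: the standard backgammon starting position in the assumed layout
    initial = (0,
               2, 0, 0, 0, 0, -5,
               0, -3, 0, 0, 0, 5,
               -5, 0, 0, 0, 3, 0,
               5, 0, 0, 0, 0, -2,
               0)
    assert len(tesauro_features(initial)) == 196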