From 77d82f688389e990189d08e2850976c7cce4c5fd Mon Sep 17 00:00:00 2001
From: Pownie
Date: Sun, 22 Apr 2018 15:07:19 +0200
Subject: [PATCH] Added code for 2-ply look-ahead

---
 network.py | 193 +++++++++++++++++++----------------------------------
 1 file changed, 67 insertions(+), 126 deletions(-)

diff --git a/network.py b/network.py
index 0c64997..6c38216 100644
--- a/network.py
+++ b/network.py
@@ -8,6 +8,7 @@ import sys
 import random
 from eval import Eval
 import glob
+from operator import itemgetter
 
 class Network:
     # board_features_quack has size 28
@@ -35,13 +36,16 @@ class Network:
         ]
         self.output_size = 1
         self.hidden_size = 40
-        # Can't remember the best learning_rate, look this up
        self.max_learning_rate = 0.1
         self.min_learning_rate = 0.001
-        # self.learning_rate = 0.01
 
         self.global_step = tf.Variable(0, trainable=False, name="global_step")
-        self.learning_rate = tf.maximum(self.min_learning_rate, tf.train.exponential_decay(self.max_learning_rate, self.global_step, 50000, 0.96, staircase=True), name="learning_rate")
+        self.learning_rate = tf.maximum(self.min_learning_rate,
+                                        tf.train.exponential_decay(self.max_learning_rate,
+                                                                   self.global_step, 50000,
+                                                                   0.96,
+                                                                   staircase=True),
+                                        name="learning_rate")
 
 
@@ -53,7 +57,6 @@ class Network:
         else:
             self.episodes_trained = 0
 
-        # input = x
         self.x = tf.placeholder('float', [1, self.input_size], name='input')
         self.value_next = tf.placeholder('float', [1, self.output_size], name="value_next")
 
@@ -105,40 +108,6 @@ class Network:
         self.saver = tf.train.Saver(max_to_keep=1)
 
     def eval_state(self, sess, state):
-        # Run state through a network
-
-        # Remember to create placeholders for everything because wtf tensorflow
-        # and graphs
-
-        # Remember to create the dense layers
-
-        # Figure out a way of giving a layer a custom activiation function (we
-        # want something which gives [-2,2]. Naively tahn*2, however I fell this
-        # is wrong.
-
-        # tf.group, groups a bunch of actions, so calculate the different
-        # gradients for the different weights, by using tf.trainable_variables()
-        # to find all variables and tf.gradients(current_value,
-        # trainable_variables) to find all the gradients. We can then loop
-        # through this and calculate the trace for each gradient and variable
-        # pair (note, zip can be used to combine the two lists found before),
-        # and then we can calculate the overall change in weights, based on the
-        # formula listed in tesauro (learning_rate * difference_in_values *
-        # trace), this calculation can be assigned to a tf variable and put in a
-        # list and then this can be grouped into a single operation, essentially
-        # building our own backprop function.
-
-        # Grouping them is done by
-        # tf.group(*the_gradients_from_before_we_want_to_apply,
-        # name="training_op")
-
-        # If we remove the eligibily trace to begin with, we only have to
-        # implement learning_rate * (difference_in_values) * gradients (the
-        # before-mentioned calculation.
-
-        # print("Network is evaluating")
-        # print("eval ({})".format(self.name), state, val, sep="\n")
-
         return sess.run(self.value, feed_dict={self.x: state})
 
     def save_model(self, sess, episode_count, global_step):
@@ -170,16 +139,69 @@ class Network:
 
     def make_move(self, sess, board, roll, player):
-        # print(Board.pretty(board))
         legal_moves = Board.calculate_legal_states(board, player, roll)
 
         moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
 
         scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
 
         best_score_index = np.array(scores).argmax()
         best_move_pair = moves_and_scores[best_score_index]
-        # print("Found the best state, being:", np.array(move_scores).argmax())
 
         return best_move_pair
+
+    def gen_21_rolls(self):
+        a = []
+        for x in range(1,7):
+            for y in range(1,7):
+                if not [x,y] in a and not [y,x] in a:
+                    a.append([x,y])
+
+        return a
+
+    def calculate_2_ply(self, sess, board, roll, player):
+        """
+        Find the best move based on a 2-ply look-ahead. All legal moves are first scored with a single-ply
+        evaluation, and an exhaustive 2-ply search is then performed on the 15 best of them.
+
+        :param sess: The current TensorFlow session
+        :param board: The current board
+        :param roll: The original roll
+        :param player: The current player
+        :return: Best possible move based on 2-ply look-ahead
+
+        """
+
+
+        init_legal_states = Board.calculate_legal_states(board, player, roll)
+        zero_ply_moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in init_legal_states]
+
+        # sorted() returns an ascending list and list.reverse() works in place, so sort, reverse and then slice out the 15 best moves.
+        best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1))
+        best_fifteen.reverse()
+        best_fifteen_boards = [x[0] for x in best_fifteen[:15]]
+
+        all_rolls = self.gen_21_rolls()
+
+        all_rolls_scores = []
+        for a_board in best_fifteen_boards:
+            a_board_scores = []
+            for roll in all_rolls:
+                spec_roll_scores = []
+                all_rolls_boards = Board.calculate_legal_states(a_board, player*-1, roll)
+
+                spec_roll_scores.extend([self.eval_state(sess, self.board_trans_func(new_board, player*-1)) for new_board in all_rolls_boards])
+
+                best_score = max(spec_roll_scores)
+
+                a_board_scores.append(best_score)
+
+            all_rolls_scores.append(sum(a_board_scores)/len(a_board_scores))
+
+        best_score_index = np.array(all_rolls_scores).argmax()
+        best_board = best_fifteen_boards[best_score_index]
+
+        return [best_board, max(all_rolls_scores)]
+
     def eval(self, episode_count, trained_eps = 0, tf_session = None):
+
         def do_eval(sess, method, episodes = 1000, trained_eps = 0):
             start_time = time.time()
@@ -198,47 +220,20 @@ class Network:
             sys.stderr.write(
                 "[EVAL ] Evaluating {eps} episode(s) with method '{method}'\n".format(eps=episodes, method=method))
 
-            if method == 'random':
-                outcomes = []
-                """for i in range(1, episodes + 1):
-                    sys.stderr.write("[EVAL ] Episode {}".format(i))
-                    board = Board.initial_state
-                    while Board.outcome(board) is None:
-                        roll = (random.randrange(1, 7), random.randrange(1, 7))
-                        board = (self.p1.make_move(sess, board, self.p1.get_sym(), roll))[0]
-                        roll = (random.randrange(1, 7), random.randrange(1, 7))
-                        board = Board.flip(Eval.make_random_move(Board.flip(board), 1, roll))
-                    sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1]))
-                    outcomes.append(Board.outcome(board)[1])
-                    sys.stderr.write("\n")
-                    if i % 50 == 0:
-                        print_time_estimate(i)"""
-                return outcomes
-            elif method == 'pubeval':
+            if method == 'pubeval':
                 outcomes = []
-                # Add the evaluation code for pubeval, the bot has a method make_pubeval_move(board, sym, roll),
-                # which can be used to get the best move according to pubeval
                 for i in range(1, episodes + 1):
                     sys.stderr.write("[EVAL ] Episode {}".format(i))
                     board = Board.initial_state
-                    # print("init:", board, sep="\n")
                     while Board.outcome(board) is None:
-                        # print("-"*30)
                         roll = (random.randrange(1, 7), random.randrange(1, 7))
-                        # print(roll)
-                        # prev_board = tuple(board)
                         board = (self.make_move(sess, board, roll, 1))[0]
-                        # print("post p1:", board, sep="\n")
 
-                        # print("."*30)
                         roll = (random.randrange(1, 7), random.randrange(1, 7))
-                        # print(roll)
-                        # prev_board = tuple(board)
                         board = Eval.make_pubeval_move(board, -1, roll)[0][0:26]
-                        # print("post pubeval:", board, sep="\n")
 
                     sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1]))
                     outcomes.append(Board.outcome(board)[1])
@@ -251,28 +246,17 @@ class Network:
 
             elif method == 'dumbeval':
                 outcomes = []
-                # Add the evaluation code for pubeval, the bot has a method make_pubeval_move(board, sym, roll),
-                # which can be used to get the best move according to pubeval
                 for i in range(1, episodes + 1):
                     sys.stderr.write("[EVAL ] Episode {}".format(i))
                     board = Board.initial_state
-                    # print("init:", board, sep="\n")
                     while Board.outcome(board) is None:
-                        # print("-"*30)
                         roll = (random.randrange(1, 7), random.randrange(1, 7))
-                        # print(roll)
-                        # prev_board = tuple(board)
                         board = (self.make_move(sess, board, roll, 1))[0]
-                        # print("post p1:", board, sep="\n")
 
-                        # print("."*30)
                         roll = (random.randrange(1, 7), random.randrange(1, 7))
-                        # print(roll)
-                        # prev_board = tuple(board)
                         board = Eval.make_dumbeval_move(board, -1, roll)[0][0:26]
-                        # print("post pubeval:", board, sep="\n")
 
                     sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1]))
                     outcomes.append(Board.outcome(board)[1])
@@ -283,31 +267,6 @@ class Network:
 
                 return outcomes
 
-            elif method == 'dumbmodel':
-                outcomes = []
-                """
-                config_prime = self.config.copy()
-                config_prime['model_path'] = os.path.join(config_prime['model_storage_path'], 'dumbmodel')
-                eval_bot = Bot(1, config = config_prime, name = "dumbmodel")
-                #print(self.config, "\n", config_prime)
-                outcomes = []
-                for i in range(1, episodes + 1):
-                    sys.stderr.write("[EVAL ] Episode {}".format(i))
-                    board = Board.initial_state
-                    while Board.outcome(board) is None:
-                        roll = (random.randrange(1,7), random.randrange(1,7))
-                        board = (self.make_move(board, self.p1.get_sym(), roll))[0]
-
-                        roll = (random.randrange(1,7), random.randrange(1,7))
-                        board = Board.flip(eval_bot.make_move(Board.flip(board), self.p1.get_sym(), roll)[0])
-                    sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1]))
-                    outcomes.append(Board.outcome(board)[1])
-                    sys.stderr.write("\n")
-
-                    if i % 50 == 0:
-                        print_time_estimate(i)
-                """
-                return outcomes
             else:
                 sys.stderr.write("[EVAL ] Evaluation method '{}' is not defined\n".format(method))
                 return [0]
@@ -363,27 +322,20 @@ class Network:
         sys.stderr.write("[TRAIN] Training {} episodes and save_step_size {}\n".format(episodes, save_step_size))
 
         outcomes = []
         for episode in range(1, episodes + 1):
+            sys.stderr.write("[TRAIN] Episode {}".format(episode + trained_eps))
             # TODO decide which player should be here
             player = 1
-
             prev_board = Board.initial_state
-
-            # find the best move here, make this move, then change turn as the
-            # first thing inside of the while loop and then call
-            # best_move_and_score to get V_t+1
-
            i = 0
             while Board.outcome(prev_board) is None:
                 i += 1
-                #print("PREEEV_BOOOOAAARD:",prev_board)
                 cur_board, cur_board_value = self.make_move(sess, prev_board,
-                                                            (random.randrange(1, 7), random.randrange(1, 7)), player)
-
-                #print("The current value:",cur_board_value)
+                                                            (random.randrange(1, 7), random.randrange(1, 7)),
+                                                            player)
 
                 # adjust weights
                 sess.run(self.training_op,
@@ -392,7 +344,6 @@ class Network:
 
                 player *= -1
 
-
                 prev_board = cur_board
 
             final_board = prev_board
@@ -400,8 +351,6 @@ class Network:
             outcomes.append(Board.outcome(final_board)[1])
             final_score = np.array([Board.outcome(final_board)[1]])
             scaled_final_score = ((final_score + 2) / 4)
-            #print("The difference in values:", scaled_final_score - cur_board_value)
-            # print("scaled_final_score",scaled_final_score)
 
             with tf.name_scope("final"):
                 merged = tf.summary.merge_all()
@@ -424,12 +373,4 @@ class Network:
 
         writer.close()
 
-        return outcomes
-
-
-    # take turn, which finds the best state and picks it, based on the current network
-    # save current state
-    # run training operation (session.run(self.training_op, {x:x, value_next, value_next})), (something which does the backprop, based on the state after having taken a turn, found before, and the state we saved in the beginning and from now we'll save it at the end of the turn
-    # save the current state again, so we can continue running backprop based on the "previous" turn.
-
-    # NOTE: We need to make a method so that we can take a single turn or at least just pick the next best move, so we know how to evaluate according to TD-learning. Right now, our game just continues in a while loop without nothing to stop it!
+        return outcomes
\ No newline at end of file
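
The staircase schedule configured in __init__ above can also be written out as plain arithmetic. The sketch below is not part of the patch, and the function name and keyword defaults are only illustrative, but the formula is what tf.train.exponential_decay with staircase=True, wrapped in tf.maximum, computes:

def decayed_learning_rate(global_step, max_lr=0.1, min_lr=0.001, decay_steps=50000, decay_rate=0.96):
    # The rate drops by 4% every 50,000 global steps and is clamped at min_lr.
    return max(min_lr, max_lr * decay_rate ** (global_step // decay_steps))

Training therefore starts at a learning rate of 0.1 and can never fall below 0.001, no matter how large global_step grows.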
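
The heart of calculate_2_ply is an average, over the 21 distinct dice rolls, of the value of the opponent's best reply. A minimal standalone sketch of that scoring step follows; evaluate and legal_states are hypothetical stand-ins for self.eval_state/self.board_trans_func and Board.calculate_legal_states and are not part of the patch:

def gen_21_rolls():
    # The 21 distinct rolls when order is ignored: [1, 1], [1, 2], ..., [6, 6].
    return [[x, y] for x in range(1, 7) for y in range(1, 7) if x <= y]

def two_ply_score(board, player, evaluate, legal_states):
    # For every distinct roll, let the opponent pick its best reply, then average the results.
    opponent = player * -1
    roll_scores = []
    for roll in gen_21_rolls():
        replies = legal_states(board, opponent, roll)
        roll_scores.append(max(evaluate(reply, opponent) for reply in replies))
    return sum(roll_scores) / len(roll_scores)

In the patch this score is only computed for the 15 best single-ply moves, which keeps the branching factor manageable; note that the plain average treats each of the 21 unordered rolls as equally likely.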
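
The comments deleted from eval_state describe the update rule that training_op is built around: scale each gradient by the learning rate and by the difference between consecutive value estimates. A plain-Python sketch of that rule, with illustrative names that do not appear in the patch:

def td_zero_update(weights, gradients, value, value_next, learning_rate):
    # w <- w + learning_rate * (V_next - V) * dV/dw, i.e. the Tesauro-style rule without eligibility traces.
    difference_in_values = value_next - value
    return [w + learning_rate * difference_in_values * g for w, g in zip(weights, gradients)]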
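
At the end of an episode, the training loop rescales the final game outcome with ((final_score + 2) / 4). Written out with an illustrative helper (not part of the patch), the effect is to map an outcome in [-2, 2] onto [0, 1], presumably to match the range of the value the network predicts:

def scale_outcome(outcome):
    # Maps -2 -> 0.0, -1 -> 0.25, 1 -> 0.75 and 2 -> 1.0.
    return (outcome + 2) / 4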