import tensorflow as tf
import numpy as np
from board import Board
import os
import time
import sys
import random
from eval import Eval
import glob
from operator import itemgetter


class Network:
    """
    Feed-forward value network for board states with a single hidden layer.

    The graph is built with the TensorFlow 1.x API. Weights are updated by
    adding ``learning_rate * (value_next - value) * d(value)/d(weight)`` to
    every trainable variable (see ``__init__``).
    """

    # board_features_quack has size 28
    # board_features_quack_fat has size 30
    # board_features_tesauro has size 198
    board_reps = {
        'quack-fat': (30, Board.board_features_quack_fat),
        'quack': (28, Board.board_features_quack),
        'tesauro': (198, Board.board_features_tesauro)
    }

    def custom_tanh(self, x, name=None):
        """
        Return ``2 * tanh(x)`` as a tensor.

        :param x: Input tensor
        :param name: Optional name passed on to ``tf.tanh``
        :return: The scaled tanh tensor
        """
        return tf.scalar_mul(tf.constant(2.00), tf.tanh(x, name))

    def __init__(self, config, name):
        """
        Build the TensorFlow graph for the network.

        :param config: Dict that must contain 'model_storage_path', 'model'
                       and 'board_representation' (a key of ``board_reps``).
                       'eval_methods' is read later by ``eval``.
        :param name: Label used in log output
        """
        self.config = config
        self.checkpoint_path = os.path.join(config['model_storage_path'], config['model'])
        self.name = name

        # Set board representation from config
        self.input_size, self.board_trans_func = Network.board_reps[
            self.config['board_representation']
        ]
        self.output_size = 1
        self.hidden_size = 40
        self.max_learning_rate = 0.1
        self.min_learning_rate = 0.001

        self.global_step = tf.Variable(0, trainable=False, name="global_step")
        # Learning rate decays by 0.96 every 50000 steps but never drops
        # below min_learning_rate.
        self.learning_rate = tf.maximum(self.min_learning_rate,
                                        tf.train.exponential_decay(self.max_learning_rate,
                                                                   self.global_step,
                                                                   50000,
                                                                   0.96,
                                                                   staircase=True),
                                        name="learning_rate")

        # Restore trained episode count for model
        episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
        if os.path.isfile(episode_count_path):
            with open(episode_count_path, 'r') as f:
                self.episodes_trained = int(f.read())
        else:
            self.episodes_trained = 0

        self.x = tf.placeholder('float', [1, self.input_size], name='input')
        self.value_next = tf.placeholder('float', [1, self.output_size], name="value_next")

        xavier_init = tf.contrib.layers.xavier_initializer()

        W_1 = tf.get_variable("w_1", (self.input_size, self.hidden_size),
                              initializer=xavier_init)
        W_2 = tf.get_variable("w_2", (self.hidden_size, self.output_size),
                              initializer=xavier_init)

        b_1 = tf.get_variable("b_1", (self.hidden_size,),
                              initializer=tf.zeros_initializer)
        b_2 = tf.get_variable("b_2", (self.output_size,),
                              initializer=tf.zeros_initializer)

        value_after_input = tf.sigmoid(tf.matmul(self.x, W_1) + b_1, name='hidden_layer')

        self.value = tf.sigmoid(tf.matmul(value_after_input, W_2) + b_2, name='output_layer')

        # TODO: Alexander thinks that self.value will be computed twice (instead of once)
        difference_in_values = tf.reshape(
            tf.subtract(self.value_next, self.value, name='difference_in_values'), [])
        tf.summary.scalar("difference_in_values", tf.abs(difference_in_values))

        trainable_vars = tf.trainable_variables()
        gradients = tf.gradients(self.value, trainable_vars)

        apply_gradients = []

        global_step_op = self.global_step.assign_add(1)

        with tf.variable_scope('apply_gradients'):
            for gradient, trainable_var in zip(gradients, trainable_vars):
                # Each variable is moved along its own gradient, scaled by the
                # learning rate and the difference between the two values.
                backprop_calc = self.learning_rate * difference_in_values * gradient
                grad_apply = trainable_var.assign_add(backprop_calc)
                apply_gradients.append(grad_apply)

        # Running training_op also increments global_step.
        with tf.control_dependencies([global_step_op]):
            self.training_op = tf.group(*apply_gradients, name='training_op')

        self.saver = tf.train.Saver(max_to_keep=1)

    @staticmethod
    def _print_time_estimate(tag, start_time, eps_completed, total_eps):
        """
        Write throughput and remaining-time estimates to stderr.

        :param tag: Log prefix, e.g. "[EVAL ]" or "[TRAIN]"
        :param start_time: ``time.time()`` value taken when the run started
        :param eps_completed: Number of episodes finished so far (must be > 0)
        :param total_eps: Total number of episodes in the run
        """
        time_diff = time.time() - start_time
        eps_per_sec = eps_completed / time_diff
        secs_per_ep = time_diff / eps_completed
        eps_remaining = (total_eps - eps_completed)
        sys.stderr.write(
            "{tag} Averaging {per_sec} episodes per second\n".format(
                tag=tag, per_sec=round(eps_per_sec, 2)))
        sys.stderr.write(
            "{tag} {eps_remaining} episodes remaining; approx. {time_remaining} seconds remaining\n".format(
                tag=tag,
                eps_remaining=eps_remaining,
                time_remaining=int(eps_remaining * secs_per_ep)))

    def eval_state(self, sess, state):
        """
        Run the network on a single feature vector.

        :param sess: TensorFlow session holding the model variables
        :param state: Features fed to the [1, input_size] input placeholder
        :return: Result of evaluating ``self.value`` for ``state``
        """
        return sess.run(self.value, feed_dict={self.x: state})

    def save_model(self, sess, episode_count, global_step):
        """
        Write a checkpoint and the number of trained episodes to disk.

        :param sess: TensorFlow session holding the model variables
        :param episode_count: Value written to the "episodes_trained" file
        :param global_step: Step number passed on to the saver
        """
        # Make sure the target directory exists before anything is written.
        os.makedirs(self.checkpoint_path, exist_ok=True)
        model_path = os.path.join(self.checkpoint_path, 'model.ckpt')
        self.saver.save(sess, model_path, global_step=global_step)
        with open(os.path.join(self.checkpoint_path, "episodes_trained"), 'w+') as f:
            print("[NETWK] ({name}) Saving model to:".format(name=self.name), model_path)
            f.write(str(episode_count) + "\n")

    def restore_model(self, sess):
        """
        Restore the latest checkpoint, if any, into ``sess``.

        When a checkpoint exists, the restored trainable variables are printed
        and, if an "episodes_trained" file is present, its value is stored in
        ``self.config['start_episode']``. Does nothing when no checkpoint
        index file is found.

        :param sess: TensorFlow session to restore the variables into
        """
        if glob.glob(os.path.join(self.checkpoint_path, 'model.ckpt*.index')):
            latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_path)
            print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
                  str(latest_checkpoint))
            self.saver.restore(sess, latest_checkpoint)

            variables_names = [v.name for v in tf.trainable_variables()]
            values = sess.run(variables_names)
            for k, v in zip(variables_names, values):
                print("Variable: ", k)
                print("Shape: ", v.shape)
                print(v)

            # Restore trained episode count for model
            episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
            if os.path.isfile(episode_count_path):
                with open(episode_count_path, 'r') as f:
                    self.config['start_episode'] = int(f.read())

    def make_move(self, sess, board, roll, player):
        """
        Pick the legal successor state the network rates best for ``player``.

        For a positive ``player`` the raw network value is used for ranking;
        otherwise ``1 - value`` is used. The returned score is always the raw
        (unflipped) network value.

        :param sess: TensorFlow session holding the model variables
        :param board: The current board
        :param roll: The dice roll to play
        :param player: The current player
        :return: Tuple (best board, raw network value of that board)
        """
        legal_moves = Board.calculate_legal_states(board, player, roll)
        moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player)))
                            for move in legal_moves]
        scores = [x[1] if np.sign(player) > 0 else 1 - x[1] for x in moves_and_scores]
        best_score_index = np.array(scores).argmax()
        best_move_pair = moves_and_scores[best_score_index]
        return best_move_pair

    def gen_21_rolls(self):
        """
        Calculate all possible rolls, [[1,1], [1,2] ..]

        A roll and its mirror ([x, y] and [y, x]) are only listed once.

        :return: All possible rolls
        """
        return [[x, y] for x in range(1, 7) for y in range(x, 7)]

    def calculate_2_ply(self, sess, board, roll, player):
        """
        Find the best move based on a 2-ply look-ahead. First the best move
        is found for a single ply and then an exhaustive search is performed
        on the best 15 moves from the single ply.

        :param sess: TensorFlow session holding the model variables
        :param board: The current board
        :param roll: The original roll
        :param player: The current player
        :return: [best board, its averaged look-ahead score]
        """
        init_legal_states = Board.calculate_legal_states(board, player, roll)
        zero_ply_moves_and_scores = [
            (move, self.eval_state(sess, self.board_trans_func(move, player)))
            for move in init_legal_states
        ]

        # Sort ascending and flip, so the highest rated states come first,
        # then keep the top fifteen.
        # NOTE(review): unlike make_move, the ranking here uses the raw value
        # for both players, and every roll is weighted equally below --
        # confirm that this is intended.
        best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1))[::-1]
        best_fifteen_boards = [x[0] for x in best_fifteen[:15]]

        all_rolls = self.gen_21_rolls()
        opponent = player * -1

        all_rolls_scores = []
        for a_board in best_fifteen_boards:
            a_board_scores = []
            for next_roll in all_rolls:
                all_rolls_boards = Board.calculate_legal_states(a_board, opponent, next_roll)
                # Keep a flat list of scores. The previous code wrapped this
                # list inside another list, so max() returned the inner list
                # and the averaging below raised a TypeError.
                spec_roll_scores = [
                    self.eval_state(sess, self.board_trans_func(new_board, opponent))
                    for new_board in all_rolls_boards
                ]
                a_board_scores.append(max(spec_roll_scores))
            all_rolls_scores.append(sum(a_board_scores) / len(a_board_scores))

        best_score_index = np.array(all_rolls_scores).argmax()
        best_board = best_fifteen_boards[best_score_index]
        return [best_board, max(all_rolls_scores)]

    def eval(self, episode_count, trained_eps = 0, tf_session = None):
        """
        Used to evaluate a model. Can either use pubeval, a model playing at
        an intermediate level, or dumbeval a model which has been given random
        weights, so it acts deterministically random.

        :param episode_count: The amount of episodes to run
        :param trained_eps: The amount of episodes the model we want to evaluate, has trained
        :param tf_session: Session to use; when None a new session is created
                           and the latest checkpoint is restored into it
        :return: outcomes: List of (method, outcomes of the evaluation session)
                 for every method in config['eval_methods']
        """

        def do_eval(sess, method, episodes = 1000, trained_eps = 0):
            """
            Do the actual evaluation

            :param sess: TensorFlow session holding the model variables
            :param method: Either pubeval or dumbeval
            :param episodes: Amount of episodes to use in the evaluation
            :param trained_eps: Unused
            :return: outcomes : Described above; [0] for an unknown method
            """
            start_time = time.time()

            sys.stderr.write(
                "[EVAL ] Evaluating {eps} episode(s) with method '{method}'\n".format(
                    eps=episodes, method=method))

            # Both methods play the same game loop and only differ in the
            # function that moves for the opponent.
            if method == 'pubeval':
                opponent_move = Eval.make_pubeval_move
            elif method == 'dumbeval':
                opponent_move = Eval.make_dumbeval_move
            else:
                sys.stderr.write("[EVAL ] Evaluation method '{}' is not defined\n".format(method))
                return [0]

            outcomes = []
            for i in range(1, episodes + 1):
                sys.stderr.write("[EVAL ] Episode {}".format(i))
                board = Board.initial_state
                while Board.outcome(board) is None:
                    roll = (random.randrange(1, 7), random.randrange(1, 7))
                    board = (self.make_move(sess, board, roll, 1))[0]

                    # The network's move may have ended the game; the opponent
                    # must not move on a finished board.
                    if Board.outcome(board) is not None:
                        break

                    roll = (random.randrange(1, 7), random.randrange(1, 7))
                    board = opponent_move(board, -1, roll)[0][0:26]

                outcome = Board.outcome(board)[1]
                sys.stderr.write("\t outcome {}".format(outcome))
                outcomes.append(outcome)
                sys.stderr.write("\n")

                if i % 10 == 0:
                    self._print_time_estimate("[EVAL ]", start_time, i, episodes)

            return outcomes

        def run_all_methods(sess):
            """Evaluate with every configured method using ``sess``."""
            return [(method, do_eval(sess,
                                     method,
                                     episode_count,
                                     trained_eps = trained_eps))
                    for method in self.config['eval_methods']]

        if tf_session is None:
            with tf.Session() as session:
                session.run(tf.global_variables_initializer())
                self.restore_model(session)
                return run_all_methods(session)

        return run_all_methods(tf_session)

    def train_model(self, episodes=1000, save_step_size=100, trained_eps=0):
        """
        Train the network through self-play and save checkpoints on the way.

        :param episodes: The amount of episodes to train
        :param save_step_size: Save the model every this many episodes
        :param trained_eps: Episodes trained previously; used as an offset for
                            logging and for the stored episode count
        :return: outcomes: The outcome of every trained episode
        """
        with tf.Session() as sess:
            writer = tf.summary.FileWriter("/tmp/log/tf", sess.graph)
            try:
                sess.run(tf.global_variables_initializer())
                self.restore_model(sess)

                variables_names = [v.name for v in tf.trainable_variables()]
                values = sess.run(variables_names)
                for k, v in zip(variables_names, values):
                    print("Variable: ", k)
                    print("Shape: ", v.shape)
                    print(v)

                # Build the merged summary op once. Creating it inside the
                # episode loop adds a new op to the graph for every episode.
                with tf.name_scope("final"):
                    merged = tf.summary.merge_all()

                start_time = time.time()

                sys.stderr.write("[TRAIN] Training {} episodes and save_step_size {}\n".format(
                    episodes, save_step_size))
                outcomes = []

                # Defined up front so the final save also works when no
                # episode is run at all.
                episode = 0
                global_step = sess.run(self.global_step)

                for episode in range(1, episodes + 1):
                    sys.stderr.write("[TRAIN] Episode {}".format(episode + trained_eps))
                    # TODO decide which player should be here
                    player = 1

                    prev_board = Board.initial_state
                    i = 0
                    while Board.outcome(prev_board) is None:
                        i += 1
                        cur_board, cur_board_value = self.make_move(
                            sess,
                            prev_board,
                            (random.randrange(1, 7), random.randrange(1, 7)),
                            player)

                        # adjust weights: move the value of the previous state
                        # towards the value of the chosen next state
                        sess.run(self.training_op,
                                 feed_dict={self.x: self.board_trans_func(prev_board, player),
                                            self.value_next: cur_board_value})

                        player *= -1
                        prev_board = cur_board

                    final_board = prev_board
                    final_outcome = Board.outcome(final_board)[1]
                    sys.stderr.write("\t outcome {}\t turns {}".format(final_outcome, i))
                    outcomes.append(final_outcome)

                    # Shift and scale the outcome with (x + 2) / 4 before it
                    # is used as the target for the final state.
                    final_score = np.array([final_outcome])
                    scaled_final_score = ((final_score + 2) / 4)

                    global_step, summary, _ = sess.run(
                        [self.global_step, merged, self.training_op],
                        feed_dict={self.x: self.board_trans_func(prev_board, player),
                                   self.value_next: scaled_final_score.reshape((1, 1))})
                    writer.add_summary(summary, episode + trained_eps)

                    sys.stderr.write("\n")

                    if episode % min(save_step_size, episodes) == 0:
                        sys.stderr.write("[TRAIN] Saving model...\n")
                        self.save_model(sess, episode + trained_eps, global_step)

                    if episode % 50 == 0:
                        self._print_time_estimate("[TRAIN]", start_time, episode, episodes)

                sys.stderr.write("[TRAIN] Saving model for final episode...\n")
                self.save_model(sess, episode + trained_eps, global_step)
            finally:
                # Flush and release the event file even if training fails.
                writer.close()

            return outcomes