From 98c9af72e7a0c9332f31ab7c88987851630eca1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoffer=20M=C3=BCller=20Madsen?= Date: Thu, 22 Mar 2018 15:30:47 +0100 Subject: [PATCH 01/17] rework network --- main.py | 46 +++++------ network.py | 230 +++++++++++++++++++++++++++++------------------------ plot.py | 2 +- 3 files changed, 151 insertions(+), 127 deletions(-) diff --git a/main.py b/main.py index 8d42e2b..bc8de09 100644 --- a/main.py +++ b/main.py @@ -102,29 +102,29 @@ if args.list_models: sys.stderr.write(" {name}: {eps_trained}\n".format(name = model[0], eps_trained = model[1])) exit() - -# Set up network -from network import Network -network = Network(config, config['model']) -eps = config['start_episode'] -# Set up variables -episode_count = config['episode_count'] +if __name__ == "__main__": + # Set up network + from network import Network + network = Network(config, config['model']) + start_episode = network.episodes_trained + + # Set up variables + episode_count = config['episode_count'] -if args.train: - while True: - train_outcome = network.train_model(episodes = episode_count, trained_eps = eps) - eps += episode_count - log_train_outcome(train_outcome, trained_eps = eps) - if config['eval_after_train']: - eval_outcomes = network.eval(trained_eps = eps) - log_eval_outcomes(eval_outcomes, trained_eps = eps) - if not config['train_perpetually']: - break -elif args.eval: - eps = config['start_episode'] - outcomes = network.eval() - log_eval_outcomes(outcomes, trained_eps = eps) -#elif args.play: -# g.play(episodes = episode_count) + if args.train: + while True: + train_outcome = network.train_model(episodes = episode_count, trained_eps = start_episode) + start_episode += episode_count + log_train_outcome(train_outcome, trained_eps = start_episode) + if config['eval_after_train']: + eval_outcomes = network.eval(trained_eps = start_episode) + log_eval_outcomes(eval_outcomes, trained_eps = start_episode) + if not config['train_perpetually']: + break + elif args.eval: + outcomes = network.eval() + log_eval_outcomes(outcomes, trained_eps = start_episode) + # elif args.play: + # g.play(episodes = episode_count) diff --git a/network.py b/network.py index 62b1d17..f058d48 100644 --- a/network.py +++ b/network.py @@ -13,7 +13,7 @@ class Network: input_size = 26 output_size = 1 # Can't remember the best learning_rate, look this up - learning_rate = 0.1 + learning_rate = 0.05 # TODO: Actually compile tensorflow properly #os.environ["TF_CPP_MIN_LOG_LEVEL"]="2" @@ -23,12 +23,20 @@ class Network: def __init__(self, config, name): self.config = config - self.session = tf.Session() self.checkpoint_path = config['model_path'] + self.name = name + + # Restore trained episode count for model + episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained") + if os.path.isfile(episode_count_path): + with open(episode_count_path, 'r') as f: + self.episodes_trained = int(f.read()) + else: + self.episodes_trained = 0 # input = x - self.x = tf.placeholder('float', [1, Network.input_size], name='x') + self.x = tf.placeholder('float', [1, Network.input_size], name='input') self.value_next = tf.placeholder('float', [1, Network.output_size], name="value_next") xavier_init = tf.contrib.layers.xavier_initializer() @@ -43,20 +51,22 @@ class Network: b_2 = tf.get_variable("b_2", (Network.output_size,), initializer=tf.zeros_initializer) - value_after_input = self.custom_tanh(tf.matmul(self.x, W_1) + b_1, name='hidden_layer') + normalized_input = tf.nn.l2_normalize(self.x) + value_after_input = 
tf.sigmoid(tf.matmul(normalized_input, W_1) + b_1, name='hidden_layer') - self.value = self.custom_tanh(tf.matmul(value_after_input, W_2) + b_2, name='output_layer') + self.value = tf.sigmoid(tf.matmul(value_after_input, W_2) + b_2, name='output_layer') # tf.reduce_sum basically finds the sum of its input, so this gives the # difference between the two values, in case they should be lists, which # they might be if our input changes # TODO: Alexander thinks that self.value will be computed twice (instead of once) - difference_in_values = tf.reduce_sum(self.value_next - self.value, name='difference') + difference_in_values = tf.reshape(tf.subtract(self.value_next, self.value, name='difference_in_values'), []) + tf.summary.scalar("difference_in_values", tf.abs(difference_in_values)) trainable_vars = tf.trainable_variables() gradients = tf.gradients(self.value, trainable_vars) - + apply_gradients = [] with tf.variable_scope('apply_gradients'): @@ -67,13 +77,10 @@ class Network: apply_gradients.append(grad_apply) self.training_op = tf.group(*apply_gradients, name='training_op') - - self.saver = tf.train.Saver(max_to_keep=1) - self.session.run(tf.global_variables_initializer()) - self.restore_model() + self.saver = tf.train.Saver(max_to_keep=1) - def eval_state(self, state): + def eval_state(self, sess, state): # Run state through a network # Remember to create placeholders for everything because wtf tensorflow @@ -107,25 +114,25 @@ class Network: # print("Network is evaluating") - val = self.session.run(self.value, feed_dict={self.x: state}) #print("eval ({})".format(self.name), state, val, sep="\n") - return val + return sess.run(self.value, feed_dict={self.x: state}) - def save_model(self, episode_count): - self.saver.save(self.session, os.path.join(self.checkpoint_path, 'model.ckpt')) + + def save_model(self, sess, episode_count): + self.saver.save(sess, os.path.join(self.checkpoint_path, 'model.ckpt')) with open(os.path.join(self.checkpoint_path, "episodes_trained"), 'w+') as f: print("[NETWK] ({name}) Saving model to:".format(name = self.name), os.path.join(self.checkpoint_path, 'model.ckpt')) f.write(str(episode_count) + "\n") - def restore_model(self): + def restore_model(self, sess): if os.path.isfile(os.path.join(self.checkpoint_path, 'model.ckpt.index')): latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_path) print("[NETWK] ({name}) Restoring model from:".format(name = self.name), str(latest_checkpoint)) - self.saver.restore(self.session, latest_checkpoint) + self.saver.restore(sess, latest_checkpoint) variables_names = [v.name for v in tf.trainable_variables()] - values = self.session.run(variables_names) + values = sess.run(variables_names) for k, v in zip(variables_names, values): print("Variable: ", k) print("Shape: ", v.shape) @@ -137,26 +144,10 @@ class Network: with open(episode_count_path, 'r') as f: self.config['start_episode'] = int(f.read()) - # Have a circular dependency, #fuck, need to rewrite something - def adjust_weights(self, board, v_next): -# print("lol") - board = np.array(board).reshape((1,26)) - self.session.run(self.training_op, feed_dict = { self.x: board, - self.value_next: v_next }) - - - # while game isn't done: - #x_next = g.next_move() - #value_next = network.eval_state(x_next) - #self.session.run(self.training_op, feed_dict={self.x: x, self.value_next: value_next}) - #x = x_next - - - - def make_move(self, board, roll): + def make_move(self, sess, board, roll): # print(Board.pretty(board)) legal_moves = Board.calculate_legal_states(board, 1, 
roll) - moves_and_scores = [ (move, self.eval_state(np.array(move).reshape(1,26))) for move in legal_moves ] + moves_and_scores = [ (move, self.eval_state(sess, np.array(move).reshape(1,26))) for move in legal_moves ] scores = [ x[1] for x in moves_and_scores ] best_score_index = np.array(scores).argmax() best_move_pair = moves_and_scores[best_score_index] @@ -165,73 +156,101 @@ class Network: def train_model(self, episodes=1000, save_step_size = 100, trained_eps = 0): - start_time = time.time() + with tf.Session() as sess: + writer = tf.summary.FileWriter("/tmp/log/tf", sess.graph) + + sess.run(tf.global_variables_initializer()) + self.restore_model(sess) + + variables_names = [v.name for v in tf.trainable_variables()] + values = sess.run(variables_names) + for k, v in zip(variables_names, values): + print("Variable: ", k) + print("Shape: ", v.shape) + print(v) - def print_time_estimate(eps_completed): - cur_time = time.time() - time_diff = cur_time - start_time - eps_per_sec = eps_completed / time_diff - secs_per_ep = time_diff / eps_completed - eps_remaining = (episodes - eps_completed) - sys.stderr.write("[TRAIN] Averaging {per_sec} episodes per second\n".format(per_sec = round(eps_per_sec, 2))) - sys.stderr.write("[TRAIN] {eps_remaining} episodes remaining; approx. {time_remaining} seconds remaining\n".format(eps_remaining = eps_remaining, time_remaining = int(eps_remaining * secs_per_ep))) + start_time = time.time() + + def print_time_estimate(eps_completed): + cur_time = time.time() + time_diff = cur_time - start_time + eps_per_sec = eps_completed / time_diff + secs_per_ep = time_diff / eps_completed + eps_remaining = (episodes - eps_completed) + sys.stderr.write("[TRAIN] Averaging {per_sec} episodes per second\n".format(per_sec = round(eps_per_sec, 2))) + sys.stderr.write("[TRAIN] {eps_remaining} episodes remaining; approx. 
{time_remaining} seconds remaining\n".format(eps_remaining = eps_remaining, time_remaining = int(eps_remaining * secs_per_ep))) - sys.stderr.write("[TRAIN] Training {} episodes and save_step_size {}\n".format(episodes, save_step_size)) - outcomes = [] - for episode in range(1, episodes + 1): - sys.stderr.write("[TRAIN] Episode {}".format(episode + trained_eps)) - # TODO decide which player should be here - player = 1 + sys.stderr.write("[TRAIN] Training {} episodes and save_step_size {}\n".format(episodes, save_step_size)) + outcomes = [] + for episode in range(1, episodes + 1): + sys.stderr.write("[TRAIN] Episode {}".format(episode + trained_eps)) + # TODO decide which player should be here + player = 1 - roll = (random.randrange(1,7), random.randrange(1,7)) - prev_board, _ = self.make_move(Board.flip(Board.initial_state) if player == -1 else Board.initial_state, roll) - if player == -1: - prev_board = Board.flip(prev_board) - - # find the best move here, make this move, then change turn as the - # first thing inside of the while loop and then call - # best_move_and_score to get V_t+1 - - # i = 0 - while Board.outcome(prev_board) is None: - # print("-"*30) - # print(i) - # print(roll) - # print(Board.pretty(prev_board)) - # print("/"*30) - # i += 1 - - player *= -1 roll = (random.randrange(1,7), random.randrange(1,7)) - - cur_board, cur_board_value = self.make_move(Board.flip(prev_board) if player == -1 else prev_board, roll) + prev_board, _ = self.make_move(sess, Board.flip(Board.initial_state) if player == -1 else Board.initial_state, roll) if player == -1: - cur_board = Board.flip(cur_board) - - self.adjust_weights(prev_board, cur_board_value) - - prev_board = cur_board - - final_board = prev_board - sys.stderr.write("\t outcome {}".format(Board.outcome(final_board)[1])) - outcomes.append(Board.outcome(final_board)[1]) - final_score = np.array([ Board.outcome(final_board)[1] ]) - self.adjust_weights(prev_board, final_score.reshape((1, 1))) - - sys.stderr.write("\n") + prev_board = Board.flip(prev_board) - if episode % min(save_step_size, episodes) == 0: - sys.stderr.write("[TRAIN] Saving model...\n") - self.save_model(episode+trained_eps) + # find the best move here, make this move, then change turn as the + # first thing inside of the while loop and then call + # best_move_and_score to get V_t+1 - if episode % 50 == 0: - print_time_estimate(episode) + # i = 0 + while Board.outcome(prev_board) is None: + # print("-"*30) + # print(i) + # print(roll) + # print(Board.pretty(prev_board)) + # print("/"*30) + # i += 1 + + player *= -1 + roll = (random.randrange(1,7), random.randrange(1,7)) - sys.stderr.write("[TRAIN] Saving model for final episode...\n") - self.save_model(episode+trained_eps) + cur_board, cur_board_value = self.make_move(sess, Board.flip(prev_board) if player == -1 else prev_board, roll) + if player == -1: + cur_board = Board.flip(cur_board) + + # print("cur_board_value:", cur_board_value) + + # adjust weights + sess.run(self.training_op, + feed_dict = { self.x: np.array(prev_board).reshape((1,26)), + self.value_next: cur_board_value }) + prev_board = cur_board + + final_board = prev_board + sys.stderr.write("\t outcome {}".format(Board.outcome(final_board)[1])) + outcomes.append(Board.outcome(final_board)[1]) + final_score = np.array([ Board.outcome(final_board)[1] ]) + scaled_final_score = ((final_score + 2) / 4) + + # print("scaled_final_score",scaled_final_score) + + with tf.name_scope("final"): + merged = tf.summary.merge_all() + summary, _ = sess.run([merged, 
self.training_op], + feed_dict = { self.x: np.array(prev_board).reshape((1,26)), + self.value_next: scaled_final_score.reshape((1, 1)) }) + writer.add_summary(summary, episode + trained_eps) + + sys.stderr.write("\n") + + if episode % min(save_step_size, episodes) == 0: + sys.stderr.write("[TRAIN] Saving model...\n") + self.save_model(sess, episode+trained_eps) + + if episode % 50 == 0: + print_time_estimate(episode) + + sys.stderr.write("[TRAIN] Saving model for final episode...\n") + self.save_model(sess, episode+trained_eps) + + writer.close() - return outcomes + return outcomes # take turn, which finds the best state and picks it, based on the current network @@ -244,7 +263,7 @@ class Network: def eval(self, trained_eps = 0): - def do_eval(method, episodes = 1000, trained_eps = 0): + def do_eval(sess, method, episodes = 1000, trained_eps = 0): start_time = time.time() def print_time_estimate(eps_completed): @@ -265,7 +284,7 @@ class Network: board = Board.initial_state while Board.outcome(board) is None: roll = (random.randrange(1,7), random.randrange(1,7)) - board = (self.p1.make_move(board, self.p1.get_sym(), roll))[0] + board = (self.p1.make_move(sess, board, self.p1.get_sym(), roll))[0] roll = (random.randrange(1,7), random.randrange(1,7)) board = Board.flip(Eval.make_random_move(Board.flip(board), 1, roll)) sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1])) @@ -288,7 +307,7 @@ class Network: #print(roll) prev_board = tuple(board) - board = (self.make_move(board, roll))[0] + board = (self.make_move(sess, board, roll))[0] #print("post p1:", board, sep="\n") #print("."*30) @@ -336,9 +355,14 @@ class Network: else: sys.stderr.write("[EVAL ] Evaluation method '{}' is not defined\n".format(method)) return [0] - - return [ (method, do_eval(method, - self.config['episode_count'], - trained_eps = trained_eps)) - for method - in self.config['eval_methods'] ] + + with tf.Session() as session: + session .run(tf.global_variables_initializer()) + self.restore_model(session) + outcomes = [ (method, do_eval(session, + method, + self.config['episode_count'], + trained_eps = trained_eps)) + for method + in self.config['eval_methods'] ] + return outcomes diff --git a/plot.py b/plot.py index 8261cde..5a94f51 100644 --- a/plot.py +++ b/plot.py @@ -44,7 +44,7 @@ if __name__ == '__main__': plt.show() while True: - df = dataframes('default')['eval'] + df = dataframes('a')['eval'] print(df) From 1f1e806306eb0aded61c2582f416b55655145d94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoffer=20M=C3=BCller=20Madsen?= Date: Mon, 26 Mar 2018 15:55:48 +0200 Subject: [PATCH 02/17] fix errant whitespace --- network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/network.py b/network.py index f058d48..d32c6b9 100644 --- a/network.py +++ b/network.py @@ -357,7 +357,7 @@ class Network: return [0] with tf.Session() as session: - session .run(tf.global_variables_initializer()) + session.run(tf.global_variables_initializer()) self.restore_model(session) outcomes = [ (method, do_eval(session, method, From 4c43bf19a3f38bf424922631157ea09f3902ea34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoffer=20M=C3=BCller=20Madsen?= Date: Mon, 26 Mar 2018 16:45:26 +0200 Subject: [PATCH 03/17] Add evaluation variance benchmark To do a benchmark for `pubeval`, run `python3 main.py --bench-eval-scores --eval-methods pubeval` Logs will be placed in directory `bench` Use `plot_bench(data_path)` in `plot.py` for plotting --- main.py | 159 +++++++++++++++++++++++++++++++++++++---------------- network.py 
| 28 +++++++--- plot.py | 12 ++++ 3 files changed, 143 insertions(+), 56 deletions(-) diff --git a/main.py b/main.py index bc8de09..b5a8ad0 100644 --- a/main.py +++ b/main.py @@ -3,38 +3,6 @@ import sys import os import time -model_storage_path = 'models' - -# Create models folder -if not os.path.exists(model_storage_path): - os.makedirs(model_storage_path) - -# Define helper functions -def log_train_outcome(outcome, trained_eps = 0): - format_vars = { 'trained_eps': trained_eps, - 'count': len(train_outcome), - 'sum': sum(train_outcome), - 'mean': sum(train_outcome) / len(train_outcome), - 'time': int(time.time()) - } - with open(os.path.join(config['model_path'], 'logs', "train.log"), 'a+') as f: - f.write("{time};{trained_eps};{count};{sum};{mean}".format(**format_vars) + "\n") - - -def log_eval_outcomes(outcomes, trained_eps = 0): - for outcome in outcomes: - scores = outcome[1] - format_vars = { 'trained_eps': trained_eps, - 'method': outcome[0], - 'count': len(scores), - 'sum': sum(scores), - 'mean': sum(scores) / len(scores), - 'time': int(time.time()) - } - with open(os.path.join(config['model_path'], 'logs', "eval.log"), 'a+') as f: - f.write("{time};{method};{trained_eps};{count};{sum};{mean}".format(**format_vars) + "\n") - - # Parse command line arguments parser = argparse.ArgumentParser(description="Backgammon games") parser.add_argument('--episodes', action='store', dest='episode_count', @@ -47,13 +15,15 @@ parser.add_argument('--eval-methods', action='store', default=['random'], nargs='*', help='specifies evaluation methods') parser.add_argument('--eval', action='store_true', - help='whether to evaluate the neural network with a random choice bot') + help='evaluate the neural network with a random choice bot') +parser.add_argument('--bench-eval-scores', action='store_true', + help='benchmark scores of evaluation measures. 
episode counts and model specified as options are ignored.') parser.add_argument('--train', action='store_true', - help='whether to train the neural network') + help='train the neural network') parser.add_argument('--eval-after-train', action='store_true', dest='eval_after_train', - help='whether to evaluate after each training session') + help='evaluate after each training session') parser.add_argument('--play', action='store_true', - help='whether to play with the neural network') + help='play with the neural network') parser.add_argument('--start-episode', action='store', dest='start_episode', type=int, default=0, help='episode count to start at; purely for display purposes') @@ -66,27 +36,73 @@ args = parser.parse_args() config = { 'model': args.model, - 'model_path': os.path.join(model_storage_path, args.model), 'episode_count': args.episode_count, 'eval_methods': args.eval_methods, 'train': args.train, 'play': args.play, 'eval': args.eval, + 'bench_eval_scores': args.bench_eval_scores, 'eval_after_train': args.eval_after_train, 'start_episode': args.start_episode, 'train_perpetually': args.train_perpetually, - 'model_storage_path': model_storage_path + 'model_storage_path': 'models', + 'bench_storage_path': 'bench' } +# Create models folder +if not os.path.exists(config['model_storage_path']): + os.makedirs(config['model_storage_path']) + +model_path = lambda: os.path.join(config['model_storage_path'], config['model']) + # Make sure directories exist -model_path = os.path.join(config['model_path']) -log_path = os.path.join(model_path, 'logs') -if not os.path.isdir(model_path): - os.mkdir(model_path) +log_path = os.path.join(model_path(), 'logs') +if not os.path.isdir(model_path()): + os.mkdir(model_path()) if not os.path.isdir(log_path): os.mkdir(log_path) + + + +# Define helper functions +def log_train_outcome(outcome, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "train.log")): + format_vars = { 'trained_eps': trained_eps, + 'count': len(train_outcome), + 'sum': sum(train_outcome), + 'mean': sum(train_outcome) / len(train_outcome), + 'time': int(time.time()) + } + with open(log_path, 'a+') as f: + f.write("{time};{trained_eps};{count};{sum};{mean}".format(**format_vars) + "\n") +def log_eval_outcomes(outcomes, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "eval.log")): + for outcome in outcomes: + scores = outcome[1] + format_vars = { 'trained_eps': trained_eps, + 'method': outcome[0], + 'count': len(scores), + 'sum': sum(scores), + 'mean': sum(scores) / len(scores), + 'time': int(time.time()) + } + with open(log_path, 'a+') as f: + f.write("{time};{method};{trained_eps};{count};{sum};{mean}".format(**format_vars) + "\n") + +def log_bench_eval_outcomes(outcomes, log_path, index, time, trained_eps = 0): + for outcome in outcomes: + scores = outcome[1] + format_vars = { 'trained_eps': trained_eps, + 'method': outcome[0], + 'count': len(scores), + 'sum': sum(scores), + 'mean': sum(scores) / len(scores), + 'time': time, + 'index': index, + } + with open(log_path, 'a+') as f: + f.write("{method};{count};{index};{time};{sum};{mean}".format(**format_vars) + "\n") + # Do actions specified by command-line if args.list_models: def get_eps_trained(folder): @@ -94,7 +110,7 @@ if args.list_models: return int(f.read()) model_folders = [ f.path for f - in os.scandir(model_storage_path) + in os.scandir(config['model_storage_path']) if f.is_dir() ] models = [ (folder, get_eps_trained(folder)) for folder in model_folders ] sys.stderr.write("Found {} 
model(s)\n".format(len(models))) @@ -106,13 +122,13 @@ if args.list_models: if __name__ == "__main__": # Set up network from network import Network - network = Network(config, config['model']) - start_episode = network.episodes_trained # Set up variables episode_count = config['episode_count'] if args.train: + network = Network(config, config['model']) + start_episode = network.episodes_trained while True: train_outcome = network.train_model(episodes = episode_count, trained_eps = start_episode) start_episode += episode_count @@ -122,9 +138,58 @@ if __name__ == "__main__": log_eval_outcomes(eval_outcomes, trained_eps = start_episode) if not config['train_perpetually']: break + + elif args.eval: - outcomes = network.eval() + network = Network(config, config['model']) + start_episode = network.episodes_trained + # Evaluation measures are described in `config` + outcomes = network.eval(config['episode_count']) log_eval_outcomes(outcomes, trained_eps = start_episode) # elif args.play: # g.play(episodes = episode_count) - + + + elif args.bench_eval_scores: + # Make sure benchmark directory exists + if not os.path.isdir(config['bench_storage_path']): + os.mkdir(config['bench_storage_path']) + + config = config.copy() + config['model'] = 'bench' + + network = Network(config, config['model']) + start_episode = network.episodes_trained + + if start_episode == 0: + print("Model not trained! Beware of using non-existing models!") + exit() + + sample_count = 20 + episode_counts = [25, 50, 100, 250, 500, 1000, 2500, 5000, + 10000, 20000] + + def do_eval(sess): + for eval_method in config['eval_methods']: + result_path = os.path.join(config['bench_storage_path'], + eval_method) + "-{}.log".format(int(time.time())) + for n in episode_counts: + for i in range(sample_count): + start_time = time.time() + # Evaluation measure to be benchmarked are described in `config` + outcomes = network.eval(episode_count = n, + tf_session = sess) + time_diff = time.time() - start_time + log_bench_eval_outcomes(outcomes, + time = time_diff, + index = i, + trained_eps = start_episode, + log_path = result_path) + + # CMM: oh no + import tensorflow as tf + with tf.Session() as session: + network.restore_model(session) + do_eval(session) + + diff --git a/network.py b/network.py index d32c6b9..d9a9f52 100644 --- a/network.py +++ b/network.py @@ -13,7 +13,7 @@ class Network: input_size = 26 output_size = 1 # Can't remember the best learning_rate, look this up - learning_rate = 0.05 + learning_rate = 0.01 # TODO: Actually compile tensorflow properly #os.environ["TF_CPP_MIN_LOG_LEVEL"]="2" @@ -23,7 +23,7 @@ class Network: def __init__(self, config, name): self.config = config - self.checkpoint_path = config['model_path'] + self.checkpoint_path = os.path.join(config['model_storage_path'], config['model']) self.name = name @@ -262,7 +262,7 @@ class Network: - def eval(self, trained_eps = 0): + def eval(self, episode_count, trained_eps = 0, tf_session = None): def do_eval(sess, method, episodes = 1000, trained_eps = 0): start_time = time.time() @@ -356,13 +356,23 @@ class Network: sys.stderr.write("[EVAL ] Evaluation method '{}' is not defined\n".format(method)) return [0] - with tf.Session() as session: - session.run(tf.global_variables_initializer()) - self.restore_model(session) - outcomes = [ (method, do_eval(session, + + if tf_session == None: + with tf.Session(): + session.run(tf.global_variables_initializer()) + self.restore_model(session) + outcomes = [ (method, do_eval(session, + method, + episode_count, + trained_eps = 
trained_eps)) + for method + in self.config['eval_methods'] ] + return outcomes + else: + outcomes = [ (method, do_eval(tf_session, method, - self.config['episode_count'], + episode_count, trained_eps = trained_eps)) for method in self.config['eval_methods'] ] - return outcomes + return outcomes diff --git a/plot.py b/plot.py index 5a94f51..c820c55 100644 --- a/plot.py +++ b/plot.py @@ -9,9 +9,21 @@ import matplotlib.dates as mdates train_headers = ['timestamp', 'eps_train', 'eps_trained_session', 'sum', 'mean'] eval_headers = ['timestamp', 'method', 'eps_train', 'eval_eps_used', 'sum', 'mean'] +bench_headers = ['method', 'sample_count', 'i', 'time', 'sum', 'mean'] model_path = 'models' +def plot_bench(data_path): + df = pd.read_csv(data_path, sep=";", + names=bench_headers, index_col=[0,1,2]) + for method_label in df.index.levels[0]: + cur_df = df.loc[method_label] + plot = df[['mean']].loc['pubeval'].unstack().T.plot.box() + plot.set_title("Evaluation variance, {}".format(method_label)) + plot.set_xlabel("Sample count") + plot.set_ylabel("Mean score") + plt.show(plot.figure) + del cur_df, plot def dataframes(model_name): def df_timestamp_to_datetime(df): From 9b2bbfb4d1a855f4fbcc4f3bed5f93929a0d7aaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoffer=20M=C3=BCller=20Madsen?= Date: Mon, 26 Mar 2018 17:06:12 +0200 Subject: [PATCH 04/17] print variances when plotting evaluation variance benchmark --- plot.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/plot.py b/plot.py index c820c55..5957854 100644 --- a/plot.py +++ b/plot.py @@ -17,13 +17,18 @@ def plot_bench(data_path): df = pd.read_csv(data_path, sep=";", names=bench_headers, index_col=[0,1,2]) for method_label in df.index.levels[0]: - cur_df = df.loc[method_label] - plot = df[['mean']].loc['pubeval'].unstack().T.plot.box() + df_prime = df[['mean']].loc[method_label].unstack().T + plot = df_prime.plot.box() plot.set_title("Evaluation variance, {}".format(method_label)) plot.set_xlabel("Sample count") plot.set_ylabel("Mean score") plt.show(plot.figure) - del cur_df, plot + + # for later use: + variances = df_prime.var() + print(variances) + + del df_prime, plot, variances def dataframes(model_name): def df_timestamp_to_datetime(df): From 006f7917279bbee478e22b8504b84b84c47339da Mon Sep 17 00:00:00 2001 From: alex Date: Tue, 27 Mar 2018 02:26:15 +0200 Subject: [PATCH 05/17] Functioning network using board representation shamelessly ripped from Tesauro --- board.py | 29 ++- eval.py | 13 ++ game.py | 5 +- network.py | 412 ++++++++++++++++++++++++------------------ pubeval/dumbeval.c | 170 +++++++++++++++++ pubeval/setup_dumb.py | 9 + 6 files changed, 456 insertions(+), 182 deletions(-) create mode 100644 pubeval/dumbeval.c create mode 100644 pubeval/setup_dumb.py diff --git a/board.py b/board.py index bfa7998..a2b205e 100644 --- a/board.py +++ b/board.py @@ -34,8 +34,33 @@ class Board: board.append(15 - sum(positives)) board.append(-15 - sum(negatives)) return tuple(board) - - + + @staticmethod + def board_features_to_tesauro(board, cur_player): + features = [] + for player in [-1,1]: + sum = 0.0 + for board_range in range(1,25): + pin = board[board_range] + #print("PIIIN:",pin) + feature = [0.0]*4 + if np.sign(pin) == np.sign(player): + sum += abs(pin) + for i in range(min(abs(pin), 3)): + feature[i] = 1 + if (abs(pin) > 3): + feature[3] = (abs(pin)-3)/2 + features += feature + #print("SUUUM:",sum) + # Append the amount of men on the bar of the current player divided by 2 + features.append((board[0] if 
np.sign(player) < 0 else board[25]) / 2.0) + # Calculate how many pieces there must be in the home state and divide it by 15 + features.append((15 - sum) / 15) + features += ([1,0] if np.sign(cur_player) > 0 else [1,0]) + test = np.array(features).reshape(1,-1) + #print("TEST:",test) + return test + @staticmethod diff --git a/eval.py b/eval.py index 1d02a4b..7be0098 100644 --- a/eval.py +++ b/eval.py @@ -2,6 +2,7 @@ from board import Board import numpy as np import pubeval +import dumbeval class Eval: @@ -24,4 +25,16 @@ class Eval: return best_move_pair + @staticmethod + def make_dumbeval_move(board, sym, roll): + legal_moves = Board.calculate_legal_states(board, sym, roll) + moves_and_scores = [ ( board, + dumbeval.eval(False, Board.board_features_to_pubeval(board, sym))) + for board + in legal_moves ] + scores = [ x[1] for x in moves_and_scores ] + best_move_pair = moves_and_scores[np.array(scores).argmax()] + + return best_move_pair + diff --git a/game.py b/game.py index 9469b57..443ac41 100644 --- a/game.py +++ b/game.py @@ -23,18 +23,21 @@ class Game: def roll(self): return self.cup.roll() - + ''' def best_move_and_score(self): roll = self.roll() move_and_val = self.p1.make_move(self.board, self.p1.get_sym(), roll) self.board = move_and_val[0] return move_and_val + ''' + ''' def next_round(self): roll = self.roll() #print(roll) self.board = Board.flip(self.p2.make_move(Board.flip(self.board), self.p2.get_sym(), roll)[0]) return self.board + ''' def board_state(self): return self.board diff --git a/network.py b/network.py index f058d48..8f8ef18 100644 --- a/network.py +++ b/network.py @@ -8,19 +8,20 @@ import sys import random from eval import Eval + class Network: hidden_size = 40 - input_size = 26 + input_size = 198 output_size = 1 # Can't remember the best learning_rate, look this up - learning_rate = 0.05 + learning_rate = 0.01 # TODO: Actually compile tensorflow properly - #os.environ["TF_CPP_MIN_LOG_LEVEL"]="2" + # os.environ["TF_CPP_MIN_LOG_LEVEL"]="2" def custom_tanh(self, x, name=None): return tf.scalar_mul(tf.constant(2.00), tf.tanh(x, name)) - + def __init__(self, config, name): self.config = config self.checkpoint_path = config['model_path'] @@ -34,13 +35,13 @@ class Network: self.episodes_trained = int(f.read()) else: self.episodes_trained = 0 - + # input = x self.x = tf.placeholder('float', [1, Network.input_size], name='input') self.value_next = tf.placeholder('float', [1, Network.output_size], name="value_next") xavier_init = tf.contrib.layers.xavier_initializer() - + W_1 = tf.get_variable("w_1", (Network.input_size, Network.hidden_size), initializer=xavier_init) W_2 = tf.get_variable("w_2", (Network.hidden_size, Network.output_size), @@ -51,8 +52,8 @@ class Network: b_2 = tf.get_variable("b_2", (Network.output_size,), initializer=tf.zeros_initializer) - normalized_input = tf.nn.l2_normalize(self.x) - value_after_input = tf.sigmoid(tf.matmul(normalized_input, W_1) + b_1, name='hidden_layer') + + value_after_input = tf.sigmoid(tf.matmul(self.x, W_1) + b_1, name='hidden_layer') self.value = tf.sigmoid(tf.matmul(value_after_input, W_2) + b_2, name='output_layer') @@ -63,23 +64,23 @@ class Network: # TODO: Alexander thinks that self.value will be computed twice (instead of once) difference_in_values = tf.reshape(tf.subtract(self.value_next, self.value, name='difference_in_values'), []) tf.summary.scalar("difference_in_values", tf.abs(difference_in_values)) - + trainable_vars = tf.trainable_variables() gradients = tf.gradients(self.value, trainable_vars) - + apply_gradients 
= [] - + with tf.variable_scope('apply_gradients'): for gradient, trainable_var in zip(gradients, trainable_vars): # Hopefully this is Δw_t = α(V_t+1 - V_t)▿_wV_t. backprop_calc = Network.learning_rate * difference_in_values * gradient grad_apply = trainable_var.assign_add(backprop_calc) apply_gradients.append(grad_apply) - + self.training_op = tf.group(*apply_gradients, name='training_op') self.saver = tf.train.Saver(max_to_keep=1) - + def eval_state(self, sess, state): # Run state through a network @@ -112,23 +113,22 @@ class Network: # implement learning_rate * (difference_in_values) * gradients (the # before-mentioned calculation. - # print("Network is evaluating") - #print("eval ({})".format(self.name), state, val, sep="\n") - return sess.run(self.value, feed_dict={self.x: state}) + # print("eval ({})".format(self.name), state, val, sep="\n") + return sess.run(self.value, feed_dict={self.x: state}) def save_model(self, sess, episode_count): self.saver.save(sess, os.path.join(self.checkpoint_path, 'model.ckpt')) with open(os.path.join(self.checkpoint_path, "episodes_trained"), 'w+') as f: - print("[NETWK] ({name}) Saving model to:".format(name = self.name), + print("[NETWK] ({name}) Saving model to:".format(name=self.name), os.path.join(self.checkpoint_path, 'model.ckpt')) f.write(str(episode_count) + "\n") - + def restore_model(self, sess): if os.path.isfile(os.path.join(self.checkpoint_path, 'model.ckpt.index')): latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_path) - print("[NETWK] ({name}) Restoring model from:".format(name = self.name), + print("[NETWK] ({name}) Restoring model from:".format(name=self.name), str(latest_checkpoint)) self.saver.restore(sess, latest_checkpoint) variables_names = [v.name for v in tf.trainable_variables()] @@ -144,24 +144,173 @@ class Network: with open(episode_count_path, 'r') as f: self.config['start_episode'] = int(f.read()) - def make_move(self, sess, board, roll): + def make_move(self, sess, board, roll, player): # print(Board.pretty(board)) - legal_moves = Board.calculate_legal_states(board, 1, roll) - moves_and_scores = [ (move, self.eval_state(sess, np.array(move).reshape(1,26))) for move in legal_moves ] - scores = [ x[1] for x in moves_and_scores ] + legal_moves = Board.calculate_legal_states(board, player, roll) + moves_and_scores = [(move, self.eval_state(sess, Board.board_features_to_tesauro(move, player))) for move in legal_moves] + scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores] best_score_index = np.array(scores).argmax() best_move_pair = moves_and_scores[best_score_index] - #print("Found the best state, being:", np.array(move_scores).argmax()) + # print("Found the best state, being:", np.array(move_scores).argmax()) return best_move_pair - - - def train_model(self, episodes=1000, save_step_size = 100, trained_eps = 0): + + def eval(self, trained_eps=0): + def do_eval(sess, method, episodes=1000, trained_eps=trained_eps): + start_time = time.time() + + def print_time_estimate(eps_completed): + cur_time = time.time() + time_diff = cur_time - start_time + eps_per_sec = eps_completed / time_diff + secs_per_ep = time_diff / eps_completed + eps_remaining = (episodes - eps_completed) + sys.stderr.write( + "[EVAL ] Averaging {per_sec} episodes per second\n".format(per_sec=round(eps_per_sec, 2))) + sys.stderr.write( + "[EVAL ] {eps_remaining} episodes remaining; approx. 
{time_remaining} seconds remaining\n".format( + eps_remaining=eps_remaining, time_remaining=int(eps_remaining * secs_per_ep))) + + sys.stderr.write( + "[EVAL ] Evaluating {eps} episode(s) with method '{method}'\n".format(eps=episodes, method=method)) + + if method == 'random': + outcomes = [] + """for i in range(1, episodes + 1): + sys.stderr.write("[EVAL ] Episode {}".format(i)) + board = Board.initial_state + while Board.outcome(board) is None: + roll = (random.randrange(1, 7), random.randrange(1, 7)) + board = (self.p1.make_move(sess, board, self.p1.get_sym(), roll))[0] + roll = (random.randrange(1, 7), random.randrange(1, 7)) + board = Board.flip(Eval.make_random_move(Board.flip(board), 1, roll)) + sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1])) + outcomes.append(Board.outcome(board)[1]) + sys.stderr.write("\n") + + if i % 50 == 0: + print_time_estimate(i)""" + return outcomes + elif method == 'pubeval': + outcomes = [] + # Add the evaluation code for pubeval, the bot has a method make_pubeval_move(board, sym, roll), + # which can be used to get the best move according to pubeval + for i in range(1, episodes + 1): + sys.stderr.write("[EVAL ] Episode {}".format(i)) + board = Board.initial_state + # print("init:", board, sep="\n") + while Board.outcome(board) is None: + # print("-"*30) + roll = (random.randrange(1, 7), random.randrange(1, 7)) + # print(roll) + + # prev_board = tuple(board) + board = (self.make_move(sess, board, roll, 1))[0] + # print("post p1:", board, sep="\n") + + # print("."*30) + roll = (random.randrange(1, 7), random.randrange(1, 7)) + # print(roll) + + # prev_board = tuple(board) + board = Eval.make_pubeval_move(board, -1, roll)[0][0:26] + # print("post pubeval:", board, sep="\n") + + # print("*"*30) + # print(board) + # print("+"*30) + sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1])) + outcomes.append(Board.outcome(board)[1]) + sys.stderr.write("\n") + + if i % 10 == 0: + print_time_estimate(i) + + return outcomes + + elif method == 'dumbeval': + outcomes = [] + # Add the evaluation code for pubeval, the bot has a method make_pubeval_move(board, sym, roll), + # which can be used to get the best move according to pubeval + for i in range(1, episodes + 1): + sys.stderr.write("[EVAL ] Episode {}".format(i)) + board = Board.initial_state + # print("init:", board, sep="\n") + while Board.outcome(board) is None: + # print("-"*30) + roll = (random.randrange(1, 7), random.randrange(1, 7)) + # print(roll) + + # prev_board = tuple(board) + board = (self.make_move(sess, board, roll, 1))[0] + # print("post p1:", board, sep="\n") + + # print("."*30) + roll = (random.randrange(1, 7), random.randrange(1, 7)) + # print(roll) + + # prev_board = tuple(board) + board = Eval.make_dumbeval_move(board, -1, roll)[0][0:26] + # print("post pubeval:", board, sep="\n") + + # print("*"*30) + # print(board) + # print("+"*30) + sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1])) + outcomes.append(Board.outcome(board)[1]) + sys.stderr.write("\n") + + if i % 10 == 0: + print_time_estimate(i) + + return outcomes + + elif method == 'dumbmodel': + outcomes = [] + """ + config_prime = self.config.copy() + config_prime['model_path'] = os.path.join(config_prime['model_storage_path'], 'dumbmodel') + eval_bot = Bot(1, config = config_prime, name = "dumbmodel") + #print(self.config, "\n", config_prime) + outcomes = [] + for i in range(1, episodes + 1): + sys.stderr.write("[EVAL ] Episode {}".format(i)) + board = Board.initial_state + while 
Board.outcome(board) is None: + roll = (random.randrange(1,7), random.randrange(1,7)) + board = (self.make_move(board, self.p1.get_sym(), roll))[0] + + roll = (random.randrange(1,7), random.randrange(1,7)) + board = Board.flip(eval_bot.make_move(Board.flip(board), self.p1.get_sym(), roll)[0]) + sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1])) + outcomes.append(Board.outcome(board)[1]) + sys.stderr.write("\n") + + if i % 50 == 0: + print_time_estimate(i) + """ + return outcomes + else: + sys.stderr.write("[EVAL ] Evaluation method '{}' is not defined\n".format(method)) + return [0] + + with tf.Session() as session: + session.run(tf.global_variables_initializer()) + self.restore_model(session) + outcomes = [(method, do_eval(session, + method, + self.config['episode_count'], + trained_eps=trained_eps)) + for method + in self.config['eval_methods']] + return outcomes + + def train_model(self, episodes=1000, save_step_size=100, trained_eps=0): with tf.Session() as sess: writer = tf.summary.FileWriter("/tmp/log/tf", sess.graph) - + sess.run(tf.global_variables_initializer()) self.restore_model(sess) - + variables_names = [v.name for v in tf.trainable_variables()] values = sess.run(variables_names) for k, v in zip(variables_names, values): @@ -172,197 +321,102 @@ class Network: start_time = time.time() def print_time_estimate(eps_completed): - cur_time = time.time() - time_diff = cur_time - start_time - eps_per_sec = eps_completed / time_diff - secs_per_ep = time_diff / eps_completed + cur_time = time.time() + time_diff = cur_time - start_time + eps_per_sec = eps_completed / time_diff + secs_per_ep = time_diff / eps_completed eps_remaining = (episodes - eps_completed) - sys.stderr.write("[TRAIN] Averaging {per_sec} episodes per second\n".format(per_sec = round(eps_per_sec, 2))) - sys.stderr.write("[TRAIN] {eps_remaining} episodes remaining; approx. {time_remaining} seconds remaining\n".format(eps_remaining = eps_remaining, time_remaining = int(eps_remaining * secs_per_ep))) + sys.stderr.write( + "[TRAIN] Averaging {per_sec} episodes per second\n".format(per_sec=round(eps_per_sec, 2))) + sys.stderr.write( + "[TRAIN] {eps_remaining} episodes remaining; approx. 
{time_remaining} seconds remaining\n".format( + eps_remaining=eps_remaining, time_remaining=int(eps_remaining * secs_per_ep))) - sys.stderr.write("[TRAIN] Training {} episodes and save_step_size {}\n".format(episodes, save_step_size)) outcomes = [] for episode in range(1, episodes + 1): sys.stderr.write("[TRAIN] Episode {}".format(episode + trained_eps)) # TODO decide which player should be here + + + # TEST + #if episode % 1000 == 0: + # self.config['eval_methods'] = 'dumbeval' + # self.config['episodes'] = 300 + # outcomes = self.eval(trained_eps) + # self.log_eval_outcomes(outcomes, trained_eps=self.episodes_trained) + + #player = random.choice([-1, 1]) player = 1 - - roll = (random.randrange(1,7), random.randrange(1,7)) - prev_board, _ = self.make_move(sess, Board.flip(Board.initial_state) if player == -1 else Board.initial_state, roll) - if player == -1: - prev_board = Board.flip(prev_board) - + + prev_board = Board.initial_state + # find the best move here, make this move, then change turn as the # first thing inside of the while loop and then call # best_move_and_score to get V_t+1 # i = 0 while Board.outcome(prev_board) is None: - # print("-"*30) - # print(i) - # print(roll) - # print(Board.pretty(prev_board)) - # print("/"*30) - # i += 1 + + #print("PREEEV_BOOOOAAARD:",prev_board) + cur_board, cur_board_value = self.make_move(sess, + prev_board, + (random.randrange(1, 7), random.randrange(1, 7)), player) - player *= -1 - roll = (random.randrange(1,7), random.randrange(1,7)) + #print("The current value:",cur_board_value) - cur_board, cur_board_value = self.make_move(sess, Board.flip(prev_board) if player == -1 else prev_board, roll) - if player == -1: - cur_board = Board.flip(cur_board) - - # print("cur_board_value:", cur_board_value) - # adjust weights sess.run(self.training_op, - feed_dict = { self.x: np.array(prev_board).reshape((1,26)), - self.value_next: cur_board_value }) + feed_dict={self.x: Board.board_features_to_tesauro(prev_board, player), + self.value_next: cur_board_value}) + + player *= -1 + + prev_board = cur_board final_board = prev_board sys.stderr.write("\t outcome {}".format(Board.outcome(final_board)[1])) outcomes.append(Board.outcome(final_board)[1]) - final_score = np.array([ Board.outcome(final_board)[1] ]) + final_score = np.array([Board.outcome(final_board)[1]]) scaled_final_score = ((final_score + 2) / 4) - + #print("The difference in values:", scaled_final_score - cur_board_value) # print("scaled_final_score",scaled_final_score) with tf.name_scope("final"): merged = tf.summary.merge_all() summary, _ = sess.run([merged, self.training_op], - feed_dict = { self.x: np.array(prev_board).reshape((1,26)), - self.value_next: scaled_final_score.reshape((1, 1)) }) + feed_dict={self.x: Board.board_features_to_tesauro(prev_board, player), + self.value_next: scaled_final_score.reshape((1, 1))}) writer.add_summary(summary, episode + trained_eps) - + sys.stderr.write("\n") - + if episode % min(save_step_size, episodes) == 0: sys.stderr.write("[TRAIN] Saving model...\n") - self.save_model(sess, episode+trained_eps) + self.save_model(sess, episode + trained_eps) if episode % 50 == 0: print_time_estimate(episode) sys.stderr.write("[TRAIN] Saving model for final episode...\n") - self.save_model(sess, episode+trained_eps) - + self.save_model(sess, episode + trained_eps) + writer.close() - + return outcomes - - # take turn, which finds the best state and picks it, based on the current network - # save current state - # run training operation (session.run(self.training_op, 
{x:x, value_next, value_next})), (something which does the backprop, based on the state after having taken a turn, found before, and the state we saved in the beginning and from now we'll save it at the end of the turn - # save the current state again, so we can continue running backprop based on the "previous" turn. + # take turn, which finds the best state and picks it, based on the current network + # save current state + # run training operation (session.run(self.training_op, {x:x, value_next, value_next})), + # (something which does the backprop, based on the state after having taken a turn, + # found before, and the state we saved in the beginning and from now we'll + # save it at the end of the turn - # NOTE: We need to make a method so that we can take a single turn or at least just pick the next best move, so we know how to evaluate according to TD-learning. Right now, our game just continues in a while loop without nothing to stop it! - + # save the current state again, so we can continue running backprop based on the "previous" turn. + + # NOTE: We need to make a method so that we can take a single turn or at least + # just pick the next best move, so we know how to evaluate according to TD-learning. + # Right now, our game just continues in a while loop without nothing to stop it! - def eval(self, trained_eps = 0): - def do_eval(sess, method, episodes = 1000, trained_eps = 0): - start_time = time.time() - - def print_time_estimate(eps_completed): - cur_time = time.time() - time_diff = cur_time - start_time - eps_per_sec = eps_completed / time_diff - secs_per_ep = time_diff / eps_completed - eps_remaining = (episodes - eps_completed) - sys.stderr.write("[EVAL ] Averaging {per_sec} episodes per second\n".format(per_sec = round(eps_per_sec, 2))) - sys.stderr.write("[EVAL ] {eps_remaining} episodes remaining; approx. 
{time_remaining} seconds remaining\n".format(eps_remaining = eps_remaining, time_remaining = int(eps_remaining * secs_per_ep))) - - sys.stderr.write("[EVAL ] Evaluating {eps} episode(s) with method '{method}'\n".format(eps=episodes, method=method)) - - if method == 'random': - outcomes = [] - for i in range(1, episodes + 1): - sys.stderr.write("[EVAL ] Episode {}".format(i)) - board = Board.initial_state - while Board.outcome(board) is None: - roll = (random.randrange(1,7), random.randrange(1,7)) - board = (self.p1.make_move(sess, board, self.p1.get_sym(), roll))[0] - roll = (random.randrange(1,7), random.randrange(1,7)) - board = Board.flip(Eval.make_random_move(Board.flip(board), 1, roll)) - sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1])) - outcomes.append(Board.outcome(board)[1]) - sys.stderr.write("\n") - - if i % 50 == 0: - print_time_estimate(i) - return outcomes - elif method == 'pubeval': - outcomes = [] - # Add the evaluation code for pubeval, the bot has a method make_pubeval_move(board, sym, roll), which can be used to get the best move according to pubeval - for i in range(1, episodes + 1): - sys.stderr.write("[EVAL ] Episode {}".format(i)) - board = Board.initial_state - #print("init:", board, sep="\n") - while Board.outcome(board) is None: - #print("-"*30) - roll = (random.randrange(1,7), random.randrange(1,7)) - #print(roll) - - prev_board = tuple(board) - board = (self.make_move(sess, board, roll))[0] - #print("post p1:", board, sep="\n") - - #print("."*30) - roll = (random.randrange(1,7), random.randrange(1,7)) - #print(roll) - - prev_board = tuple(board) - board = Eval.make_pubeval_move(board, -1, roll)[0][0:26] - #print("post pubeval:", board, sep="\n") - - - #print("*"*30) - #print(board) - #print("+"*30) - sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1])) - outcomes.append(Board.outcome(board)[1]) - sys.stderr.write("\n") - - if i % 10 == 0: - print_time_estimate(i) - - return outcomes - # elif method == 'dumbmodel': - # config_prime = self.config.copy() - # config_prime['model_path'] = os.path.join(config_prime['model_storage_path'], 'dumbmodel') - # eval_bot = Bot(1, config = config_prime, name = "dumbmodel") - # #print(self.config, "\n", config_prime) - # outcomes = [] - # for i in range(1, episodes + 1): - # sys.stderr.write("[EVAL ] Episode {}".format(i)) - # board = Board.initial_state - # while Board.outcome(board) is None: - # roll = (random.randrange(1,7), random.randrange(1,7)) - # board = (self.make_move(board, self.p1.get_sym(), roll))[0] - - # roll = (random.randrange(1,7), random.randrange(1,7)) - # board = Board.flip(eval_bot.make_move(Board.flip(board), self.p1.get_sym(), roll)[0]) - # sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1])) - # outcomes.append(Board.outcome(board)[1]) - # sys.stderr.write("\n") - - # if i % 50 == 0: - # print_time_estimate(i) - # return outcomes - else: - sys.stderr.write("[EVAL ] Evaluation method '{}' is not defined\n".format(method)) - return [0] - - with tf.Session() as session: - session .run(tf.global_variables_initializer()) - self.restore_model(session) - outcomes = [ (method, do_eval(session, - method, - self.config['episode_count'], - trained_eps = trained_eps)) - for method - in self.config['eval_methods'] ] - return outcomes diff --git a/pubeval/dumbeval.c b/pubeval/dumbeval.c new file mode 100644 index 0000000..f9e6039 --- /dev/null +++ b/pubeval/dumbeval.c @@ -0,0 +1,170 @@ +#include + +static PyObject* DumbevalError; + +static float x[122]; + +static const float 
wc[122] = { +5.6477, 6.316649999999999, 7.05515, 6.65315, 9.3171, 17.9777, 2.0235499999999993, 5.1129500000000005, 7.599200000000001, 9.68525, 3.1762, 8.05335, 16.153499999999998, 8.02445, 10.55345, 15.489600000000001, 10.525199999999998, 16.438850000000002, 12.27405, 9.6362, 12.7152, 13.2859, 1.6932499999999995, 26.79045, 10.521899999999999, 6.79635, 5.28135, 6.2059, 10.2306, 10.5485, 3.6000500000000004, 4.07825, 6.951700000000001, 4.413749999999999, 11.271450000000002, 12.9361, 11.087299999999999, 13.10085, 10.411999999999999, 8.084050000000001, 12.4893, 5.96055, 4.69195, 18.9482, 9.0946, 9.1954, 6.2592, 16.180300000000003, 8.3376, 23.24915, 14.32525, -2.6699000000000006, 19.156, 5.81445, 4.7214, 7.63055, 7.039, 5.88075, 2.00765, 14.596800000000002, 11.5208, -3.79, -3.8541000000000003, 5.358499999999999, 14.4516, 2.49015, 11.284799999999999, 14.1066, 16.2306, 5.82875, 9.34505, 16.13685, 8.1893, 2.93145, 7.83185, 12.86765, 6.90115, 20.07255, 8.93355, -0.12434999999999974, 12.0587, 11.83985, 6.34155, 7.1963, 10.571200000000001, 22.38365, 6.50745, 8.94595, 12.0434, 10.79885, 14.055800000000001, 0.022100000000000453, 10.39255, 4.088850000000001, 3.6421499999999996, 38.1298, 6.8957, 0.9804999999999997, 5.9599, 13.16055, 11.55305, 10.65015, 4.6673, 15.770999999999999, 27.700050000000005, 4.4329, 12.6349, 7.037800000000001, 3.4897, 18.91945, 10.239899999999999, 5.4625, 10.29705, 10.492799999999999, 8.850900000000001, -10.575999999999999, 10.6893, 15.30845, 17.8083, 31.88275, 11.225000000000001, 4.4806}; + +static const float wr[122] = { +-0.7856, -0.50352, 0.12392, -1.00316, -2.46556, -0.1627, 0.18966, 0.0043, 0.0, +0.13681, 1.11245, 0.0, 0.0, -0.02781, -2.77982, 0.0, -0.91035, 0.60015, +-1.27266, 0.0, 0.0, 0.0, 0.0, -7.26713, -0.19412, -1.05121, 0.27448, -4.94251, + -0.06844, 0.37183, -3.66465, -0.8305, 0.09266, 0.07217, 0.0, 0.29906, -1.26062, +0.17405, 0.48302, 2.00366, 0.92321, -0.10839, 1.06349, 0.39521, 3.4204, +0.00576, 5.35, 3.8539, -0.09308, 0.17253, 0.13978, 0.2701, -0.52728, 0.88296, +0.2252, 0.0, 0.0, -0.12707, 3.05454, 0.31202, -0.88035, -0.01351, 0.0, +-3.40177, -0.22082, -0.13022, -0.09795, -2.29847, -12.32252, 0.0, -0.13597, +0.12039, 0.85631, 0.0, 0.0, -0.3424, 0.24855, 0.20178, 2.30052, 1.5068, +0.0, -0.07456, 5.16874, 0.01418, -1.3464, -1.29506, 0.0, 0.0, -1.40375, +0.0, -0.11696, 0.05281, -9.67677, 0.05685, -1.09167, 0.0, 0.0, -2.56906, +2.19605, 0.0, 0.68178, -0.08471, 0.0, -2.34631, 1.49549, -2.16183, 0.0, +1.16242, 1.08744, -0.1716, 0.25236, 0.13246, -0.37646, 0.0, -2.87401, +0.74427, 1.07274, -0.01591, -0.14818, -0.06285, 0.08302, -1.03508 +}; + +void setx(int pos[]) +{ + /* sets input vector x[] given board position pos[] */ + extern float x[]; + int j, jm1, n; + /* initialize */ + for(j=0;j<122;++j) x[j] = 0.0; + + /* first encode board locations 24-1 */ + for(j=1;j<=24;++j) { + jm1 = j - 1; + n = pos[25-j]; + if(n!=0) { + if(n==-1) x[5*jm1+0] = 1.0; + if(n==1) x[5*jm1+1] = 1.0; + if(n>=2) x[5*jm1+2] = 1.0; + if(n==3) x[5*jm1+3] = 1.0; + if(n>=4) x[5*jm1+4] = (float)(n-3)/2.0; + } + } + /* encode opponent barmen */ + x[120] = -(float)(pos[0])/2.0; + /* encode computer's menoff */ + x[121] = (float)(pos[26])/15.0; +} + +float dumbeval(int race, int pos[]) +{ + /* Backgammon move-selection evaluation function + for benchmark comparisons. Computes a linear + evaluation function: Score = W * X, where X is + an input vector encoding the board state (using + a raw encoding of the number of men at each location), + and W is a weight vector. 
Separate weight vectors + are used for racing positions and contact positions. + Makes lots of obvious mistakes, but provides a + decent level of play for benchmarking purposes. */ + + /* Provided as a public service to the backgammon + programming community by Gerry Tesauro, IBM Research. + (e-mail: tesauro@watson.ibm.com) */ + + /* The following inputs are needed for this routine: + + race is an integer variable which should be set + based on the INITIAL position BEFORE the move. + Set race=1 if the position is a race (i.e. no contact) + and 0 if the position is a contact position. + + pos[] is an integer array of dimension 28 which + should represent a legal final board state after + the move. Elements 1-24 correspond to board locations + 1-24 from computer's point of view, i.e. computer's + men move in the negative direction from 24 to 1, and + opponent's men move in the positive direction from + 1 to 24. Computer's men are represented by positive + integers, and opponent's men are represented by negative + integers. Element 25 represents computer's men on the + bar (positive integer), and element 0 represents opponent's + men on the bar (negative integer). Element 26 represents + computer's men off the board (positive integer), and + element 27 represents opponent's men off the board + (negative integer). */ + + /* Also, be sure to call rdwts() at the start of your + program to read in the weight values. Happy hacking] */ + + int i; + float score; + + if(pos[26]==15) return(99999999.); + /* all men off, best possible move */ + + setx(pos); /* sets input array x[] */ + score = 0.0; + if(race) { /* use race weights */ + for(i=0;i<122;++i) score += wr[i]*x[i]; + } + else { /* use contact weights */ + for(i=0;i<122;++i) score += wc[i]*x[i]; + } + return(score); +} + +static PyObject* +dumbeval_eval(PyObject *self, PyObject *args) { + int race; + long numValues; + int board[28]; + float eval_score; + + PyObject* tuple_obj; + PyObject* val_obj; + + if (! 
PyArg_ParseTuple(args, "pO!", &race, &PyTuple_Type, &tuple_obj)) + return NULL; + + numValues = PyTuple_Size(tuple_obj); + + if (numValues < 0) return NULL; + if (numValues != 28) { + PyErr_SetString(DumbevalError, "Tuple must have 28 entries"); + return NULL; + } + + // Iterate over tuple to retreive positions + for (int i=0; i Date: Tue, 27 Mar 2018 02:41:58 +0200 Subject: [PATCH 06/17] Initialized weights completely randomly for dumbeval --- pubeval/dumbeval.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/pubeval/dumbeval.c b/pubeval/dumbeval.c index f9e6039..0c33ccc 100644 --- a/pubeval/dumbeval.c +++ b/pubeval/dumbeval.c @@ -5,7 +5,32 @@ static PyObject* DumbevalError; static float x[122]; static const float wc[122] = { -5.6477, 6.316649999999999, 7.05515, 6.65315, 9.3171, 17.9777, 2.0235499999999993, 5.1129500000000005, 7.599200000000001, 9.68525, 3.1762, 8.05335, 16.153499999999998, 8.02445, 10.55345, 15.489600000000001, 10.525199999999998, 16.438850000000002, 12.27405, 9.6362, 12.7152, 13.2859, 1.6932499999999995, 26.79045, 10.521899999999999, 6.79635, 5.28135, 6.2059, 10.2306, 10.5485, 3.6000500000000004, 4.07825, 6.951700000000001, 4.413749999999999, 11.271450000000002, 12.9361, 11.087299999999999, 13.10085, 10.411999999999999, 8.084050000000001, 12.4893, 5.96055, 4.69195, 18.9482, 9.0946, 9.1954, 6.2592, 16.180300000000003, 8.3376, 23.24915, 14.32525, -2.6699000000000006, 19.156, 5.81445, 4.7214, 7.63055, 7.039, 5.88075, 2.00765, 14.596800000000002, 11.5208, -3.79, -3.8541000000000003, 5.358499999999999, 14.4516, 2.49015, 11.284799999999999, 14.1066, 16.2306, 5.82875, 9.34505, 16.13685, 8.1893, 2.93145, 7.83185, 12.86765, 6.90115, 20.07255, 8.93355, -0.12434999999999974, 12.0587, 11.83985, 6.34155, 7.1963, 10.571200000000001, 22.38365, 6.50745, 8.94595, 12.0434, 10.79885, 14.055800000000001, 0.022100000000000453, 10.39255, 4.088850000000001, 3.6421499999999996, 38.1298, 6.8957, 0.9804999999999997, 5.9599, 13.16055, 11.55305, 10.65015, 4.6673, 15.770999999999999, 27.700050000000005, 4.4329, 12.6349, 7.037800000000001, 3.4897, 18.91945, 10.239899999999999, 5.4625, 10.29705, 10.492799999999999, 8.850900000000001, -10.575999999999999, 10.6893, 15.30845, 17.8083, 31.88275, 11.225000000000001, 4.4806}; +1.5790816238841092, 1.6374860177130541, -1.7131823639980923, -0.9286186784962336, -1.0732080528763888, + -0.33851674519289876, 1.5798155080270462, 2.3161915581553414, 1.5625330782392322, 0.9397141260075461, +0.8386342522957442, 1.2380864901133144, -2.803703105809909, -1.6033863837759044, -1.9297462408169208, +2.804924084193149, 0.9270839975087402, 0.9877927467766145, -1.0075116465703597, -0.9456578829797895, +-2.592017567014881, 0.6309857231907587, 2.04590249003744, -0.7982917574924828, -1.4539868823698936, +1.0841407450630234, 0.45211788236898887, -1.2713606178159307, 0.8688872440724307, -0.6732738151904405, +2.2362742485632294, -0.6581729637609781, -1.7948051663967473, 2.1883788452643564, 2.1598171424723214, +0.40802272166662146, -0.9708789129385202, -0.28407011999124165, 1.132858480655588, 0.35009713673111253, +2.396877030228498, -2.9621397724422653, 1.607067798976531, 1.0644990486021744, 0.31954763526104113, +1.3044736141405133, -2.7454899725805606, -2.7379143210889545, -1.803990720175892, 0.46979843403681576, +-1.7142750941084806, -0.8151527229519924, -2.009462889335147, -0.3918389579023729, -1.2877598286852634, +2.555703689627613, 0.9185193346378826, -2.4440956502956404, -1.5557875467629176, 1.6171292628313898, +-0.7350519162308693, 
2.9185129503030653, -0.02369662637182124, 0.9957404325370858, -0.6504711593915609, + 2.6190546093943468, -0.36103491516117003, -0.5988376927918715, 0.16399156134136383, 0.3254074568551131, +-1.5638349190057885, 0.8561543642997189, -0.0880209333042492, 1.323918411026094, -0.9498883976797834, +2.3050169940592458, -2.859322940360703, 2.1798224505428836, 0.03769734441005257, 2.806706515762855, +-0.514728418369482, -2.7130236727731454, 1.343193402901159, -1.542350700154035, 1.1197565339573625, +-1.4498511795864624, 1.3472224178544003, 0.7044576479382245, -2.284211306571646, -1.7289596273930532, +-1.7276292685923906, -0.1945401442950634, 2.0338744133468643, 2.001064062247366, 1.9649901287717713, + 1.5235253273336475, 0.40016636047698606, -1.3276206938801058, 0.8496121993449899, 1.054662320349336, +-1.1897996492934584, 0.49610727347392025, -1.8539475848522708, 0.4713599305742626, -2.8424352653158573, +-2.526691049928613, 2.1369664337786274, 1.0616438676464632, 1.9487914860665452, 2.822108017102477, +-0.3393405083020449, 2.787144781914554, -2.401723402781605, -1.1675562811241997, -1.1542961327714207, +0.18253192955355502, -2.418436664206371, 0.7423935287565309, 2.9903418274144666, -1.3503112004693552, +-2.649146174480099, -0.5447080156947952 +}; static const float wr[122] = { -0.7856, -0.50352, 0.12392, -1.00316, -2.46556, -0.1627, 0.18966, 0.0043, 0.0, From f43108c239bb2160bf0bb2ed83fd0b19f335ea4a Mon Sep 17 00:00:00 2001 From: alex Date: Tue, 27 Mar 2018 04:06:08 +0200 Subject: [PATCH 07/17] Training using slightly revamped version of our own board rep. Not sure if works yet. --- board.py | 11 +++++++++++ network.py | 27 +++++---------------------- pubeval/dumbeval.c | 6 +++++- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/board.py b/board.py index a2b205e..767ae45 100644 --- a/board.py +++ b/board.py @@ -35,6 +35,17 @@ class Board: board.append(-15 - sum(negatives)) return tuple(board) + @staticmethod + def board_features_to_own(board, player): + board = list(board) + positives = [x if x > 0 else 0 for x in board] + negatives = [x if x < 0 else 0 for x in board] + board.append(15 - sum(positives)) + board.append(-15 - sum(negatives)) + board += ([1, 0] if np.sign(player) > 0 else [1, 0]) + return np.array(board).reshape(1,-1) + + @staticmethod def board_features_to_tesauro(board, cur_player): features = [] diff --git a/network.py b/network.py index 8f8ef18..1dc4b62 100644 --- a/network.py +++ b/network.py @@ -11,13 +11,11 @@ from eval import Eval class Network: hidden_size = 40 - input_size = 198 + input_size = 30 output_size = 1 # Can't remember the best learning_rate, look this up learning_rate = 0.01 - - # TODO: Actually compile tensorflow properly - # os.environ["TF_CPP_MIN_LOG_LEVEL"]="2" + board_rep = Board.board_features_to_own def custom_tanh(self, x, name=None): return tf.scalar_mul(tf.constant(2.00), tf.tanh(x, name)) @@ -147,7 +145,7 @@ class Network: def make_move(self, sess, board, roll, player): # print(Board.pretty(board)) legal_moves = Board.calculate_legal_states(board, player, roll) - moves_and_scores = [(move, self.eval_state(sess, Board.board_features_to_tesauro(move, player))) for move in legal_moves] + moves_and_scores = [(move, self.eval_state(sess, Network.board_rep(move, player))) for move in legal_moves] scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores] best_score_index = np.array(scores).argmax() best_move_pair = moves_and_scores[best_score_index] @@ -338,15 +336,6 @@ class Network: sys.stderr.write("[TRAIN] Episode 
{}".format(episode + trained_eps)) # TODO decide which player should be here - - # TEST - #if episode % 1000 == 0: - # self.config['eval_methods'] = 'dumbeval' - # self.config['episodes'] = 300 - # outcomes = self.eval(trained_eps) - # self.log_eval_outcomes(outcomes, trained_eps=self.episodes_trained) - - #player = random.choice([-1, 1]) player = 1 prev_board = Board.initial_state @@ -355,7 +344,6 @@ class Network: # first thing inside of the while loop and then call # best_move_and_score to get V_t+1 - # i = 0 while Board.outcome(prev_board) is None: #print("PREEEV_BOOOOAAARD:",prev_board) @@ -367,7 +355,7 @@ class Network: # adjust weights sess.run(self.training_op, - feed_dict={self.x: Board.board_features_to_tesauro(prev_board, player), + feed_dict={self.x: Network.board_rep(prev_board, player), self.value_next: cur_board_value}) player *= -1 @@ -386,7 +374,7 @@ class Network: with tf.name_scope("final"): merged = tf.summary.merge_all() summary, _ = sess.run([merged, self.training_op], - feed_dict={self.x: Board.board_features_to_tesauro(prev_board, player), + feed_dict={self.x: Network.board_rep(prev_board, player), self.value_next: scaled_final_score.reshape((1, 1))}) writer.add_summary(summary, episode + trained_eps) @@ -415,8 +403,3 @@ class Network: # save the current state again, so we can continue running backprop based on the "previous" turn. - # NOTE: We need to make a method so that we can take a single turn or at least - # just pick the next best move, so we know how to evaluate according to TD-learning. - # Right now, our game just continues in a while loop without nothing to stop it! - - diff --git a/pubeval/dumbeval.c b/pubeval/dumbeval.c index 0c33ccc..6e29de3 100644 --- a/pubeval/dumbeval.c +++ b/pubeval/dumbeval.c @@ -5,6 +5,10 @@ static PyObject* DumbevalError; static float x[122]; static const float wc[122] = { +5.6477, 6.316649999999999, 7.05515, 6.65315, 9.3171, 17.9777, 2.0235499999999993, 5.1129500000000005, 7.599200000000001, 9.68525, 3.1762, 8.05335, 16.153499999999998, 8.02445, 10.55345, 15.489600000000001, 10.525199999999998, 16.438850000000002, 12.27405, 9.6362, 12.7152, 13.2859, 1.6932499999999995, 26.79045, 10.521899999999999, 6.79635, 5.28135, 6.2059, 10.2306, 10.5485, 3.6000500000000004, 4.07825, 6.951700000000001, 4.413749999999999, 11.271450000000002, 12.9361, 11.087299999999999, 13.10085, 10.411999999999999, 8.084050000000001, 12.4893, 5.96055, 4.69195, 18.9482, 9.0946, 9.1954, 6.2592, 16.180300000000003, 8.3376, 23.24915, 14.32525, -2.6699000000000006, 19.156, 5.81445, 4.7214, 7.63055, 7.039, 5.88075, 2.00765, 14.596800000000002, 11.5208, -3.79, -3.8541000000000003, 5.358499999999999, 14.4516, 2.49015, 11.284799999999999, 14.1066, 16.2306, 5.82875, 9.34505, 16.13685, 8.1893, 2.93145, 7.83185, 12.86765, 6.90115, 20.07255, 8.93355, -0.12434999999999974, 12.0587, 11.83985, 6.34155, 7.1963, 10.571200000000001, 22.38365, 6.50745, 8.94595, 12.0434, 10.79885, 14.055800000000001, 0.022100000000000453, 10.39255, 4.088850000000001, 3.6421499999999996, 38.1298, 6.8957, 0.9804999999999997, 5.9599, 13.16055, 11.55305, 10.65015, 4.6673, 15.770999999999999, 27.700050000000005, 4.4329, 12.6349, 7.037800000000001, 3.4897, 18.91945, 10.239899999999999, 5.4625, 10.29705, 10.492799999999999, 8.850900000000001, -10.575999999999999, 10.6893, 15.30845, 17.8083, 31.88275, 11.225000000000001, 4.4806}; + + +/* 1.5790816238841092, 1.6374860177130541, -1.7131823639980923, -0.9286186784962336, -1.0732080528763888, -0.33851674519289876, 1.5798155080270462, 2.3161915581553414, 
1.5625330782392322, 0.9397141260075461, 0.8386342522957442, 1.2380864901133144, -2.803703105809909, -1.6033863837759044, -1.9297462408169208, @@ -30,7 +34,7 @@ static const float wc[122] = { -0.3393405083020449, 2.787144781914554, -2.401723402781605, -1.1675562811241997, -1.1542961327714207, 0.18253192955355502, -2.418436664206371, 0.7423935287565309, 2.9903418274144666, -1.3503112004693552, -2.649146174480099, -0.5447080156947952 -}; +};*/ static const float wr[122] = { -0.7856, -0.50352, 0.12392, -1.00316, -2.46556, -0.1627, 0.18966, 0.0043, 0.0, From 0eac5434d65db4a91b2a6a401421b7257c36aee4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoffer=20M=C3=BCller=20Madsen?= Date: Tue, 27 Mar 2018 11:55:32 +0200 Subject: [PATCH 08/17] update .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 08bc86a..03ee050 100644 --- a/.gitignore +++ b/.gitignore @@ -169,3 +169,6 @@ venv.bak/ README.* !README.org models/ +.DS_Store +bench/ + From 8822af81e6b6d892c8d9c8e522161ab7402bd937 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoffer=20M=C3=BCller=20Madsen?= Date: Tue, 27 Mar 2018 12:23:15 +0200 Subject: [PATCH 09/17] move dumbeval code to separate directory --- dumbeval/.gitignore | 1 + {pubeval => dumbeval}/dumbeval.c | 0 pubeval/setup_dumb.py => dumbeval/setup.py | 0 3 files changed, 1 insertion(+) create mode 100644 dumbeval/.gitignore rename {pubeval => dumbeval}/dumbeval.c (100%) rename pubeval/setup_dumb.py => dumbeval/setup.py (100%) diff --git a/dumbeval/.gitignore b/dumbeval/.gitignore new file mode 100644 index 0000000..567609b --- /dev/null +++ b/dumbeval/.gitignore @@ -0,0 +1 @@ +build/ diff --git a/pubeval/dumbeval.c b/dumbeval/dumbeval.c similarity index 100% rename from pubeval/dumbeval.c rename to dumbeval/dumbeval.c diff --git a/pubeval/setup_dumb.py b/dumbeval/setup.py similarity index 100% rename from pubeval/setup_dumb.py rename to dumbeval/setup.py From 28b82e8228030fe58322b0b2f3ae9c433fcab29e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoffer=20M=C3=BCller=20Madsen?= Date: Tue, 27 Mar 2018 12:57:06 +0200 Subject: [PATCH 10/17] update dumbeval weights --- dumbeval/dumbeval.c | 87 +++++++++++++++++++++------------------------ dumbeval/weights.py | 14 ++++++++ 2 files changed, 55 insertions(+), 46 deletions(-) create mode 100644 dumbeval/weights.py diff --git a/dumbeval/dumbeval.c b/dumbeval/dumbeval.c index 6e29de3..4d2579a 100644 --- a/dumbeval/dumbeval.c +++ b/dumbeval/dumbeval.c @@ -4,54 +4,49 @@ static PyObject* DumbevalError; static float x[122]; + +/* With apologies to Gerry Tesauro */ + +/* Weights generated by weights.py */ static const float wc[122] = { -5.6477, 6.316649999999999, 7.05515, 6.65315, 9.3171, 17.9777, 2.0235499999999993, 5.1129500000000005, 7.599200000000001, 9.68525, 3.1762, 8.05335, 16.153499999999998, 8.02445, 10.55345, 15.489600000000001, 10.525199999999998, 16.438850000000002, 12.27405, 9.6362, 12.7152, 13.2859, 1.6932499999999995, 26.79045, 10.521899999999999, 6.79635, 5.28135, 6.2059, 10.2306, 10.5485, 3.6000500000000004, 4.07825, 6.951700000000001, 4.413749999999999, 11.271450000000002, 12.9361, 11.087299999999999, 13.10085, 10.411999999999999, 8.084050000000001, 12.4893, 5.96055, 4.69195, 18.9482, 9.0946, 9.1954, 6.2592, 16.180300000000003, 8.3376, 23.24915, 14.32525, -2.6699000000000006, 19.156, 5.81445, 4.7214, 7.63055, 7.039, 5.88075, 2.00765, 14.596800000000002, 11.5208, -3.79, -3.8541000000000003, 5.358499999999999, 14.4516, 2.49015, 11.284799999999999, 14.1066, 16.2306, 5.82875, 9.34505, 
16.13685, 8.1893, 2.93145, 7.83185, 12.86765, 6.90115, 20.07255, 8.93355, -0.12434999999999974, 12.0587, 11.83985, 6.34155, 7.1963, 10.571200000000001, 22.38365, 6.50745, 8.94595, 12.0434, 10.79885, 14.055800000000001, 0.022100000000000453, 10.39255, 4.088850000000001, 3.6421499999999996, 38.1298, 6.8957, 0.9804999999999997, 5.9599, 13.16055, 11.55305, 10.65015, 4.6673, 15.770999999999999, 27.700050000000005, 4.4329, 12.6349, 7.037800000000001, 3.4897, 18.91945, 10.239899999999999, 5.4625, 10.29705, 10.492799999999999, 8.850900000000001, -10.575999999999999, 10.6893, 15.30845, 17.8083, 31.88275, 11.225000000000001, 4.4806}; - - -/* -1.5790816238841092, 1.6374860177130541, -1.7131823639980923, -0.9286186784962336, -1.0732080528763888, - -0.33851674519289876, 1.5798155080270462, 2.3161915581553414, 1.5625330782392322, 0.9397141260075461, -0.8386342522957442, 1.2380864901133144, -2.803703105809909, -1.6033863837759044, -1.9297462408169208, -2.804924084193149, 0.9270839975087402, 0.9877927467766145, -1.0075116465703597, -0.9456578829797895, --2.592017567014881, 0.6309857231907587, 2.04590249003744, -0.7982917574924828, -1.4539868823698936, -1.0841407450630234, 0.45211788236898887, -1.2713606178159307, 0.8688872440724307, -0.6732738151904405, -2.2362742485632294, -0.6581729637609781, -1.7948051663967473, 2.1883788452643564, 2.1598171424723214, -0.40802272166662146, -0.9708789129385202, -0.28407011999124165, 1.132858480655588, 0.35009713673111253, -2.396877030228498, -2.9621397724422653, 1.607067798976531, 1.0644990486021744, 0.31954763526104113, -1.3044736141405133, -2.7454899725805606, -2.7379143210889545, -1.803990720175892, 0.46979843403681576, --1.7142750941084806, -0.8151527229519924, -2.009462889335147, -0.3918389579023729, -1.2877598286852634, -2.555703689627613, 0.9185193346378826, -2.4440956502956404, -1.5557875467629176, 1.6171292628313898, --0.7350519162308693, 2.9185129503030653, -0.02369662637182124, 0.9957404325370858, -0.6504711593915609, - 2.6190546093943468, -0.36103491516117003, -0.5988376927918715, 0.16399156134136383, 0.3254074568551131, --1.5638349190057885, 0.8561543642997189, -0.0880209333042492, 1.323918411026094, -0.9498883976797834, -2.3050169940592458, -2.859322940360703, 2.1798224505428836, 0.03769734441005257, 2.806706515762855, --0.514728418369482, -2.7130236727731454, 1.343193402901159, -1.542350700154035, 1.1197565339573625, --1.4498511795864624, 1.3472224178544003, 0.7044576479382245, -2.284211306571646, -1.7289596273930532, --1.7276292685923906, -0.1945401442950634, 2.0338744133468643, 2.001064062247366, 1.9649901287717713, - 1.5235253273336475, 0.40016636047698606, -1.3276206938801058, 0.8496121993449899, 1.054662320349336, --1.1897996492934584, 0.49610727347392025, -1.8539475848522708, 0.4713599305742626, -2.8424352653158573, --2.526691049928613, 2.1369664337786274, 1.0616438676464632, 1.9487914860665452, 2.822108017102477, --0.3393405083020449, 2.787144781914554, -2.401723402781605, -1.1675562811241997, -1.1542961327714207, -0.18253192955355502, -2.418436664206371, 0.7423935287565309, 2.9903418274144666, -1.3503112004693552, --2.649146174480099, -0.5447080156947952 -};*/ +-1.91222, 1.45979, 0.40657, -1.39159, 3.64558, -0.45381, -0.03157, + 0.14539, 0.80232, 0.87558, 2.36202, -2.01887, -0.88918, 2.65871, + -1.31587, 1.07476, 0.30491, -1.32892, 0.38018, -0.30714, -1.16178, + 0.71481, -1.01334, -0.44373, 0.51255, -0.17171, -0.88886, 0.02071, + -0.53279, -0.22139, -1.02436, 0.17948, 0.95697, 0.49272, 0.31848, + -0.58293, 0.14484, 0.22063, 1.0336 , -1.90554, 
1.10291, -2.05589, + -0.16964, -0.82442, 1.27217, -1.24968, -0.90372, 0.05546, 0.2535 , + -0.03533, -0.31773, 0.43704, 0.21699, 0.10519, 2.12775, -0.48196, + -0.08445, -0.13156, -0.68362, 0.64765, 0.32537, 0.79493, 1.94577, + -0.63827, 0.97057, -0.46039, 1.51801, -0.62955, -0.43632, 0.25876, + -0.46623, -0.46963, 1.3532 , -0.07362, -1.53211, 0.69676, -0.92407, + 0.07153, 0.67173, 0.27661, -0.51579, -0.49019, 1.06603, -0.97673, + -1.21231, -1.54966, -0.07795, 0.32697, 0.02873, 1.38703, 0.41725, + 0.78326, -0.7257 , 0.54165, 1.38882, 0.27304, 1.0739 , 0.74654, + 1.35561, 1.18697, 1.09146, 0.17552, -0.30773, 0.27812, -1.674 , + -0.31073, -0.40745, 0.51546, -1.10875, 2.0081 , -1.27931, -1.16321, + 0.95652, 0.7487 , -0.2347 , 0.20324, -0.41417, 0.05929, 0.72632, + -1.15223, 1.2745 , -0.15947 }; static const float wr[122] = { --0.7856, -0.50352, 0.12392, -1.00316, -2.46556, -0.1627, 0.18966, 0.0043, 0.0, -0.13681, 1.11245, 0.0, 0.0, -0.02781, -2.77982, 0.0, -0.91035, 0.60015, --1.27266, 0.0, 0.0, 0.0, 0.0, -7.26713, -0.19412, -1.05121, 0.27448, -4.94251, - -0.06844, 0.37183, -3.66465, -0.8305, 0.09266, 0.07217, 0.0, 0.29906, -1.26062, -0.17405, 0.48302, 2.00366, 0.92321, -0.10839, 1.06349, 0.39521, 3.4204, -0.00576, 5.35, 3.8539, -0.09308, 0.17253, 0.13978, 0.2701, -0.52728, 0.88296, -0.2252, 0.0, 0.0, -0.12707, 3.05454, 0.31202, -0.88035, -0.01351, 0.0, --3.40177, -0.22082, -0.13022, -0.09795, -2.29847, -12.32252, 0.0, -0.13597, -0.12039, 0.85631, 0.0, 0.0, -0.3424, 0.24855, 0.20178, 2.30052, 1.5068, -0.0, -0.07456, 5.16874, 0.01418, -1.3464, -1.29506, 0.0, 0.0, -1.40375, -0.0, -0.11696, 0.05281, -9.67677, 0.05685, -1.09167, 0.0, 0.0, -2.56906, -2.19605, 0.0, 0.68178, -0.08471, 0.0, -2.34631, 1.49549, -2.16183, 0.0, -1.16242, 1.08744, -0.1716, 0.25236, 0.13246, -0.37646, 0.0, -2.87401, -0.74427, 1.07274, -0.01591, -0.14818, -0.06285, 0.08302, -1.03508 -}; + 0.13119, -0.13164, -1.2736 , 1.06352, -1.34749, -1.03086, -0.27417, + -0.27762, 0.79454, -1.12623, 2.1134 , -0.7003 , 0.26056, -1.13518, + -1.64548, -1.30828, -0.96589, -0.36258, -1.14323, -0.2006 , -1.00307, + 0.57739, -0.62693, 0.29721, -0.36996, -0.17462, 0.96704, 0.08902, + 1.4337 , -0.47107, 0.82156, 0.14988, 1.74034, 1.13313, -0.32083, + -0.00048, -0.86622, 1.12808, 0.99875, 0.8049 , -0.16841, -0.42677, + -1.9409 , -0.53565, -0.83708, 0.69603, 0.32079, 0.56942, 0.67965, + 1.49328, -1.65885, 0.96284, 0.63196, -0.27504, 0.39174, 0.71225, + -0.3614 , 0.88761, 1.12882, 0.77764, 1.02618, -0.20245, -0.39245, + -1.56799, 1.04888, -1.20858, -0.24361, -1.85157, -0.16912, 0.50512, + -2.93122, 0.70477, -0.93066, 1.74867, 0.23963, -0.00699, -1.27183, + -0.30604, 1.71039, 0.82202, -1.36734, -1.08352, -1.25054, 0.49436, + -1.5037 , -0.73143, 0.74189, 0.32365, 0.30539, -0.72169, 0.41088, + -1.56632, -0.63526, 0.58779, -0.05653, 0.76713, -1.40898, -0.33683, + 1.86802, 0.59773, 1.28668, -0.65817, 2.46829, -0.09331, 2.9034 , + 1.04809, 0.73222, -0.44372, 0.53044, -1.9274 , -1.57183, -1.14068, + 1.26036, -0.9296 , 0.06662, -0.26572, -0.30862, 0.72915, 0.98977, + 0.63513, -1.43917, -0.12523 }; void setx(int pos[]) { @@ -179,7 +174,7 @@ static PyMethodDef dumbeval_methods[] = { static struct PyModuleDef dumbeval_definition = { PyModuleDef_HEAD_INIT, "dumbeval", - "A Python module that implements Gerald Tesauro's dumbeval function for evaluation backgammon positions.", + "A Python module that implements Gerald Tesauro's pubeval function for evaluation backgammon positions with badly initialized weights.", -1, dumbeval_methods }; diff --git 
a/dumbeval/weights.py b/dumbeval/weights.py new file mode 100644 index 0000000..bf02340 --- /dev/null +++ b/dumbeval/weights.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 +import numpy as np +import re + +re.DOTALL = True + +np.set_printoptions(precision=5, suppress=True, threshold=np.nan) +def random_array_string(): + return re.sub(r'^\[(.*)\]$(?s)', r'{\n\1 };', np.array2string(np.random.normal(0,1,122), separator=', ')) + +print("/* Weights generated by weights.py */") +print("static const float wc[122] =", random_array_string()) +print() +print("static const float wr[122] =", random_array_string()) From 26540062225dcc8a5399b5f74baf926f4b1e4d77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoffer=20M=C3=BCller=20Madsen?= Date: Tue, 27 Mar 2018 13:02:36 +0200 Subject: [PATCH 11/17] fix wrongful mergings --- network.py | 75 ++++++++++++++++-------------------------------------- 1 file changed, 22 insertions(+), 53 deletions(-) diff --git a/network.py b/network.py index 6358761..6ceefbe 100644 --- a/network.py +++ b/network.py @@ -152,8 +152,8 @@ class Network: # print("Found the best state, being:", np.array(move_scores).argmax()) return best_move_pair - def eval(self, trained_eps=0): - def do_eval(sess, method, episodes=1000, trained_eps=trained_eps): + def eval(self, episode_count, trained_eps = 0, tf_session = None): + def do_eval(sess, method, episodes = 1000, trained_eps = 0): start_time = time.time() def print_time_estimate(eps_completed): @@ -290,17 +290,26 @@ class Network: else: sys.stderr.write("[EVAL ] Evaluation method '{}' is not defined\n".format(method)) return [0] - - with tf.Session() as session: - session.run(tf.global_variables_initializer()) - self.restore_model(session) - outcomes = [(method, do_eval(session, - method, - self.config['episode_count'], - trained_eps=trained_eps)) - for method - in self.config['eval_methods']] - return outcomes + + if tf_session == None: + with tf.Session(): + session.run(tf.global_variables_initializer()) + self.restore_model(session) + outcomes = [ (method, do_eval(session, + method, + episode_count, + trained_eps = trained_eps)) + for method + in self.config['eval_methods'] ] + return outcomes + else: + outcomes = [ (method, do_eval(tf_session, + method, + episode_count, + trained_eps = trained_eps)) + for method + in self.config['eval_methods'] ] + return outcomes def train_model(self, episodes=1000, save_step_size=100, trained_eps=0): with tf.Session() as sess: @@ -401,43 +410,3 @@ class Network: # save the current state again, so we can continue running backprop based on the "previous" turn. # NOTE: We need to make a method so that we can take a single turn or at least just pick the next best move, so we know how to evaluate according to TD-learning. Right now, our game just continues in a while loop without nothing to stop it! - - - - def eval(self, episode_count, trained_eps = 0, tf_session = None): - def do_eval(sess, method, episodes = 1000, trained_eps = 0): - start_time = time.time() - - writer.close() - - return outcomes - - # take turn, which finds the best state and picks it, based on the current network - # save current state - # run training operation (session.run(self.training_op, {x:x, value_next, value_next})), - # (something which does the backprop, based on the state after having taken a turn, - # found before, and the state we saved in the beginning and from now we'll - # save it at the end of the turn - - # save the current state again, so we can continue running backprop based on the "previous" turn. 
- - - if tf_session == None: - with tf.Session(): - session.run(tf.global_variables_initializer()) - self.restore_model(session) - outcomes = [ (method, do_eval(session, - method, - episode_count, - trained_eps = trained_eps)) - for method - in self.config['eval_methods'] ] - return outcomes - else: - outcomes = [ (method, do_eval(tf_session, - method, - episode_count, - trained_eps = trained_eps)) - for method - in self.config['eval_methods'] ] - return outcomes From 785ae6a5be445e4511d389d3b4ec780094cad276 Mon Sep 17 00:00:00 2001 From: alex Date: Wed, 28 Mar 2018 00:16:50 +0200 Subject: [PATCH 12/17] Fixed wrongful appending of current player to board rep --- board.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/board.py b/board.py index 767ae45..35038ed 100644 --- a/board.py +++ b/board.py @@ -42,7 +42,7 @@ class Board: negatives = [x if x < 0 else 0 for x in board] board.append(15 - sum(positives)) board.append(-15 - sum(negatives)) - board += ([1, 0] if np.sign(player) > 0 else [1, 0]) + board += ([1, 0] if np.sign(player) > 0 else [0, 1]) return np.array(board).reshape(1,-1) @@ -67,7 +67,7 @@ class Board: features.append((board[0] if np.sign(player) < 0 else board[25]) / 2.0) # Calculate how many pieces there must be in the home state and divide it by 15 features.append((15 - sum) / 15) - features += ([1,0] if np.sign(cur_player) > 0 else [1,0]) + features += ([1,0] if np.sign(cur_player) > 0 else [0,1]) test = np.array(features).reshape(1,-1) #print("TEST:",test) return test From 95b12a6c35e94e01efffe54fc6d8204960994d51 Mon Sep 17 00:00:00 2001 From: alex Date: Wed, 28 Mar 2018 00:33:39 +0200 Subject: [PATCH 13/17] Added another board_rep --- board.py | 6 ++++++ network.py | 3 +++ 2 files changed, 9 insertions(+) diff --git a/board.py b/board.py index 35038ed..1b018d9 100644 --- a/board.py +++ b/board.py @@ -35,6 +35,12 @@ class Board: board.append(-15 - sum(negatives)) return tuple(board) + @staticmethod + def board_features_to_slimmed_down_own(board, player): + board = list(board) + board += ([1, 0] if np.sign(player) > 0 else [0, 1]) + return np.array(board).reshape(1, -1) + @staticmethod def board_features_to_own(board, player): board = list(board) diff --git a/network.py b/network.py index 6ceefbe..082f322 100644 --- a/network.py +++ b/network.py @@ -15,6 +15,9 @@ class Network: output_size = 1 # Can't remember the best learning_rate, look this up learning_rate = 0.01 + # board_features_to_own has size 30 + # board_features_to_tesauro has size 198 + # board_features_to_slimmed_down_own has size 28 board_rep = Board.board_features_to_own def custom_tanh(self, x, name=None): From abce56dd4009d946ff9f792f0d4e3987dac4db0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoffer=20M=C3=BCller=20Madsen?= Date: Tue, 27 Mar 2018 23:13:59 +0000 Subject: [PATCH 14/17] fix typo --- network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/network.py b/network.py index 082f322..f1f1859 100644 --- a/network.py +++ b/network.py @@ -295,7 +295,7 @@ class Network: return [0] if tf_session == None: - with tf.Session(): + with tf.Session() as session: session.run(tf.global_variables_initializer()) self.restore_model(session) outcomes = [ (method, do_eval(session, From fda2c6e08d39eca1e6c618db9bcba9fe5092f15c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoffer=20M=C3=BCller=20Madsen?= Date: Wed, 28 Mar 2018 12:00:47 +0200 Subject: [PATCH 15/17] parametric board representation in network --- board.py | 13 ++++++++----- main.py | 3 ++- network.py | 47 
++++++++++++++++++++++++++++------------------- 3 files changed, 38 insertions(+), 25 deletions(-) diff --git a/board.py b/board.py index 1b018d9..33303c1 100644 --- a/board.py +++ b/board.py @@ -31,29 +31,32 @@ class Board: board = list(board) positives = [x if x > 0 else 0 for x in board] negatives = [x if x < 0 else 0 for x in board] - board.append(15 - sum(positives)) + board.append( 15 - sum(positives)) board.append(-15 - sum(negatives)) return tuple(board) + # quack @staticmethod - def board_features_to_slimmed_down_own(board, player): + def board_features_quack(board, player): board = list(board) board += ([1, 0] if np.sign(player) > 0 else [0, 1]) return np.array(board).reshape(1, -1) + # quack-fat @staticmethod - def board_features_to_own(board, player): + def board_features_quack_fat(board, player): board = list(board) positives = [x if x > 0 else 0 for x in board] negatives = [x if x < 0 else 0 for x in board] - board.append(15 - sum(positives)) + board.append( 15 - sum(positives)) board.append(-15 - sum(negatives)) board += ([1, 0] if np.sign(player) > 0 else [0, 1]) return np.array(board).reshape(1,-1) + # tesauro @staticmethod - def board_features_to_tesauro(board, cur_player): + def board_features_tesauro(board, cur_player): features = [] for player in [-1,1]: sum = 0.0 diff --git a/main.py b/main.py index b5a8ad0..f6a375e 100644 --- a/main.py +++ b/main.py @@ -46,7 +46,8 @@ config = { 'start_episode': args.start_episode, 'train_perpetually': args.train_perpetually, 'model_storage_path': 'models', - 'bench_storage_path': 'bench' + 'bench_storage_path': 'bench', + 'board_representation': 'quack' } # Create models folder diff --git a/network.py b/network.py index f1f1859..d19f23c 100644 --- a/network.py +++ b/network.py @@ -10,15 +10,15 @@ from eval import Eval class Network: - hidden_size = 40 - input_size = 30 - output_size = 1 - # Can't remember the best learning_rate, look this up - learning_rate = 0.01 - # board_features_to_own has size 30 - # board_features_to_tesauro has size 198 - # board_features_to_slimmed_down_own has size 28 - board_rep = Board.board_features_to_own + # board_features_quack has size 28 + # board_features_quack_fat has size 30 + # board_features_tesauro has size 198 + + board_reps = { + 'quack-fat' : (30, Board.board_features_quack_fat), + 'quack' : (28, Board.board_features_quack), + 'tesauro' : (198, Board.board_features_tesauro) + } def custom_tanh(self, x, name=None): return tf.scalar_mul(tf.constant(2.00), tf.tanh(x, name)) @@ -29,6 +29,15 @@ class Network: self.name = name + # Set board representation from config + self.input_size, self.board_trans_func = Network.board_reps[ + self.config['board_representation'] + ] + self.output_size = 1 + self.hidden_size = 40 + # Can't remember the best learning_rate, look this up + self.learning_rate = 0.01 + # Restore trained episode count for model episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained") if os.path.isfile(episode_count_path): @@ -38,19 +47,19 @@ class Network: self.episodes_trained = 0 # input = x - self.x = tf.placeholder('float', [1, Network.input_size], name='input') - self.value_next = tf.placeholder('float', [1, Network.output_size], name="value_next") + self.x = tf.placeholder('float', [1, self.input_size], name='input') + self.value_next = tf.placeholder('float', [1, self.output_size], name="value_next") xavier_init = tf.contrib.layers.xavier_initializer() - W_1 = tf.get_variable("w_1", (Network.input_size, Network.hidden_size), + W_1 = tf.get_variable("w_1", 
(self.input_size, self.hidden_size), initializer=xavier_init) - W_2 = tf.get_variable("w_2", (Network.hidden_size, Network.output_size), + W_2 = tf.get_variable("w_2", (self.hidden_size, self.output_size), initializer=xavier_init) - b_1 = tf.get_variable("b_1", (Network.hidden_size,), + b_1 = tf.get_variable("b_1", (self.hidden_size,), initializer=tf.zeros_initializer) - b_2 = tf.get_variable("b_2", (Network.output_size,), + b_2 = tf.get_variable("b_2", (self.output_size,), initializer=tf.zeros_initializer) @@ -74,7 +83,7 @@ class Network: with tf.variable_scope('apply_gradients'): for gradient, trainable_var in zip(gradients, trainable_vars): # Hopefully this is Δw_t = α(V_t+1 - V_t)▿_wV_t. - backprop_calc = Network.learning_rate * difference_in_values * gradient + backprop_calc = self.learning_rate * difference_in_values * gradient grad_apply = trainable_var.assign_add(backprop_calc) apply_gradients.append(grad_apply) @@ -148,7 +157,7 @@ class Network: def make_move(self, sess, board, roll, player): # print(Board.pretty(board)) legal_moves = Board.calculate_legal_states(board, player, roll) - moves_and_scores = [(move, self.eval_state(sess, Network.board_rep(move, player))) for move in legal_moves] + moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves] scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores] best_score_index = np.array(scores).argmax() best_move_pair = moves_and_scores[best_score_index] @@ -367,7 +376,7 @@ class Network: # adjust weights sess.run(self.training_op, - feed_dict={self.x: Network.board_rep(prev_board, player), + feed_dict={self.x: self.board_trans_func(prev_board, player), self.value_next: cur_board_value}) player *= -1 @@ -386,7 +395,7 @@ class Network: with tf.name_scope("final"): merged = tf.summary.merge_all() summary, _ = sess.run([merged, self.training_op], - feed_dict={self.x: Network.board_rep(prev_board, player), + feed_dict={self.x: self.board_trans_func(prev_board, player), self.value_next: scaled_final_score.reshape((1, 1))}) writer.add_summary(summary, episode + trained_eps) From 17f5b62e9b3fc3e9662f41a8b69ff94ccb506f8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoffer=20M=C3=BCller=20Madsen?= Date: Wed, 28 Mar 2018 14:36:52 +0200 Subject: [PATCH 16/17] proper Tesauro board representation --- board.py | 53 +++++----- network.py | 6 +- test.py | 306 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 338 insertions(+), 27 deletions(-) diff --git a/board.py b/board.py index 33303c1..2136e47 100644 --- a/board.py +++ b/board.py @@ -57,30 +57,33 @@ class Board: # tesauro @staticmethod def board_features_tesauro(board, cur_player): - features = [] - for player in [-1,1]: - sum = 0.0 - for board_range in range(1,25): - pin = board[board_range] - #print("PIIIN:",pin) - feature = [0.0]*4 - if np.sign(pin) == np.sign(player): - sum += abs(pin) - for i in range(min(abs(pin), 3)): - feature[i] = 1 - if (abs(pin) > 3): - feature[3] = (abs(pin)-3)/2 - features += feature - #print("SUUUM:",sum) - # Append the amount of men on the bar of the current player divided by 2 - features.append((board[0] if np.sign(player) < 0 else board[25]) / 2.0) - # Calculate how many pieces there must be in the home state and divide it by 15 - features.append((15 - sum) / 15) - features += ([1,0] if np.sign(cur_player) > 0 else [0,1]) - test = np.array(features).reshape(1,-1) - #print("TEST:",test) - return test + def ordinary_trans(val, player): + abs_val = val * player + if 
abs_val <= 0: return (0,0,0,0) + elif abs_val == 1: return (1,0,0,0) + elif abs_val == 2: return (1,1,0,0) + elif abs_val == 3: return (1,1,1,0) + else: return (1,1,1, (abs_val - 3) / 2) + def bar_trans(board, player): + if player == 1: return (abs(board[0]/2),) + elif player == -1: return (abs(board[25]/2),) + + # def ordinary_trans_board(board, player): + # return np.array( + # [ordinary_trans(x, player) for x in board[1:25]] + # ).flatten() + + board_rep = [] + for player in [1,-1]: + for x in board[1:25]: + board_rep += ordinary_trans(x, player) + board_rep += bar_trans(board, player) + board_rep += (15 - Board.num_of_checkers_for_player(board, player),) + + board_rep += ([1,0] if cur_player == 1 else [0,1]) + + return np.array(board_rep).reshape(1,198) @staticmethod @@ -295,9 +298,9 @@ class Board: return """ 13 14 15 16 17 18 19 20 21 22 23 24 +--------------------------------------------------------------------------+ -| {12}| {11}| {10}| {9}| {8}| {7}| bar -1: {0} | {6}| {5}| {4}| {3}| {2}| {1}| end -1: TODO| +| {13}| {14}| {15}| {16}| {17}| {18}| bar -1: {25} | {19}| {20}| {21}| {22}| {23}| {24}| end -1: TODO| |---|---|---|---|---|---|------------|---|---|---|---|---|---| | -| {13}| {14}| {15}| {16}| {17}| {18}| bar 1: {25} | {19}| {20}| {21}| {22}| {23}| {24}| end 1: TODO| +| {12}| {11}| {10}| {9}| {8}| {7}| bar 1: {0} | {6}| {5}| {4}| {3}| {2}| {1}| end 1: TODO| +--------------------------------------------------------------------------+ 12 11 10 9 8 7 6 5 4 3 2 1 """.format(*temp) diff --git a/network.py b/network.py index d19f23c..2722f6a 100644 --- a/network.py +++ b/network.py @@ -365,13 +365,15 @@ class Network: # first thing inside of the while loop and then call # best_move_and_score to get V_t+1 + i = 0 while Board.outcome(prev_board) is None: + i += 1 #print("PREEEV_BOOOOAAARD:",prev_board) cur_board, cur_board_value = self.make_move(sess, prev_board, (random.randrange(1, 7), random.randrange(1, 7)), player) - + #print("The current value:",cur_board_value) # adjust weights @@ -385,7 +387,7 @@ class Network: prev_board = cur_board final_board = prev_board - sys.stderr.write("\t outcome {}".format(Board.outcome(final_board)[1])) + sys.stderr.write("\t outcome {}\t turns {}".format(Board.outcome(final_board)[1], i)) outcomes.append(Board.outcome(final_board)[1]) final_score = np.array([Board.outcome(final_board)[1]]) scaled_final_score = ((final_score + 2) / 4) diff --git a/test.py b/test.py index efc243e..6c9c130 100644 --- a/test.py +++ b/test.py @@ -613,6 +613,312 @@ class TestBoardFlip(unittest.TestCase): -2) self.assertEqual(Board.flip(Board.flip(board)), board) + + def test_tesauro_initial(self): + board = Board.initial_state + + expected = (1,1,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,1,1,1, + + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,1,1,0, + 0,0,0,0, + + 1,1,1,1, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 0.0, + 0, + + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,1,1,1, + + 0,0,0,0, + 1,1,1,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 1,1,1,1, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,1,0,0, + + 0.0, + 0, + + 1, + 0 + ) + + import numpy as np + self.assertTrue((Board.board_features_tesauro(board, 1) == + np.array(expected).reshape(1, 198)).all()) + + def test_tesauro_bars(self): + board = list(Board.initial_state) + board[1] = 0 + board[0] = 2 + board[24] = 0 + board[25] = 
-2 + + board = tuple(board) + + expected = (0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,1,1,1, + + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,1,1,0, + 0,0,0,0, + + 1,1,1,1, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 1.0, + 0, + + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,1,1,1, + + 0,0,0,0, + 1,1,1,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 1,1,1,1, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 1.0, + 0, + + 1, + 0 + ) + + import numpy as np + self.assertTrue((Board.board_features_tesauro(board, 1) == + np.array(expected).reshape(1, 198)).all()) + + + def test_tesauro_home(self): + board = list(Board.initial_state) + + board[1] = 0 + board[24] = 0 + + board = tuple(board) + + expected = (0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,1,1,1, + + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,1,1,0, + 0,0,0,0, + + 1,1,1,1, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 0.0, + 2, + + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,1,1,1, + + 0,0,0,0, + 1,1,1,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 1,1,1,1, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 0.0, + 2, + + 1, + 0 + ) + + import numpy as np + self.assertTrue((Board.board_features_tesauro(board, 1) == + np.array(expected).reshape(1, 198)).all()) + + + def test_tesauro_black_player(self): + board = Board.initial_state + + expected = (1,1,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,1,1,1, + + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,1,1,0, + 0,0,0,0, + + 1,1,1,1, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 0.0, + 0, + + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,1,1,1, + + 0,0,0,0, + 1,1,1,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 1,1,1,1, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 0,0,0,0, + 1,1,0,0, + + 0.0, + 0, + + 0, + 1 + ) + + import numpy as np + self.assertTrue((Board.board_features_tesauro(board, -1) == + np.array(expected).reshape(1, 198)).all()) + if __name__ == '__main__': unittest.main() From 8764fadd6af82f02e348e912fc9540293d6dd50b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoffer=20M=C3=BCller=20Madsen?= Date: Wed, 28 Mar 2018 15:32:22 +0200 Subject: [PATCH 17/17] train-evaluate-save --- bin/train-evaluate-save | 47 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100755 bin/train-evaluate-save diff --git a/bin/train-evaluate-save b/bin/train-evaluate-save new file mode 100755 index 0000000..00b6411 --- /dev/null +++ b/bin/train-evaluate-save @@ -0,0 +1,47 @@ +#!/usr/bin/env ruby +def save(model_name) + require 'date' + + models_dir = 'models' + model_path = File.join(models_dir, model_name) + if not File.exists? model_path then + return false + end + + episode_count = (File.read File.join(model_path, 'episodes_trained')).to_i + + puts "Found model #{model_name} with episodes #{episode_count} trained!" 
+ + file_name = "model-#{model_name}-#{episode_count}-#{Time.now.strftime('%Y%m%d-%H%M%S')}.tar.gz" + save_path = File.join(models_dir, 'saves', file_name) + puts "Saving to #{save_path}" + + system("tar", "-cvzf", save_path, "-C", models_dir, model_name) + + return true +end + +def train(model, episodes) + system("python3", "main.py", "--train", "--model", model, "--episodes", episodes.to_s) +end + +def evaluate(model, episodes, method) + system("python3", "main.py", "--eval" , "--model", model, "--episodes", episodes.to_s, "--eval-methods", method) +end + +model = ARGV[0] + +if model.nil? then raise "no model specified" end + +while true do + save model + train model, 1000 + save model + train model, 1000 + 3.times do + evaluate model, 250, "pubeval" + end + 3.times do + evaluate model, 250, "dumbeval" + end +end
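

A quick sanity check on the input sizes registered in Network.board_reps (PATCH 15/17) and exercised by the new tests (PATCH 16/17): 'quack' is the raw 26-entry board (24 points plus the two bar entries) with a two-element current-player indicator appended; 'quack-fat' additionally appends the two borne-off checker counts; 'tesauro' uses four features per point per player plus a bar count and an off count for each player, plus the player indicator. The sketch below only illustrates that arithmetic — it is not part of the patches themselves.

# Illustrative sketch (not from the patches): where the Network.board_reps sizes come from.
quack     = 26 + 2                    # 24 points + 2 bars, plus one-hot current player   -> 28
quack_fat = 26 + 2 + 2                # ... plus the two borne-off checker counts         -> 30
tesauro   = 2 * (24 * 4 + 1 + 1) + 2  # per player: 4 features/point, bar, off; + player  -> 198
assert (quack, quack_fat, tesauro) == (28, 30, 198)

These are the same sizes that test.py checks implicitly by comparing the Tesauro features against expected arrays reshaped to (1, 198), and that main.py now selects through the 'board_representation' config key.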