From cb7e7b519c0123c0cf4c0dee41de3b3f00305e04 Mon Sep 17 00:00:00 2001
From: Alexander Munch-Hansen
Date: Wed, 9 May 2018 22:22:12 +0200
Subject: [PATCH] Getting closer to functionality. We can now evaluate moves,
 and a rework of global_step has begun: episode_count is now used to
 calculate the learning-rate decay, which has been implemented as the
 exp_decay function.
---
 network.py                           | 73 ++++++++++++++++------------
 network_test.py                      | 16 ++++--
 tensorflow_impl_tests/eager_main.py  | 33 ++++++++-----
 tensorflow_impl_tests/normal_main.py | 11 +++--
 4 files changed, 82 insertions(+), 51 deletions(-)
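Note (not part of the commit): a quick standalone check of the staircase decay
schedule this patch introduces. The helper mirrors the new Network.exp_decay,
and the constants (0.1, 0.001, 0.96, 50000) are the ones do_backprop now uses;
the loop and its sample episode counts are only illustrative.

    # Staircase exponential decay, clamped at the minimum learning rate.
    def exp_decay(max_lr, episode_count, decay_rate, decay_steps):
        return max_lr * decay_rate ** (episode_count // decay_steps)

    for episodes in (0, 49999, 50000, 150000):
        lr = max(0.001, exp_decay(0.1, episodes, 0.96, 50000))
        print(episodes, lr)  # 0.1, 0.1, 0.096, ~0.0885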
from:".format(name=self.name), str(latest_checkpoint)) - tfe.Saver(model.variables).restore(latest_checkpoint) + tfe.Saver(self.model.variables).restore(latest_checkpoint) - variables_names = [v.name for v in self.model.variables] + # variables_names = [v.name for v in self.model.variables] # Restore trained episode count for model @@ -130,11 +134,11 @@ class Network: if os.path.isfile(episode_count_path): with open(episode_count_path, 'r') as f: self.config['start_episode'] = int(f.read()) - else: - latest_checkpoint = tf.train.latest_checkpoint("./") - print("[NETWK] ({name}) Restoring model from:".format(name=self.name), - str(latest_checkpoint)) - tfe.Saver(self.model.variables).restore(latest_checkpoint) + # else: + # latest_checkpoint = tf.train.latest_checkpoint("./") + # print("[NETWK] ({name}) Restoring model from:".format(name=self.name), + # str(latest_checkpoint)) + # tfe.Saver(self.model.variables).restore(latest_checkpoint) #variables_names = [v.name for v in self.model.variables] @@ -143,9 +147,9 @@ class Network: #if os.path.isfile(episode_count_path): # with open(episode_count_path, 'r') as f: # self.config['start_episode'] = int(f.read()) + tf.train.get_or_create_global_step() - - def make_move(self, sess, board, roll, player): + def make_move(self, board, roll, player): """ Find the best move given a board, roll and a player, by finding all possible states one can go to and then picking the best, by using the network to evaluate each state. The highest score is picked @@ -157,12 +161,19 @@ class Network: :param player: Current player :return: A pair of the best state to go to, together with the score of that state """ - legal_moves = Board.calculate_legal_states(board, player, roll) - moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves] - scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores] - best_score_index = np.array(scores).argmax() - best_move_pair = moves_and_scores[best_score_index] - return best_move_pair + legal_states = list(Board.calculate_legal_states(board, player, roll)) + legal_states = [list(tmp) for tmp in legal_states] + legal_states = np.array([Board.board_features_quack_fat(tmp, player)[0] for tmp in legal_states]) + legal_moves = [self.board_trans_func(board, player) for board in Board.calculate_legal_states(board, player, roll)] + + scores = self.model.predict_on_batch(legal_states) + transformed_scores = [x if np.sign(player) > 0 else 1 - x for x in scores] + + best_score_idx = np.argmax(np.array(transformed_scores)) + best_move = legal_moves[best_score_idx] + best_score = scores[best_score_idx] + self.episodes_trained += 1 + return [best_move, best_score] def make_move_n_ply(self, sess, board, roll, player, n = 1): best_pair = self.calc_n_ply(n, sess, board, player, roll) diff --git a/network_test.py b/network_test.py index 58fec8a..5fb6d6e 100644 --- a/network_test.py +++ b/network_test.py @@ -9,7 +9,7 @@ from board import Board import main config = main.config.copy() -config['model'] = "tesauro_blah" +config['model'] = "eager_testings" config['force_creation'] = True config['board_representation'] = 'quack-fat' network = Network(config, config['model']) @@ -75,10 +75,18 @@ def calculate_possible_states(board): #print(network.calculate_1_ply(session, Board.initial_state, [2,4], 1)) board = network.board_trans_func(Board.initial_state, 1) +#print(board) -input = [0, 2, 0, 0, 0, 0, -5, 0, -3, 0, 0, 0, 5, -5, 0, 0, 0, 3, 0, 5, 0, 0, 0, 0, -2, 0, 0, 0, 1, 0] -all_input 
diff --git a/network_test.py b/network_test.py
index 58fec8a..5fb6d6e 100644
--- a/network_test.py
+++ b/network_test.py
@@ -9,7 +9,7 @@ from board import Board
 import main
 
 config = main.config.copy()
-config['model'] = "tesauro_blah"
+config['model'] = "eager_testings"
 config['force_creation'] = True
 config['board_representation'] = 'quack-fat'
 network = Network(config, config['model'])
@@ -75,10 +75,18 @@ def calculate_possible_states(board):
 #print(network.calculate_1_ply(session, Board.initial_state, [2,4], 1))
 
 board = network.board_trans_func(Board.initial_state, 1)
+#print(board)
 
-input = [0, 2, 0, 0, 0, 0, -5, 0, -3, 0, 0, 0, 5, -5, 0, 0, 0, 3, 0, 5, 0, 0, 0, 0, -2, 0, 0, 0, 1, 0]
-all_input = np.array([input for _ in range(20)])
-print(network.calc_vals(all_input))
+pair = network.make_move(Board.initial_state, [3,2], 1)
+
+print(pair[1])
+
+network.do_backprop(board, 0.9)
+
+network.save_model(2, 342)
+
+# all_input = np.array([input for _ in range(20)])
+# print(network.calc_vals(all_input))
 
 #print(" "*10 + "network_test")
diff --git a/tensorflow_impl_tests/eager_main.py b/tensorflow_impl_tests/eager_main.py
index b2da143..f68f65f 100644
--- a/tensorflow_impl_tests/eager_main.py
+++ b/tensorflow_impl_tests/eager_main.py
@@ -1,6 +1,7 @@
 import time
 import numpy as np
 import tensorflow as tf
+from board import Board
 import tensorflow.contrib.eager as tfe
 
 
@@ -23,12 +24,14 @@ model = tf.keras.Sequential([
 
 #tfe.Saver(model.variables).restore(tf.train.latest_checkpoint("./"))
 
-input = [0, 2, 0, 0, 0, 0, -5, 0, -3, 0, 0, 0, 5, -5, 0, 0, 0, 3, 0, 5, 0, 0, 0, 0, -2, 0, 0, 0, 1, 0]
-
-all_input = np.array([input for _ in range(20)])
+input = [0, 2, 0, 0, 0, 0, -5, 0, -3, 0, 0, 0, 5, -5, 0, 0, 0, 3, 0, 5, 0, 0, 0, 0, -2, 0]
 
-single_in = np.array(input).reshape(1,-1)
+
+all_input = np.array([Board.board_features_quack_fat(input, 1) for _ in range(20)])
+
+
+single_in = Board.board_features_quack_fat(input, 1)
 
 
 start = time.time()
@@ -48,10 +51,10 @@ print(time.time() - start)
 print("-"*30)
 
 with tf.GradientTape() as tape:
-    val = model(np.array(input).reshape(1,-1))
+    val = model(single_in)
 grads = tape.gradient(val, model.variables)
 
-grads = [0.1*val-np.random.uniform(-1,1)+grad for grad, trainable_var in zip(grads, model.variables)]
+# grads = [0.1*val-np.random.uniform(-1,1)+grad for grad, trainable_var in zip(grads, model.variables)]
 
 # print(model.variables[0][0])
 weights_before = model.weights[0]
@@ -60,14 +63,20 @@ start = time.time()
 
 #[trainable_var.assign_add(0.1*val-0.3+grad) for grad, trainable_var in zip(grads, model.variables)]
 
 start = time.time()
-#for gradient, trainable_var in zip(grads, model.variables):
-#    backprop_calc = 0.1 * (val - np.random.uniform(-1, 1)) * gradient
-#    trainable_var.assign_add(backprop_calc)
+for gradient, trainable_var in zip(grads, model.variables):
+    backprop_calc = 0.1 * (0.9 - val) * gradient
+    trainable_var.assign_add(backprop_calc)
 
-opt.apply_gradients(zip(grads, model.variables))
+# opt.apply_gradients(zip(grads, model.variables))
 
 print(time.time() - start)
 
-print(model(np.array(input).reshape(1,-1)))
+print(model(single_in))
 
-tfe.Saver(model.variables).save("./tmp_ckpt")
+vals = model.predict_on_batch(all_input)
+vals = list(vals)
+vals[3] = 4
+print(vals)
+print(np.argmax(np.array(vals)))
+
+# tfe.Saver(model.variables).save("./tmp_ckpt")
diff --git a/tensorflow_impl_tests/normal_main.py b/tensorflow_impl_tests/normal_main.py
index 865f017..8e3887d 100644
--- a/tensorflow_impl_tests/normal_main.py
+++ b/tensorflow_impl_tests/normal_main.py
@@ -35,15 +35,16 @@ class Everything:
 
         trainable_vars = tf.trainable_variables()
         gradients = tf.gradients(self.value, trainable_vars)
 
+        difference_in_values = tf.reshape(tf.subtract(0.9, self.value, name='difference_in_values'), [])
 
         with tf.variable_scope('apply_gradients'):
             for gradient, trainable_var in zip(gradients, trainable_vars):
-                backprop_calc = self.learning_rate * difference_in_values * gradient
+                backprop_calc = 0.1 * difference_in_values * gradient
                 grad_apply = trainable_var.assign_add(backprop_calc)
                 apply_gradients.append(grad_apply)
 
-        with tf.control_dependencies([global_step_op]):
-            self.training_op = tf.group(*apply_gradients, name='training_op')
-
+
+        self.training_op = tf.group(*apply_gradients, name='training_op')
 
@@ -56,7 +57,9 @@ class Everything:
         val = sess.run(self.value, feed_dict={self.input: input.reshape(1,-1)})
         print(time.time() - start)
         print(val)
-
+        sess.run(self.training_op, feed_dict={self.input: input.reshape(1,-1)})
+        val = sess.run(self.value, feed_dict={self.input: input.reshape(1, -1)})
+        print(val)
 
 everything = Everything()
 everything.eval()
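Note (not part of the commit): do_backprop, the eager test, and the graph-mode
test above all apply the same manual TD-style update, variable +=
learning_rate * (target - value) * gradient, rather than a stock optimizer. A
tiny NumPy sketch of that rule on a linear value function (all names and
numbers below are illustrative only):

    import numpy as np

    w = np.array([0.5, -0.2])           # toy weights of a linear value function
    x = np.array([1.0, 2.0])            # toy state features
    value = float(w @ x)                # current estimate V(s) = 0.1
    target = 0.9                        # value of the next state, V(s')
    grad = x                            # dV/dw for a linear model
    w += 0.1 * (target - value) * grad  # nudge V(s) toward V(s')
    print(w)                            # [0.58, -0.04]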