From cb7e7b519c0123c0cf4c0dee41de3b3f00305e04 Mon Sep 17 00:00:00 2001
From: Alexander Munch-Hansen
Date: Wed, 9 May 2018 22:22:12 +0200
Subject: [PATCH] Getting closer to functionality. We can now evaluate moves,
 and a rework of global_step has begun: episode_count is now used to
 calculate the learning-rate decay, which has been implemented as the
 exp_decay function.
---
 network.py                           | 73 ++++++++++++++++------------
 network_test.py                      | 16 ++++--
 tensorflow_impl_tests/eager_main.py  | 33 ++++++++-----
 tensorflow_impl_tests/normal_main.py | 11 +++--
 4 files changed, 82 insertions(+), 51 deletions(-)
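Note (not part of the commit): a quick standalone check of the staircase decay
schedule this patch introduces. The helper mirrors the new Network.exp_decay,
and the constants (0.1, 0.001, 0.96, 50000) are the ones do_backprop now uses;
the loop and its sample episode counts are only illustrative.

    # Staircase exponential decay, clamped at the minimum learning rate.
    def exp_decay(max_lr, episode_count, decay_rate, decay_steps):
        return max_lr * decay_rate ** (episode_count // decay_steps)

    for episodes in (0, 49999, 50000, 150000):
        lr = max(0.001, exp_decay(0.1, episodes, 0.96, 50000))
        print(episodes, lr)  # 0.1, 0.1, 0.096, ~0.0885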
from:".format(name=self.name), str(latest_checkpoint)) - tfe.Saver(model.variables).restore(latest_checkpoint) + tfe.Saver(self.model.variables).restore(latest_checkpoint) - variables_names = [v.name for v in self.model.variables] + # variables_names = [v.name for v in self.model.variables] # Restore trained episode count for model @@ -130,11 +134,11 @@ class Network: if os.path.isfile(episode_count_path): with open(episode_count_path, 'r') as f: self.config['start_episode'] = int(f.read()) - else: - latest_checkpoint = tf.train.latest_checkpoint("./") - print("[NETWK] ({name}) Restoring model from:".format(name=self.name), - str(latest_checkpoint)) - tfe.Saver(self.model.variables).restore(latest_checkpoint) + # else: + # latest_checkpoint = tf.train.latest_checkpoint("./") + # print("[NETWK] ({name}) Restoring model from:".format(name=self.name), + # str(latest_checkpoint)) + # tfe.Saver(self.model.variables).restore(latest_checkpoint) #variables_names = [v.name for v in self.model.variables] @@ -143,9 +147,9 @@ class Network: #if os.path.isfile(episode_count_path): # with open(episode_count_path, 'r') as f: # self.config['start_episode'] = int(f.read()) + tf.train.get_or_create_global_step() - - def make_move(self, sess, board, roll, player): + def make_move(self, board, roll, player): """ Find the best move given a board, roll and a player, by finding all possible states one can go to and then picking the best, by using the network to evaluate each state. The highest score is picked @@ -157,12 +161,19 @@ class Network: :param player: Current player :return: A pair of the best state to go to, together with the score of that state """ - legal_moves = Board.calculate_legal_states(board, player, roll) - moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves] - scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores] - best_score_index = np.array(scores).argmax() - best_move_pair = moves_and_scores[best_score_index] - return best_move_pair + legal_states = list(Board.calculate_legal_states(board, player, roll)) + legal_states = [list(tmp) for tmp in legal_states] + legal_states = np.array([Board.board_features_quack_fat(tmp, player)[0] for tmp in legal_states]) + legal_moves = [self.board_trans_func(board, player) for board in Board.calculate_legal_states(board, player, roll)] + + scores = self.model.predict_on_batch(legal_states) + transformed_scores = [x if np.sign(player) > 0 else 1 - x for x in scores] + + best_score_idx = np.argmax(np.array(transformed_scores)) + best_move = legal_moves[best_score_idx] + best_score = scores[best_score_idx] + self.episodes_trained += 1 + return [best_move, best_score] def make_move_n_ply(self, sess, board, roll, player, n = 1): best_pair = self.calc_n_ply(n, sess, board, player, roll) diff --git a/network_test.py b/network_test.py index 58fec8a..5fb6d6e 100644 --- a/network_test.py +++ b/network_test.py @@ -9,7 +9,7 @@ from board import Board import main config = main.config.copy() -config['model'] = "tesauro_blah" +config['model'] = "eager_testings" config['force_creation'] = True config['board_representation'] = 'quack-fat' network = Network(config, config['model']) @@ -75,10 +75,18 @@ def calculate_possible_states(board): #print(network.calculate_1_ply(session, Board.initial_state, [2,4], 1)) board = network.board_trans_func(Board.initial_state, 1) +#print(board) -input = [0, 2, 0, 0, 0, 0, -5, 0, -3, 0, 0, 0, 5, -5, 0, 0, 0, 3, 0, 5, 0, 0, 0, 0, -2, 0, 0, 0, 1, 0] -all_input 
diff --git a/network_test.py b/network_test.py
index 58fec8a..5fb6d6e 100644
--- a/network_test.py
+++ b/network_test.py
@@ -9,7 +9,7 @@ from board import Board
 import main
 
 config = main.config.copy()
-config['model'] = "tesauro_blah"
+config['model'] = "eager_testings"
 config['force_creation'] = True
 config['board_representation'] = 'quack-fat'
 network = Network(config, config['model'])
@@ -75,10 +75,18 @@ def calculate_possible_states(board):
 #print(network.calculate_1_ply(session, Board.initial_state, [2,4], 1))
 
 board = network.board_trans_func(Board.initial_state, 1)
+#print(board)
 
-input = [0, 2, 0, 0, 0, 0, -5, 0, -3, 0, 0, 0, 5, -5, 0, 0, 0, 3, 0, 5, 0, 0, 0, 0, -2, 0, 0, 0, 1, 0]
-all_input = np.array([input for _ in range(20)])
-print(network.calc_vals(all_input))
+pair = network.make_move(Board.initial_state, [3,2], 1)
+
+print(pair[1])
+
+network.do_backprop(board, 0.9)
+
+network.save_model(2, 342)
+
+# all_input = np.array([input for _ in range(20)])
+# print(network.calc_vals(all_input))
 
 #print(" "*10 + "network_test")
diff --git a/tensorflow_impl_tests/eager_main.py b/tensorflow_impl_tests/eager_main.py
index b2da143..f68f65f 100644
--- a/tensorflow_impl_tests/eager_main.py
+++ b/tensorflow_impl_tests/eager_main.py
@@ -1,6 +1,7 @@
 import time
 import numpy as np
 import tensorflow as tf
+from board import Board
 import tensorflow.contrib.eager as tfe
 
 
@@ -23,12 +24,14 @@ model = tf.keras.Sequential([
 
 #tfe.Saver(model.variables).restore(tf.train.latest_checkpoint("./"))
 
-input = [0, 2, 0, 0, 0, 0, -5, 0, -3, 0, 0, 0, 5, -5, 0, 0, 0, 3, 0, 5, 0, 0, 0, 0, -2, 0, 0, 0, 1, 0]
-
-all_input = np.array([input for _ in range(20)])
+input = [0, 2, 0, 0, 0, 0, -5, 0, -3, 0, 0, 0, 5, -5, 0, 0, 0, 3, 0, 5, 0, 0, 0, 0, -2, 0]
 
-single_in = np.array(input).reshape(1,-1)
+
+all_input = np.array([Board.board_features_quack_fat(input, 1) for _ in range(20)])
+
+
+single_in = Board.board_features_quack_fat(input, 1)
 
 
 start = time.time()
@@ -48,10 +51,10 @@ print(time.time() - start)
 print("-"*30)
 
 with tf.GradientTape() as tape:
-    val = model(np.array(input).reshape(1,-1))
+    val = model(single_in)
 grads = tape.gradient(val, model.variables)
 
-grads = [0.1*val-np.random.uniform(-1,1)+grad for grad, trainable_var in zip(grads, model.variables)]
+# grads = [0.1*val-np.random.uniform(-1,1)+grad for grad, trainable_var in zip(grads, model.variables)]
 
 # print(model.variables[0][0])
 weights_before = model.weights[0]
@@ -60,14 +63,20 @@ start = time.time()
 
 #[trainable_var.assign_add(0.1*val-0.3+grad) for grad, trainable_var in zip(grads, model.variables)]
 
 start = time.time()
-#for gradient, trainable_var in zip(grads, model.variables):
-#    backprop_calc = 0.1 * (val - np.random.uniform(-1, 1)) * gradient
-#    trainable_var.assign_add(backprop_calc)
+for gradient, trainable_var in zip(grads, model.variables):
+    backprop_calc = 0.1 * (0.9 - val) * gradient
+    trainable_var.assign_add(backprop_calc)
 
-opt.apply_gradients(zip(grads, model.variables))
+# opt.apply_gradients(zip(grads, model.variables))
 
 print(time.time() - start)
 
-print(model(np.array(input).reshape(1,-1)))
+print(model(single_in))
 
-tfe.Saver(model.variables).save("./tmp_ckpt")
+vals = model.predict_on_batch(all_input)
+vals = list(vals)
+vals[3] = 4
+print(vals)
+print(np.argmax(np.array(vals)))
+
+# tfe.Saver(model.variables).save("./tmp_ckpt")
diff --git a/tensorflow_impl_tests/normal_main.py b/tensorflow_impl_tests/normal_main.py
index 865f017..8e3887d 100644
--- a/tensorflow_impl_tests/normal_main.py
+++ b/tensorflow_impl_tests/normal_main.py
@@ -35,15 +35,16 @@ class Everything:
 
         trainable_vars = tf.trainable_variables()
         gradients = tf.gradients(self.value, trainable_vars)
 
+        difference_in_values = tf.reshape(tf.subtract(0.9, self.value, name='difference_in_values'), [])
 
         with tf.variable_scope('apply_gradients'):
             for gradient, trainable_var in zip(gradients, trainable_vars):
-                backprop_calc = self.learning_rate * difference_in_values * gradient
+                backprop_calc = 0.1 * difference_in_values * gradient
                 grad_apply = trainable_var.assign_add(backprop_calc)
                 apply_gradients.append(grad_apply)
 
-        with tf.control_dependencies([global_step_op]):
-            self.training_op = tf.group(*apply_gradients, name='training_op')
-
+
+        self.training_op = tf.group(*apply_gradients, name='training_op')
 
@@ -56,7 +57,9 @@ class Everything:
         val = sess.run(self.value, feed_dict={self.input: input.reshape(1,-1)})
         print(time.time() - start)
         print(val)
-
+        sess.run(self.training_op, feed_dict={self.input: input.reshape(1,-1)})
+        val = sess.run(self.value, feed_dict={self.input: input.reshape(1, -1)})
+        print(val)
 
 everything = Everything()
 everything.eval()
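Note (not part of the commit): do_backprop, the eager test, and the graph-mode
test above all apply the same manual TD-style update, variable +=
learning_rate * (target - value) * gradient, rather than a stock optimizer. A
tiny NumPy sketch of that rule on a linear value function (all names and
numbers below are illustrative only):

    import numpy as np

    w = np.array([0.5, -0.2])           # toy weights of a linear value function
    x = np.array([1.0, 2.0])            # toy state features
    value = float(w @ x)                # current estimate V(s) = 0.1
    target = 0.9                        # value of the next state, V(s')
    grad = x                            # dV/dw for a linear model
    w += 0.1 * (target - value) * grad  # nudge V(s) toward V(s')
    print(w)                            # [0.58, -0.04]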