From 1f8485f54eacc5a9f2d81e7eabc5b90a140a3795 Mon Sep 17 00:00:00 2001
From: Alexander Munch-Hansen
Date: Sun, 6 May 2018 20:41:07 +0200
Subject: [PATCH] No longer use n_ply for move selection; it's far too slow

Added extra logging: the average difference in values between trainings is
now logged. Also fixed a bug with the length of the quack-norm feature
vector, and added the CLI argument --use-baseline; if set, the baseline
model will be used.
---
 board.py   | 54 ++++++++++++++++++++++++++++--------------------------
 main.py    | 31 +++++++++++++++++++++----------
 network.py | 42 +++++++++++++++++------------------------
 3 files changed, 66 insertions(+), 61 deletions(-)

diff --git a/board.py b/board.py
index 4bd586a..d32197c 100644
--- a/board.py
+++ b/board.py
@@ -62,7 +62,9 @@ class Board:
         negatives = [x if x < 0 else 0 for x in board]
         board[0] = board[0] / 2
         board[25] = board[25] / 2
-        board = [board[x] / 15 for x in range(1,25)]
+
+        board = [board[x] if x == 0 or x == 25 else board[x] / 15 for x in range(0, 26)]
+
         board.append(15 - sum(positives))
         board.append(-15 - sum(negatives))
         board += ([1, 0] if np.sign(player) > 0 else [0, 1])
@@ -100,31 +102,31 @@ class Board:

         return np.array(board_rep).reshape(1,198)

-    # @staticmethod
-    # def board_features_tesauro(board, cur_player):
-    #     features = []
-    #     for player in [-1,1]:
-    #         sum = 0.0
-    #         for board_range in range(1,25):
-    #             pin = board[board_range]
-    #             #print("PIIIN:",pin)
-    #             feature = [0.0]*4
-    #             if np.sign(pin) == np.sign(player):
-    #                 sum += abs(pin)
-    #                 for i in range(min(abs(pin), 3)):
-    #                     feature[i] = 1
-    #                 if (abs(pin) > 3):
-    #                     feature[3] = (abs(pin)-3)/2
-    #             features += feature
-    #         #print("SUUUM:",sum)
-    #         # Append the amount of men on the bar of the current player divided by 2
-    #         features.append((board[0] if np.sign(player) < 0 else board[25]) / 2.0)
-    #         # Calculate how many pieces there must be in the home state and divide it by 15
-    #         features.append((15 - sum) / 15)
-    #     features += ([1,0] if np.sign(cur_player) > 0 else [0,1])
-    #     test = np.array(features).reshape(1,-1)
-    #     #print("TEST:",test)
-    #     return test
+    @staticmethod
+    def board_features_tesauro_wrong(board, cur_player):
+        features = []
+        for player in [-1,1]:
+            sum = 0.0
+            for board_range in range(1,25):
+                pin = board[board_range]
+                #print("PIIIN:",pin)
+                feature = [0.0]*4
+                if np.sign(pin) == np.sign(player):
+                    sum += abs(pin)
+                    for i in range(min(abs(pin), 3)):
+                        feature[i] = 1
+                    if (abs(pin) > 3):
+                        feature[3] = (abs(pin)-3)/2
+                features += feature
+            #print("SUUUM:",sum)
+            # Append the amount of men on the bar of the current player divided by 2
+            features.append((board[0] if np.sign(player) < 0 else board[25]) / 2.0)
+            # Calculate how many pieces there must be in the home state and divide it by 15
+            features.append((15 - sum) / 15)
+        features += ([1,0] if np.sign(cur_player) > 0 else [0,1])
+        test = np.array(features).reshape(1,-1)
+        #print("TEST:",test)
+        return test
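For reference, the corrected comprehension changes the length of the quack-fat feature vector from 28 to 30 entries: 26 normalised board positions, two off-board piece counts, and a two-element player indicator. Below is a minimal standalone sketch of the fixed normalisation; the function name and the test value are illustrative assumptions, not part of the patch.

    import numpy as np

    def quack_fat_features(board, player):
        # Illustrative restatement of the fixed normalisation in the hunk above.
        # board: raw 26-entry board; index 0 and 25 are the bars, 1-24 the points.
        board = list(board)
        positives = [x if x > 0 else 0 for x in board]
        negatives = [x if x < 0 else 0 for x in board]
        board[0] = board[0] / 2
        board[25] = board[25] / 2
        # Normalise all 26 entries, leaving the two (already halved) bar entries alone.
        board = [board[x] if x == 0 or x == 25 else board[x] / 15 for x in range(0, 26)]
        board.append(15 - sum(positives))    # player 1 pieces no longer on the board
        board.append(-15 - sum(negatives))   # same count for player -1, kept negative
        board += ([1, 0] if np.sign(player) > 0 else [0, 1])
        return np.array(board).reshape(1, -1)

    assert quack_fat_features([0] * 26, 1).shape == (1, 30)

The old comprehension over range(1, 25) dropped the two bar entries and produced the 28-entry vector that the baseline model (see --use-baseline below) still expects.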
diff --git a/main.py b/main.py
index bcf891d..8916d3f 100644
--- a/main.py
+++ b/main.py
@@ -33,6 +33,8 @@ parser.add_argument('--list-models', action='store_true',
                     help='list all known models')
 parser.add_argument('--force-creation', action='store_true',
                     help='force model creation if model does not exist')
+parser.add_argument('--use-baseline', action='store_true',
+                    help='use the baseline model; note that it has size 28')

 args = parser.parse_args()
@@ -53,8 +55,9 @@ config = {
     'train_perpetually': args.train_perpetually,
     'model_storage_path': 'models',
     'bench_storage_path': 'bench',
-    'board_representation': 'quack',
-    'force_creation': args.force_creation
+    'board_representation': 'quack-fat',
+    'force_creation': args.force_creation,
+    'use_baseline': args.use_baseline
 }

 # Create models folder
@@ -72,18 +75,26 @@ if not os.path.isdir(log_path):


 # Define helper functions
-def log_train_outcome(outcome, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "train.log")):
+def log_train_outcome(outcome, diff_in_values, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "train.log")):
     format_vars = { 'trained_eps': trained_eps,
-                    'count': len(train_outcome),
-                    'sum': sum(train_outcome),
-                    'mean': sum(train_outcome) / len(train_outcome),
-                    'time': int(time.time())
+                    'count': len(outcome),
+                    'sum': sum(outcome),
+                    'mean': sum(outcome) / len(outcome),
+                    'time': int(time.time()),
+                    'average_diff_in_vals': diff_in_values/len(outcome)
     }
     with open(log_path, 'a+') as f:
-        f.write("{time};{trained_eps};{count};{sum};{mean}".format(**format_vars) + "\n")
+        f.write("{time};{trained_eps};{count};{sum};{mean};{average_diff_in_vals}".format(**format_vars) + "\n")

 def log_eval_outcomes(outcomes, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "eval.log")):
+    """
+    Log the given evaluation outcomes to the evaluation log file.
+    :param outcomes: List of evaluation outcomes to log.
+    :param trained_eps: Number of episodes the model has been trained for.
+    :param log_path: Path of the log file to append to.
+    :return: None
+    """
     for outcome in outcomes:
         scores = outcome[1]
         format_vars = { 'trained_eps': trained_eps,
@@ -137,9 +148,9 @@ if __name__ == "__main__":
         network = Network(config, config['model'])
         start_episode = network.episodes_trained
         while True:
-            train_outcome = network.train_model(episodes = episode_count, trained_eps = start_episode)
+            train_outcome, diff_in_values = network.train_model(episodes = episode_count, trained_eps = start_episode)
             start_episode += episode_count
-            log_train_outcome(train_outcome, trained_eps = start_episode)
+            log_train_outcome(train_outcome, diff_in_values, trained_eps = start_episode)
             if config['eval_after_train']:
                 eval_outcomes = network.eval(trained_eps = start_episode)
                 log_eval_outcomes(eval_outcomes, trained_eps = start_episode)
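With the change above, each line appended to train.log now carries six semicolon-separated fields: time, trained_eps, count, sum, mean and average_diff_in_vals. Below is a small sketch of how such a line could be read back; the helper name and dictionary keys are illustrative assumptions, not part of the patch.

    def read_train_log(path):
        # Parse lines in the format written by log_train_outcome above.
        entries = []
        with open(path) as f:
            for line in f:
                time, trained_eps, count, total, mean, avg_diff = line.strip().split(';')
                entries.append({
                    'time': int(time),
                    'trained_eps': int(trained_eps),
                    'count': int(count),
                    'sum': float(total),
                    'mean': float(mean),
                    'average_diff_in_vals': float(avg_diff),
                })
        return entries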
diff --git a/network.py b/network.py
index f4be4c0..f30e724 100644
--- a/network.py
+++ b/network.py
@@ -139,7 +139,7 @@ class Network:
         if os.path.isfile(episode_count_path):
             with open(episode_count_path, 'r') as f:
                 self.config['start_episode'] = int(f.read())
-        elif glob.glob(os.path.join(os.path.join(self.config['model_storage_path'], "baseline_model"), 'model.ckpt*.index')):
+        elif self.config['use_baseline'] and glob.glob(os.path.join(os.path.join(self.config['model_storage_path'], "baseline_model"), 'model.ckpt*.index')):
             checkpoint_path = os.path.join(self.config['model_storage_path'], "baseline_model")
             latest_checkpoint = tf.train.latest_checkpoint(checkpoint_path)
             print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
@@ -157,7 +157,7 @@
             exit()


-    #def make_move(self, sess, board, roll, player):
+    def make_move(self, sess, board, roll, player):
         """
         Find the best move given a board, roll and a player, by finding all possible states one can go to
         and then picking the best, by using the network to evaluate each state. The highest score is picked
@@ -169,14 +169,14 @@
         :param player: Current player
         :return: A pair of the best state to go to, together with the score of that state
         """
-        # legal_moves = Board.calculate_legal_states(board, player, roll)
-        # moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
-        # scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
-        # best_score_index = np.array(scores).argmax()
-        # best_move_pair = moves_and_scores[best_score_index]
-        # return best_move_pair
+        legal_moves = Board.calculate_legal_states(board, player, roll)
+        moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
+        scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
+        best_score_index = np.array(scores).argmax()
+        best_move_pair = moves_and_scores[best_score_index]
+        return best_move_pair

-    def make_move(self, sess, board, roll, player, n = 1):
+    def make_move_n_ply(self, sess, board, roll, player, n = 1):
         best_pair = self.calc_n_ply(n, sess, board, player, roll)
         return best_pair

@@ -201,13 +201,7 @@
         zero_ply_moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in init_legal_states]

         # pythons reverse is in place and I can't call [:15] on it, without applying it to an object like so. Fuck.
-        best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1))
-
-
-        # They're sorted from smallest to largest, therefore we wan't to reverse if the current player is 1, since
-        # player 1 wishes to maximize. It's not needed for player -1, since that player seeks to minimize.
-        if player == 1:
-            best_fifteen.reverse()
+        best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1), reverse=player==1)

         best_fifteen_boards = [x[0] for x in best_fifteen[:10]]

@@ -228,14 +222,9 @@
         zero_ply_moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in init_legal_states]

         # pythons reverse is in place and I can't call [:15] on it, without applying it to an object like so. Fuck.
-        sorted_moves_and_scores = sorted(zero_ply_moves_and_scores, key=itemgetter(1))
+        sorted_moves_and_scores = sorted(zero_ply_moves_and_scores, key=itemgetter(1), reverse=player==1)

-        # They're sorted from smallest to largest, therefore we wan't to reverse if the current player is 1, since
-        # player 1 wishes to maximize. It's not needed for player -1, since that player seeks to minimize.
-        if player == 1:
-            sorted_moves_and_scores.reverse()
-
         best_boards = [x[0] for x in sorted_moves_and_scores[:10]]

         best_move_score_pair = self.n_ply(n_init, sess, best_boards, player)
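Both hunks above replace the manual sort-then-reverse with a single sorted(..., reverse=player==1): scores sort ascending by default, so reversing for player 1 puts the highest-valued states first (player 1 maximises), while player -1 keeps the ascending order and takes the lowest-valued states (player -1 minimises). A toy illustration of that selection rule, with made-up moves and scores (not part of the patch):

    from operator import itemgetter

    # Hypothetical (move, score) pairs as produced by evaluating legal states.
    moves_and_scores = [('a', 0.2), ('b', 0.9), ('c', 0.5)]

    for player in (1, -1):
        ranked = sorted(moves_and_scores, key=itemgetter(1), reverse=player == 1)
        print(player, ranked[0][0])   # 1 -> b (highest score), -1 -> a (lowest score)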
@@ -365,7 +354,7 @@ class Network:
         all_rolls = gen_21_rolls()

         all_rolls_scores = []
-
+        count = 0
         # loop over boards
         for a_board in boards:
             a_board_scores = []
@@ -375,7 +364,7 @@ class Network:

                 # find all states we can get to, given the board and roll and the opposite player
                 all_rolls_boards = Board.calculate_legal_states(a_board, player*-1, roll)
-
+                count += len(all_rolls_boards)
                 # find scores for each board found above
                 spec_roll_scores = [self.eval_state(sess, self.board_trans_func(new_board, player*-1))
                                     for new_board in all_rolls_boards]
@@ -393,6 +382,7 @@ class Network:
             all_rolls_scores.append(sum(a_board_scores)/len(a_board_scores))

         # return all the average scores
+        print(count)
         return all_rolls_scores


@@ -508,6 +498,7 @@ class Network:

     def train_model(self, episodes=1000, save_step_size=100, trained_eps=0):
         with tf.Session() as sess:
+            difference_in_vals = 0
             writer = tf.summary.FileWriter("/tmp/log/tf", sess.graph)
             sess.run(tf.global_variables_initializer())
@@ -552,6 +543,7 @@ class Network:
                                                           (random.randrange(1, 7), random.randrange(1, 7)),
                                                           player)

+                    difference_in_vals += abs((cur_board_value - self.eval_state(sess, self.board_trans_func(prev_board, player))))

                     # adjust weights
@@ -590,6 +582,6 @@ class Network:

             writer.close()

-            return outcomes
+            return outcomes, difference_in_vals
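The difference_in_vals accumulator introduced in train_model sums, over every move of every episode, the absolute difference between the network's value of the current board and its value of the previous board; log_train_outcome then divides that sum by the number of episodes to obtain average_diff_in_vals. A toy sketch of the quantity being logged (the function name and example values are illustrative, not from the patch):

    def average_value_difference(state_values, episodes):
        # Sum |V(s_t) - V(s_{t-1})| over consecutive evaluated states, then
        # average per episode; this is the number written to train.log.
        total = sum(abs(cur - prev) for prev, cur in zip(state_values, state_values[1:]))
        return total / episodes

    print(round(average_value_difference([0.5, 0.55, 0.48, 0.6], episodes=1), 2))  # 0.24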