No longer use n_ply; it is far too slow.
Added extra logging: the average difference in values between trainings is now logged. Also fixed a bug with the length of quack-norm, and added the CLI argument --use-baseline; when set, the baseline model is used.
This commit is contained in:
parent 1db469709a
commit 1f8485f54e
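For context on the new metric mentioned in the commit message: during training the commit accumulates the absolute change in the network's value estimate for each position and, when logging, divides that total by the number of episodes. A minimal sketch of that computation, using a hypothetical list of (previous value, current value) pairs instead of the repository's eval_state calls:

# Minimal sketch of the logged metric (hypothetical inputs, not the commit's literal code).
def average_value_difference(value_pairs, episodes):
    # value_pairs: (previous value estimate, current value estimate) per training step
    total_diff = sum(abs(cur - prev) for prev, cur in value_pairs)
    # The commit divides the accumulated difference by the number of episodes.
    return total_diff / episodes

# Example: three steps over two episodes.
print(average_value_difference([(0.52, 0.55), (0.55, 0.48), (0.30, 0.31)], episodes=2))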
board.py (54 changed lines)
@@ -62,7 +62,9 @@ class Board:
        negatives = [x if x < 0 else 0 for x in board]
        board[0] = board[0] / 2
        board[25] = board[25] / 2
        board = [board[x] / 15 for x in range(1,25)]
        board = [board[x] if x == 0 or 25 else board[x] / 15 for x in range(0, 26)]

        board.append(15 - sum(positives))
        board.append(-15 - sum(negatives))
        board += ([1, 0] if np.sign(player) > 0 else [0, 1])
@@ -100,31 +102,31 @@ class Board:
        return np.array(board_rep).reshape(1,198)

    # @staticmethod
    # def board_features_tesauro(board, cur_player):
    #     features = []
    #     for player in [-1,1]:
    #         sum = 0.0
    #         for board_range in range(1,25):
    #             pin = board[board_range]
    #             #print("PIIIN:",pin)
    #             feature = [0.0]*4
    #             if np.sign(pin) == np.sign(player):
    #                 sum += abs(pin)
    #                 for i in range(min(abs(pin), 3)):
    #                     feature[i] = 1
    #                 if (abs(pin) > 3):
    #                     feature[3] = (abs(pin)-3)/2
    #             features += feature
    #         #print("SUUUM:",sum)
    #         # Append the amount of men on the bar of the current player divided by 2
    #         features.append((board[0] if np.sign(player) < 0 else board[25]) / 2.0)
    #         # Calculate how many pieces there must be in the home state and divide it by 15
    #         features.append((15 - sum) / 15)
    #     features += ([1,0] if np.sign(cur_player) > 0 else [0,1])
    #     test = np.array(features).reshape(1,-1)
    #     #print("TEST:",test)
    #     return test

    @staticmethod
    def board_features_tesauro_wrong(board, cur_player):
        features = []
        for player in [-1,1]:
            sum = 0.0
            for board_range in range(1,25):
                pin = board[board_range]
                #print("PIIIN:",pin)
                feature = [0.0]*4
                if np.sign(pin) == np.sign(player):
                    sum += abs(pin)
                    for i in range(min(abs(pin), 3)):
                        feature[i] = 1
                    if (abs(pin) > 3):
                        feature[3] = (abs(pin)-3)/2
                features += feature
            #print("SUUUM:",sum)
            # Append the amount of men on the bar of the current player divided by 2
            features.append((board[0] if np.sign(player) < 0 else board[25]) / 2.0)
            # Calculate how many pieces there must be in the home state and divide it by 15
            features.append((15 - sum) / 15)
        features += ([1,0] if np.sign(cur_player) > 0 else [0,1])
        test = np.array(features).reshape(1,-1)
        #print("TEST:",test)
        return test
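Worth noting about the quack-norm change in the first board.py hunk above: in Python the condition `x == 0 or 25` is always truthy (it parses as `(x == 0) or 25`), so the new comprehension never actually divides by 15. A sketch of what the normalization presumably intends, with the bar points at indices 0 and 25 halved and the 24 regular points scaled by 15 (an assumption based on the surrounding lines):

def normalize_quack_board(board):
    # Presumed intent: halve the bar points (indices 0 and 25), scale the rest by 15.
    return [board[x] / 2 if x in (0, 25) else board[x] / 15 for x in range(26)]

# Example: a 26-slot board with two checkers on the bar and five on point 6.
example = [2] + [0] * 25
example[6] = 5
print(normalize_quack_board(example))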
main.py (31 changed lines)
@@ -33,6 +33,8 @@ parser.add_argument('--list-models', action='store_true',
                    help='list all known models')
parser.add_argument('--force-creation', action='store_true',
                    help='force model creation if model does not exist')
parser.add_argument('--use-baseline', action='store_true',
                    help='use the baseline model, note, has size 28')

args = parser.parse_args()
@@ -53,8 +55,9 @@ config = {
    'train_perpetually': args.train_perpetually,
    'model_storage_path': 'models',
    'bench_storage_path': 'bench',
    'board_representation': 'quack',
    'force_creation': args.force_creation
    'board_representation': 'quack-fat',
    'force_creation': args.force_creation,
    'use_baseline': args.use_baseline
}

# Create models folder
@@ -72,18 +75,26 @@ if not os.path.isdir(log_path):

# Define helper functions
def log_train_outcome(outcome, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "train.log")):
def log_train_outcome(outcome, diff_in_values, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "train.log")):
    format_vars = { 'trained_eps': trained_eps,
                    'count': len(train_outcome),
                    'sum': sum(train_outcome),
                    'mean': sum(train_outcome) / len(train_outcome),
                    'time': int(time.time())
                    'count': len(outcome),
                    'sum': sum(outcome),
                    'mean': sum(outcome) / len(outcome),
                    'time': int(time.time()),
                    'average_diff_in_vals': diff_in_values/len(outcome)
                  }
    with open(log_path, 'a+') as f:
        f.write("{time};{trained_eps};{count};{sum};{mean}".format(**format_vars) + "\n")
        f.write("{time};{trained_eps};{count};{sum};{mean};{average_diff_in_vals}".format(**format_vars) + "\n")


def log_eval_outcomes(outcomes, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "eval.log")):
    """
    :param outcomes:
    :param average_diff_in_value:
    :param trained_eps:
    :param log_path:
    :return:
    """
    for outcome in outcomes:
        scores = outcome[1]
        format_vars = { 'trained_eps': trained_eps,
@@ -137,9 +148,9 @@ if __name__ == "__main__":
        network = Network(config, config['model'])
        start_episode = network.episodes_trained
        while True:
            train_outcome = network.train_model(episodes = episode_count, trained_eps = start_episode)
            train_outcome, diff_in_values = network.train_model(episodes = episode_count, trained_eps = start_episode)
            start_episode += episode_count
            log_train_outcome(train_outcome, trained_eps = start_episode)
            log_train_outcome(train_outcome, diff_in_values, trained_eps = start_episode)
            if config['eval_after_train']:
                eval_outcomes = network.eval(trained_eps = start_episode)
                log_eval_outcomes(eval_outcomes, trained_eps = start_episode)
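For reference, a train.log line written after this change carries six semicolon-separated fields in the order shown in the format string above. A small sketch of a reader for that format (the helper parse_train_log_line is hypothetical and not part of the repository):

def parse_train_log_line(line):
    # Field order taken from the new format string:
    # time;trained_eps;count;sum;mean;average_diff_in_vals
    time_s, trained_eps, count, total, mean, avg_diff = line.strip().split(";")
    return {
        "time": int(time_s),
        "trained_eps": int(trained_eps),
        "count": int(count),
        "sum": float(total),
        "mean": float(mean),
        "average_diff_in_vals": float(avg_diff),
    }

# Example line in the new format.
print(parse_train_log_line("1521883200;1000;100;37;0.37;0.0123"))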
network.py (42 changed lines)
@@ -139,7 +139,7 @@ class Network:
        if os.path.isfile(episode_count_path):
            with open(episode_count_path, 'r') as f:
                self.config['start_episode'] = int(f.read())
        elif glob.glob(os.path.join(os.path.join(self.config['model_storage_path'], "baseline_model"), 'model.ckpt*.index')):
        elif self.config['use_baseline'] and glob.glob(os.path.join(os.path.join(self.config['model_storage_path'], "baseline_model"), 'model.ckpt*.index')):
            checkpoint_path = os.path.join(self.config['model_storage_path'], "baseline_model")
            latest_checkpoint = tf.train.latest_checkpoint(checkpoint_path)
            print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
@@ -157,7 +157,7 @@ class Network:
            exit()

    #def make_move(self, sess, board, roll, player):
    def make_move(self, sess, board, roll, player):
        """
        Find the best move given a board, roll and a player, by finding all possible states one can go to
        and then picking the best, by using the network to evaluate each state. The highest score is picked
@@ -169,14 +169,14 @@ class Network:
        :param player: Current player
        :return: A pair of the best state to go to, together with the score of that state
        """
        # legal_moves = Board.calculate_legal_states(board, player, roll)
        # moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
        # scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
        # best_score_index = np.array(scores).argmax()
        # best_move_pair = moves_and_scores[best_score_index]
        # return best_move_pair
        legal_moves = Board.calculate_legal_states(board, player, roll)
        moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
        scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
        best_score_index = np.array(scores).argmax()
        best_move_pair = moves_and_scores[best_score_index]
        return best_move_pair

    def make_move(self, sess, board, roll, player, n = 1):
    def make_move_n_ply(self, sess, board, roll, player, n = 1):
        best_pair = self.calc_n_ply(n, sess, board, player, roll)
        return best_pair
@@ -201,13 +201,7 @@ class Network:
        zero_ply_moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in init_legal_states]

        # pythons reverse is in place and I can't call [:15] on it, without applying it to an object like so. Fuck.
        best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1))

        # They're sorted from smallest to largest, therefore we wan't to reverse if the current player is 1, since
        # player 1 wishes to maximize. It's not needed for player -1, since that player seeks to minimize.
        if player == 1:
            best_fifteen.reverse()
        best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1), reverse=player==1)

        best_fifteen_boards = [x[0] for x in best_fifteen[:10]]
@@ -228,14 +222,9 @@ class Network:
        zero_ply_moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in init_legal_states]

        # pythons reverse is in place and I can't call [:15] on it, without applying it to an object like so. Fuck.
        sorted_moves_and_scores = sorted(zero_ply_moves_and_scores, key=itemgetter(1))
        sorted_moves_and_scores = sorted(zero_ply_moves_and_scores, key=itemgetter(1), reverse=player==1)

        # They're sorted from smallest to largest, therefore we wan't to reverse if the current player is 1, since
        # player 1 wishes to maximize. It's not needed for player -1, since that player seeks to minimize.
        if player == 1:
            sorted_moves_and_scores.reverse()

        best_boards = [x[0] for x in sorted_moves_and_scores[:10]]

        best_move_score_pair = self.n_ply(n_init, sess, best_boards, player)
@@ -365,7 +354,7 @@ class Network:
        all_rolls = gen_21_rolls()

        all_rolls_scores = []

        count = 0
        # loop over boards
        for a_board in boards:
            a_board_scores = []
@@ -375,7 +364,7 @@ class Network:

                # find all states we can get to, given the board and roll and the opposite player
                all_rolls_boards = Board.calculate_legal_states(a_board, player*-1, roll)

                count += len(all_rolls_boards)
                # find scores for each board found above
                spec_roll_scores = [self.eval_state(sess, self.board_trans_func(new_board, player*-1))
                                    for new_board in all_rolls_boards]
@@ -393,6 +382,7 @@ class Network:
            all_rolls_scores.append(sum(a_board_scores)/len(a_board_scores))

        # return all the average scores
        print(count)
        return all_rolls_scores
@@ -508,6 +498,7 @@ class Network:

    def train_model(self, episodes=1000, save_step_size=100, trained_eps=0):
        with tf.Session() as sess:
            difference_in_vals = 0
            writer = tf.summary.FileWriter("/tmp/log/tf", sess.graph)

            sess.run(tf.global_variables_initializer())
@@ -552,6 +543,7 @@ class Network:
                                                  (random.randrange(1, 7), random.randrange(1, 7)),
                                                  player)

                difference_in_vals += abs((cur_board_value - self.eval_state(sess, self.board_trans_func(prev_board, player))))


                # adjust weights
@@ -590,6 +582,6 @@ class Network:

            writer.close()

            return outcomes
            return outcomes, difference_in_vals
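One of the cleanups above replaces the sort-then-reverse pattern with a single sorted(..., reverse=player == 1) call. A short sketch with made-up (move, score) pairs showing the two forms agree when scores are distinct:

from operator import itemgetter

# Made-up (move, score) pairs; the real code scores boards with the network.
moves_and_scores = [("a", 0.3), ("b", 0.9), ("c", 0.6)]
player = 1

# Old form: ascending sort, reversed in place when player 1 (the maximizer) moves.
old = sorted(moves_and_scores, key=itemgetter(1))
if player == 1:
    old.reverse()

# New form from this commit: the sort direction is chosen directly.
new = sorted(moves_and_scores, key=itemgetter(1), reverse=player == 1)

assert old == new
print(new)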