Fixed potential bug in regards to scores in 2-ply calculation.

2018-04-25 00:51:04 +02:00 · 2018-04-25 00:51:04 +02:00 · 8899c5c2d9
commit 8899c5c2d9
parent ea3f05846d
1 changed files with 30 additions and 7 deletions
--- a/network.py
+++ b/network.py
@ -113,6 +113,14 @@ class Network:
            f.write(str(episode_count) + "\n")

    def restore_model(self, sess):
+        """
+        Restore a model for a session, such that a trained model and either be further trained or
+        used for evaluation
+
+        :param sess: Current session
+        :return: Nothing. It's a side-effect that a model gets restored for the network.
+        """
+
        if glob.glob(os.path.join(self.checkpoint_path, 'model.ckpt*.index')):
        
            latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_path)
@ -150,6 +158,17 @@ class Network:


    def make_move(self, sess, board, roll, player):
+        """
+        Find the best move given a board, roll and a player, by finding all possible states one can go to
+        and then picking the best, by using the network to evaluate each state. The highest score is picked
+        for the 1-player and the max(1-score) is picked for the -1-player.
+
+        :param sess:
+        :param board: Current board
+        :param roll:  Current roll
+        :param player: Current player
+        :return: A pair of the best state to go to, together with the score of that state
+        """
        legal_moves = Board.calculate_legal_states(board, player, roll)
        moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
        scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
@ -189,6 +208,10 @@ class Network:

        # pythons reverse is in place and I can't call [:15] on it, without applying it to an object like so. Fuck.
        best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1))
+
+        # They're sorted from smallest to largest, therefore we wan't to reverse if the current player is 1, since
+        # player 1 wishes to maximize. It's not needed for player -1, since that player seeks to minimize.
+        if player == 1:
            best_fifteen.reverse()
        best_fifteen_boards = [x[0] for x in best_fifteen[:15]]

@ -198,12 +221,13 @@ class Network:
        for a_board in best_fifteen_boards:
            a_board_scores = []
            for roll in all_rolls:
-                spec_roll_scores = []
                all_rolls_boards = Board.calculate_legal_states(a_board, player*-1, roll)

-                spec_roll_scores.append(
-                    [self.eval_state(sess, self.board_trans_func(new_board, player*-1)) for new_board in all_rolls_boards]
-                )
+                spec_roll_scores = [self.eval_state(sess, self.board_trans_func(new_board, player*-1))
+                                    for new_board in all_rolls_boards]
+
+                # We need 1-score for the -1 player
+                spec_roll_scores = [x if player == 1 else (1-x) for x in spec_roll_scores]

                best_score = max(spec_roll_scores)

@ -372,8 +396,7 @@ class Network:
                                                                (random.randrange(1, 7), random.randrange(1, 7)),
                                                                player)

-                    # print("The evaluation of the previous state:\n", self.eval_state(sess, self.board_trans_func(prev_board, player)))
-                    # print("The evaluation of the current_state:\n", cur_board_value)
+

                    # adjust weights
                    sess.run(self.training_op,