tesauro fat and diffs in values

commit d426c1c3b5 (parent c31bc39780)
board.py | 60 lines changed

@@ -51,7 +51,6 @@ class Board:
         # board += ([1, 0] if np.sign(player) > 0 else [0, 1])
         # return np.array(board).reshape(1,30)
 
-
     # quack-fatter
     @staticmethod
     def board_features_quack_norm(board, player):
@@ -66,7 +65,7 @@ class Board:
         board.append(15 - sum(positives))
         board.append(-15 - sum(negatives))
         board += ([1, 0] if np.sign(player) > 0 else [0, 1])
-        return np.array(board).reshape(1,30)
+        return np.array(board).reshape(1, 30)
 
     # tesauro
     @staticmethod
@@ -95,9 +94,62 @@ class Board:
             board_rep += bar_trans(board, player)
             board_rep += (15 - Board.num_of_checkers_for_player(board, player),)
 
-        board_rep += ([1,0] if cur_player == 1 else [1,0])
+        board_rep += ([1, 0] if cur_player == 1 else [0, 1])
 
-        return np.array(board_rep).reshape(1,198)
+        return np.array(board_rep).reshape(1, 198)
+
+
+    @staticmethod
+    def board_features_tesauro_fat(board, cur_player):
+        def ordinary_trans(val, player):
+            abs_val = val*player
+            if abs_val <= 0:
+                return (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 1:
+                return (1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 2:
+                return (1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 3:
+                return (1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 4:
+                return (1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 5:
+                return (1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 6:
+                return (1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 7:
+                return (1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 8:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 9:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 10:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0)
+            elif abs_val == 11:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0)
+            elif abs_val == 12:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0)
+            elif abs_val == 13:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0)
+            elif abs_val == 14:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)
+            elif abs_val == 15:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
+
+        def bar_trans(board, player):
+            if player == 1: return (abs(board[0]/2),)
+            elif player == -1: return (abs(board[25]/2),)
+
+        board_rep = []
+        for player in [1, -1]:
+            for x in board[1:25]:
+                board_rep += ordinary_trans(x, player)
+            board_rep += bar_trans(board, player)
+            board_rep += (15 - Board.num_of_checkers_for_player(board, player),)
+
+        board_rep += ([1, 0] if cur_player == 1 else [0, 1])
+
+        return np.array(board_rep).reshape(1, len(board_rep))
+
+
     @staticmethod
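Note on the hunk above: the 16-branch ordinary_trans ladder is a hand-unrolled unary ("thermometer") encoding. A minimal equivalent sketch (not part of this commit, name is illustrative):

    def ordinary_trans_compact(val, player):
        # First n of 15 slots are 1, where n = clamp(val * player, 0, 15);
        # matches the ladder above branch for branch.
        abs_val = max(0, min(15, val * player))
        return tuple(1 if i < abs_val else 0 for i in range(15))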
main.py | 2 lines changed

@@ -84,7 +84,7 @@ def log_train_outcome(outcome, diff_in_values, trained_eps = 0, log_path = os.pa
             'sum': sum(outcome),
             'mean': sum(outcome) / len(outcome),
             'time': int(time.time()),
-            'average_diff_in_vals': diff_in_values/len(outcome)
+            'average_diff_in_vals': diff_in_values
     }
 
     with open(log_path, 'a+') as f:
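Note: the division by len(outcome) was not simply dropped; train_model() in network.py (below) now returns average_diffs / len(outcomes), so diff_in_values arrives at the logger already averaged.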
network.py | 50 lines changed

@@ -21,10 +21,10 @@ class Network:
            'quack' : (28, Board.board_features_quack),
            'tesauro' : (198, Board.board_features_tesauro),
            'quack-norm' : (30, Board.board_features_quack_norm),
+           'tesauro-fat' : (726, Board.board_features_tesauro_fat),
            'tesauro-poop': (198, Board.board_features_tesauro_wrong)
        }
 
 
     def custom_tanh(self, x, name=None):
         return tf.scalar_mul(tf.constant(2.00), tf.tanh(x, name))
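The 726 input width matches the new encoding in board.py: per player, 24 points times 15 unary units (360) plus one bar feature and one borne-off feature (362); doubled for the two players (724), plus the two current-player units: 726.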
@@ -39,6 +39,11 @@ class Network:
             '0': self.make_move_0_ply
         }
 
+        self.max_or_min = {
+            1: np.argmax,
+            -1: np.argmin
+        }
+
         tf.enable_eager_execution()
 
         xavier_init = tf.contrib.layers.xavier_initializer()
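The max_or_min table added above lets move selection pick an index directly per player: player 1 maximizes the predicted value, player -1 minimizes it. A small standalone sketch with toy scores:

    import numpy as np

    max_or_min = {1: np.argmax, -1: np.argmin}
    scores = np.array([0.31, 0.74, 0.52])  # toy predicted values
    assert max_or_min[1](scores) == 1      # player  1 takes the highest
    assert max_or_min[-1](scores) == 0     # player -1 takes the lowest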
@@ -144,8 +149,9 @@ class Network:
         :param episode_count:
         :return:
         """
 
         tfe.Saver(self.model.variables).save(os.path.join(self.checkpoint_path, 'model.ckpt'))
-        #self.saver.save(sess, os.path.join(self.checkpoint_path, 'model.ckpt'), global_step=global_step)
         with open(os.path.join(self.checkpoint_path, "episodes_trained"), 'w+') as f:
             print("[NETWK] ({name}) Saving model to:".format(name=self.name),
                   os.path.join(self.checkpoint_path, 'model.ckpt'))
@@ -184,9 +190,6 @@ class Network:
                              str(latest_checkpoint))
             tfe.Saver(self.model.variables).restore(latest_checkpoint)
 
-            # variables_names = [v.name for v in self.model.variables]
-
-
             # Restore trained episode count for model
             episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
             if os.path.isfile(episode_count_path):
@@ -218,9 +221,9 @@ class Network:
         legal_states = np.array([self.board_trans_func(move, player)[0] for move in legal_moves])
 
         scores = self.model.predict_on_batch(legal_states)
-        transformed_scores = [x if np.sign(player) > 0 else 1 - x for x in scores]
 
-        best_score_idx = np.argmax(np.array(transformed_scores))
+        best_score_idx = self.max_or_min[player](scores)
 
         best_move, best_score = legal_moves[best_score_idx], scores[best_score_idx]
 
         return (best_move, best_score)
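Dropping transformed_scores is safe here: the index that maximizes 1 - x is exactly the index that minimizes x, which is what self.max_or_min[-1] (np.argmin) now computes directly. A quick check:

    import numpy as np

    scores = np.array([0.31, 0.74, 0.52])
    assert np.argmax(1 - scores) == np.argmin(scores)  # same index either way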
@@ -263,9 +266,10 @@ class Network:
         sorted_moves_and_scores = sorted(moves_and_scores, key=itemgetter(1), reverse=(player == 1))
         best_boards = [ x[0] for x in sorted_moves_and_scores[:10] ]
 
-        scores, trans_scores = self.do_ply(best_boards, player)
+        scores = self.do_ply(best_boards, player)
 
-        best_score_idx = np.array(trans_scores).argmax()
+        best_score_idx = self.max_or_min[player](scores)
+        # best_score_idx = np.array(trans_scores).argmax()
 
         return (best_boards[best_score_idx], scores[best_score_idx])
@@ -308,7 +312,7 @@ class Network:
 
         # print(time.time() - start)
 
-        # start = time.time()
+        start = time.time()
 
         all_scores = self.model.predict_on_batch(np.array(test_list))
 
@@ -319,10 +323,10 @@ class Network:
             from_idx += length
 
         means_splits = [tf.reduce_mean(scores) for scores in split_scores]
-        transformed_means_splits = [x if player == 1 else (1-x) for x in means_splits]
-        # print(time.time() - start)
 
-        return (means_splits, transformed_means_splits)
+        # print(time.time() - start)
+        # print("/"*50)
+        return means_splits
 
 
     def eval(self, episode_count, trained_eps = 0):
@@ -363,7 +367,6 @@ class Network:
         sys.stderr.write(
             "[EVAL ] Evaluating {eps} episode(s) with method '{method}'\n".format(eps=episodes, method=method))
 
-
         if method == 'pubeval':
             outcomes = []
             for i in range(1, episodes + 1):
@@ -454,10 +457,8 @@ class Network:
         :return:
         """
 
-        difference_in_vals = 0
-
-
         self.restore_model()
+        average_diffs = 0
         start_time = time.time()
 
         def print_time_estimate(eps_completed):
@@ -479,26 +480,26 @@ class Network:
             sys.stderr.write("[TRAIN] Episode {}".format(episode + trained_eps))
             # TODO decide which player should be here
 
-            player = 1
+            # player = 1
+            player = random.choice([-1,1])
             prev_board = Board.initial_state
             i = 0
+            difference_in_values = 0
             while Board.outcome(prev_board) is None:
                 i += 1
                 self.global_step += 1
 
                 cur_board, cur_board_value = self.make_move(prev_board,
                                                             (random.randrange(1, 7), random.randrange(1, 7)),
                                                             player)
 
-                difference_in_vals += abs((cur_board_value - self.eval_state(self.board_trans_func(prev_board, player))))
+                difference_in_values += abs((cur_board_value - self.eval_state(self.board_trans_func(prev_board, player))))
 
                 if self.config['verbose']:
-                    print("Difference in values:", difference_in_vals)
+                    print("Difference in values:", difference_in_values)
                     print("Current board value :", cur_board_value)
                     print("Current board is :\n", cur_board)
 
                 # adjust weights
                 if Board.outcome(cur_board) is None:
                     self.do_backprop(self.board_trans_func(prev_board, player), cur_board_value)
@@ -512,6 +513,10 @@ class Network:
             final_score = np.array([Board.outcome(final_board)[1]])
             scaled_final_score = ((final_score + 2) / 4)
 
+            difference_in_values += abs(scaled_final_score-cur_board_value)
+
+            average_diffs += (difference_in_values[0][0] / (i+1))
+
             self.do_backprop(self.board_trans_func(prev_board, player), scaled_final_score.reshape(1,1))
 
             sys.stderr.write("\n")
@@ -524,8 +529,9 @@ class Network:
             print_time_estimate(episode)
 
         sys.stderr.write("[TRAIN] Saving model for final episode...\n")
 
         self.save_model(episode+trained_eps)
 
-        return outcomes, difference_in_vals[0][0]
+        return outcomes, average_diffs/len(outcomes)
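The second return value is now a mean of per-episode means: each episode sums |V(next) - V(prev)| over its i moves plus the final scaled-outcome step, divides by i + 1, and those per-episode averages are in turn divided by the episode count. A sketch of that accounting (assumed helper, one list of per-move absolute diffs per episode):

    def average_value_diff(per_episode_diffs):
        # per_episode_diffs: one list of |V(s_t+1) - V(s_t)| values per episode
        episode_means = [sum(diffs) / len(diffs) for diffs in per_episode_diffs]
        return sum(episode_means) / len(episode_means)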
@@ -57,4 +57,11 @@ boards = {initial_state,
 
 # print(network.calculate_1_ply(Board.initial_state, [3,2], 1))
 
-network.play_against_network()
+diff = [0, 0]
+val = network.eval_state(Board.board_features_quack_fat(initial_state, 1))
+print(val)
+diff[0] += abs(-1-val)
+diff[1] += 1
+
+print(diff[1])
+