tesauro fat and diffs in values

2018-05-22 15:10:41 +02:00 · 2018-05-22 15:10:41 +02:00 · d426c1c3b5
commit d426c1c3b5
parent c31bc39780
4 changed files with 95 additions and 30 deletions
--- a/board.py
+++ b/board.py
@@ -51,7 +51,6 @@ class Board:
        # board += ([1, 0] if np.sign(player) > 0 else [0, 1])
        # return np.array(board).reshape(1,30)

-
    # quack-fatter
    @staticmethod
    def board_features_quack_norm(board, player):
@@ -66,7 +65,7 @@ class Board:
        board.append(15 - sum(positives))
        board.append(-15 - sum(negatives))
        board += ([1, 0] if np.sign(player) > 0 else [0, 1])
-        return np.array(board).reshape(1,30)
+        return np.array(board).reshape(1, 30)

    # tesauro
    @staticmethod
@@ -95,9 +94,62 @@ class Board:
            board_rep += bar_trans(board, player)
            board_rep += (15 - Board.num_of_checkers_for_player(board, player),)

-        board_rep += ([1,0] if cur_player == 1 else [1,0])
+        board_rep += ([1, 0] if cur_player == 1 else [0, 1])

-        return np.array(board_rep).reshape(1,198)
+        return np.array(board_rep).reshape(1, 198)
+
+
+    @staticmethod
+    def board_features_tesauro_fat(board, cur_player):  # flatten board into one feature row; cur_player picks the trailing turn flag
+        def ordinary_trans(val, player):  # 15-slot tuple: the first val*player slots are 1, the rest 0
+            abs_val = val*player  # positive only when val and player share a sign
+            if abs_val <= 0:  # zero or opposite sign: contributes an all-zero tuple
+                return (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 1:
+                return (1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 2:
+                return (1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 3:
+                return (1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 4:
+                return (1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 5:
+                return (1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 6:
+                return (1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 7:
+                return (1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 8:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 9:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 10:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0)
+            elif abs_val == 11:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0)
+            elif abs_val == 12:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0)
+            elif abs_val == 13:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0)
+            elif abs_val == 14:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)
+            elif abs_val == 15:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
+            # NOTE(review): abs_val > 15 or non-integer matches no branch -> implicit None, and `board_rep += None` below raises TypeError
+        def bar_trans(board, player):  # 1-tuple: abs(board[0]/2) for player 1, abs(board[25]/2) for player -1; any other player -> None
+            if   player == 1: return (abs(board[0]/2),)
+            elif player == -1: return (abs(board[25]/2),)
+        # Per player (1, then -1): 24 * 15 slots from board[1:25], one bar value, one "15 - checker count" value
+        board_rep = []
+        for player in [1, -1]:
+            for x in board[1:25]:
+                board_rep += ordinary_trans(x, player)
+            board_rep += bar_trans(board, player)
+            board_rep += (15 - Board.num_of_checkers_for_player(board, player),)  # presumably the borne-off count -- confirm against the helper
+        # Last two entries: [1, 0] when cur_player == 1, otherwise [0, 1]
+        board_rep += ([1, 0] if cur_player == 1 else [0, 1])
+        # 2 * (24 * 15 + 1 + 1) + 2 = 726 values, returned as a single row of shape (1, 726)
+        return np.array(board_rep).reshape(1, len(board_rep))


    @staticmethod
--- a/main.py
+++ b/main.py
@@ -84,7 +84,7 @@ def log_train_outcome(outcome, diff_in_values, trained_eps = 0, log_path = os.pa
                    'sum': sum(outcome),
                    'mean': sum(outcome) / len(outcome),
                    'time': int(time.time()),
-                    'average_diff_in_vals': diff_in_values/len(outcome)
+                    'average_diff_in_vals': diff_in_values
    }

    with open(log_path, 'a+') as f:
--- a/network.py
+++ b/network.py
@@ -21,10 +21,10 @@ class Network:
        'quack'       : (28, Board.board_features_quack),
        'tesauro'     : (198, Board.board_features_tesauro),
        'quack-norm'  : (30, Board.board_features_quack_norm),
+        'tesauro-fat' : (726, Board.board_features_tesauro_fat),
        'tesauro-poop': (198, Board.board_features_tesauro_wrong)
    }

-
    def custom_tanh(self, x, name=None):
        return tf.scalar_mul(tf.constant(2.00), tf.tanh(x, name))

@@ -39,6 +39,11 @@ class Network:
            '0': self.make_move_0_ply
        }

+        self.max_or_min = {
+            1: np.argmax,
+            -1: np.argmin
+        }
+
        tf.enable_eager_execution()

        xavier_init = tf.contrib.layers.xavier_initializer()
@@ -106,7 +111,7 @@ class Network:
        self.learning_rate = tf.maximum(self.min_learning_rate,
                                        self.exp_decay(self.max_learning_rate, self.global_step, 0.96, 50000),
                                        name="learning_rate")
-  
+
        with tf.GradientTape() as tape:
            value = self.model(prev_state.reshape(1,-1))
        grads = tape.gradient(value, self.model.variables)
@@ -144,8 +149,9 @@ class Network:
        :param episode_count:
        :return:
        """
+
        tfe.Saver(self.model.variables).save(os.path.join(self.checkpoint_path, 'model.ckpt'))
-        #self.saver.save(sess, os.path.join(self.checkpoint_path, 'model.ckpt'), global_step=global_step)
+
        with open(os.path.join(self.checkpoint_path, "episodes_trained"), 'w+') as f:
            print("[NETWK] ({name}) Saving model to:".format(name=self.name),
                  os.path.join(self.checkpoint_path, 'model.ckpt'))
@@ -184,9 +190,6 @@ class Network:
                  str(latest_checkpoint))
            tfe.Saver(self.model.variables).restore(latest_checkpoint)

-            # variables_names = [v.name for v in self.model.variables]
-
-
            # Restore trained episode count for model
            episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
            if os.path.isfile(episode_count_path):
@@ -218,9 +221,9 @@ class Network:
        legal_states = np.array([self.board_trans_func(move, player)[0] for move in legal_moves])

        scores = self.model.predict_on_batch(legal_states)
-        transformed_scores = [x if np.sign(player) > 0 else 1 - x for x in scores]

-        best_score_idx = np.argmax(np.array(transformed_scores))
+        best_score_idx = self.max_or_min[player](scores)
+
        best_move, best_score = legal_moves[best_score_idx], scores[best_score_idx]

        return (best_move, best_score)
@@ -263,9 +266,10 @@ class Network:
        sorted_moves_and_scores = sorted(moves_and_scores, key=itemgetter(1), reverse=(player == 1))
        best_boards = [ x[0] for x in sorted_moves_and_scores[:10] ]

-        scores, trans_scores = self.do_ply(best_boards, player)
+        scores = self.do_ply(best_boards, player)

-        best_score_idx = np.array(trans_scores).argmax()
+        best_score_idx = self.max_or_min[player](scores)
+        # best_score_idx = np.array(trans_scores).argmax()

        return (best_boards[best_score_idx], scores[best_score_idx])

@@ -308,7 +312,7 @@ class Network:

        # print(time.time() - start)

-        # start = time.time()
+        start = time.time()

        all_scores = self.model.predict_on_batch(np.array(test_list))

@@ -319,10 +323,10 @@ class Network:
            from_idx += length

        means_splits = [tf.reduce_mean(scores) for scores in split_scores]
-        transformed_means_splits = [x if player == 1 else (1-x) for x in means_splits]
-        # print(time.time() - start)

-        return (means_splits, transformed_means_splits)
+        # print(time.time() - start)
+        # print("/"*50)
+        return means_splits


    def eval(self, episode_count, trained_eps = 0):
@@ -363,7 +367,6 @@ class Network:
            sys.stderr.write(
                "[EVAL ] Evaluating {eps} episode(s) with method '{method}'\n".format(eps=episodes, method=method))

-
            if method == 'pubeval':
                outcomes = []
                for i in range(1, episodes + 1):
@@ -454,10 +457,8 @@ class Network:
        :return:
        """

-        difference_in_vals = 0
-
        self.restore_model()
-
+        average_diffs = 0
        start_time = time.time()

        def print_time_estimate(eps_completed):
@@ -479,26 +480,26 @@ class Network:
            sys.stderr.write("[TRAIN] Episode {}".format(episode + trained_eps))
            # TODO decide which player should be here

-            player = 1
+            # player = 1
+            player = random.choice([-1,1])
            prev_board = Board.initial_state
            i = 0
+            difference_in_values = 0
            while Board.outcome(prev_board) is None:
                i += 1
                self.global_step += 1

-
                cur_board, cur_board_value = self.make_move(prev_board,
                                                            (random.randrange(1, 7), random.randrange(1, 7)),
                                                            player)

-                difference_in_vals += abs((cur_board_value - self.eval_state(self.board_trans_func(prev_board, player))))
+                difference_in_values += abs((cur_board_value - self.eval_state(self.board_trans_func(prev_board, player))))

                if self.config['verbose']:
                    print("Difference in values:", difference_in_vals)
                    print("Current board value :", cur_board_value)
                    print("Current board is    :\n",cur_board)

-
                # adjust weights
                if Board.outcome(cur_board) is None:
                    self.do_backprop(self.board_trans_func(prev_board, player), cur_board_value)
@@ -511,7 +512,11 @@ class Network:
            outcomes.append(Board.outcome(final_board)[1])
            final_score = np.array([Board.outcome(final_board)[1]])
            scaled_final_score = ((final_score + 2) / 4)
-    
+
+            difference_in_values += abs(scaled_final_score-cur_board_value)
+
+            average_diffs += (difference_in_values[0][0] / (i+1))
+
            self.do_backprop(self.board_trans_func(prev_board, player), scaled_final_score.reshape(1,1))

            sys.stderr.write("\n")
@@ -524,8 +529,9 @@ class Network:
                print_time_estimate(episode)

        sys.stderr.write("[TRAIN] Saving model for final episode...\n")
+
        self.save_model(episode+trained_eps)

-        return outcomes, difference_in_vals[0][0]
+        return outcomes, average_diffs/len(outcomes)


--- a/network_test.py
+++ b/network_test.py
@@ -57,4 +57,11 @@ boards = {initial_state,

 # print(network.calculate_1_ply(Board.initial_state, [3,2], 1))

-network.play_against_network()
+
+diff = [0, 0]
+val = network.eval_state(Board.board_features_quack_fat(initial_state, 1))
+print(val)
+diff[0] += abs(-1-val)
+diff[1] += 1
+
+print(diff[1])