diff --git a/board.py b/board.py
index 0233cf2..655dfe5 100644
--- a/board.py
+++ b/board.py
@@ -51,7 +51,6 @@ class Board:
    #     board += ([1, 0] if np.sign(player) > 0 else [0, 1])
    #     return np.array(board).reshape(1,30)
 
-
    # quack-fatter
    @staticmethod
    def board_features_quack_norm(board, player):
@@ -66,7 +65,7 @@ class Board:
        board.append(15 - sum(positives))
        board.append(-15 - sum(negatives))
        board += ([1, 0] if np.sign(player) > 0 else [0, 1])
-        return np.array(board).reshape(1,30)
+        return np.array(board).reshape(1, 30)
 
    # tesauro
    @staticmethod
@@ -95,9 +94,32 @@ class Board:
            board_rep += bar_trans(board, player)
            board_rep += (15 - Board.num_of_checkers_for_player(board, player),)
 
-        board_rep += ([1,0] if cur_player == 1 else [1,0])
+        board_rep += ([1, 0] if cur_player == 1 else [0, 1])
 
-        return np.array(board_rep).reshape(1,198)
+        return np.array(board_rep).reshape(1, 198)
+
+
+    @staticmethod
+    def board_features_tesauro_fat(board, cur_player):
+        def ordinary_trans(val, player):
+            # unary ("thermometer") encoding: one slot per checker, all 15 spelled out
+            abs_val = val * player
+            return tuple(1 if i < abs_val else 0 for i in range(15))
+
+        def bar_trans(board, player):
+            if player == 1: return (abs(board[0]/2),)
+            elif player == -1: return (abs(board[25]/2),)
+
+        board_rep = []
+        for player in [1, -1]:
+            for x in board[1:25]:
+                board_rep += ordinary_trans(x, player)
+            board_rep += bar_trans(board, player)
+            board_rep += (15 - Board.num_of_checkers_for_player(board, player),)
+
+        board_rep += ([1, 0] if cur_player == 1 else [0, 1])
+
+        return np.array(board_rep).reshape(1, len(board_rep))
 
 
    @staticmethod
diff --git a/main.py b/main.py
index 44e6c1c..4a58fab 100644
--- a/main.py
+++ b/main.py
@@ -86,7 +86,7 @@ def log_train_outcome(outcome, diff_in_values, trained_eps = 0, log_path = os.pa
               'sum': sum(outcome),
               'mean': sum(outcome) / len(outcome),
               'time': int(time.time()),
-               'average_diff_in_vals': diff_in_values/len(outcome),
+               'average_diff_in_vals': diff_in_values,
               'commit': commit
           }
 
diff --git a/network.py b/network.py
index 3178a95..1dbbbc1 100644
--- a/network.py
+++ b/network.py
@@ -21,10 +21,10 @@ class Network:
        'quack' : (28, Board.board_features_quack),
        'tesauro' : (198, Board.board_features_tesauro),
        'quack-norm' : (30, Board.board_features_quack_norm),
+        'tesauro-fat' : (726, Board.board_features_tesauro_fat),
        'tesauro-poop': (198, Board.board_features_tesauro_wrong)
    }
 
-
    def custom_tanh(self, x, name=None): return tf.scalar_mul(tf.constant(2.00), tf.tanh(x, name))
 
 
@@ -39,6 +39,11 @@ class Network:
            '0': self.make_move_0_ply
        }
 
+        self.max_or_min = {
+            1: np.argmax,
+            -1: np.argmin
+        }
+
        tf.enable_eager_execution()
 
        xavier_init = tf.contrib.layers.xavier_initializer()
@@ -106,7 +111,7 @@ class Network:
        self.learning_rate = tf.maximum(self.min_learning_rate,
                                        self.exp_decay(self.max_learning_rate, self.global_step, 0.96, 50000),
                                        name="learning_rate")
-        
+
        with tf.GradientTape() as tape:
            value = self.model(prev_state.reshape(1,-1))
            grads = tape.gradient(value, self.model.variables)
@@ -144,8 +149,9 @@ class Network:
        :param episode_count:
        :return:
        """
+        tfe.Saver(self.model.variables).save(os.path.join(self.checkpoint_path, 'model.ckpt'))
 
-        #self.saver.save(sess, os.path.join(self.checkpoint_path, 'model.ckpt'), global_step=global_step)
+
        with open(os.path.join(self.checkpoint_path, "episodes_trained"), 'w+') as f:
            print("[NETWK] ({name}) Saving model to:".format(name=self.name),
                  os.path.join(self.checkpoint_path, 'model.ckpt'))
@@ -184,9 +190,6 @@ class Network:
                             str(latest_checkpoint))
            tfe.Saver(self.model.variables).restore(latest_checkpoint)
 
-            # variables_names = [v.name for v in self.model.variables]
-
-
            # Restore trained episode count for model
            episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
            if os.path.isfile(episode_count_path):
@@ -218,9 +221,9 @@ class Network:
        legal_states = np.array([self.board_trans_func(move, player)[0] for move in legal_moves])
 
        scores = self.model.predict_on_batch(legal_states)
-        transformed_scores = [x if np.sign(player) > 0 else 1 - x for x in scores]
 
-        best_score_idx = np.argmax(np.array(transformed_scores))
+        best_score_idx = self.max_or_min[player](scores)
+
        best_move, best_score = legal_moves[best_score_idx], scores[best_score_idx]
 
        return (best_move, best_score)
@@ -263,9 +266,10 @@ class Network:
        sorted_moves_and_scores = sorted(moves_and_scores, key=itemgetter(1), reverse=(player == 1))
 
        best_boards = [ x[0] for x in sorted_moves_and_scores[:10] ]
-        scores, trans_scores = self.do_ply(best_boards, player)
+        scores = self.do_ply(best_boards, player)
 
-        best_score_idx = np.array(trans_scores).argmax()
+        best_score_idx = self.max_or_min[player](scores)
+        # best_score_idx = np.array(trans_scores).argmax()
 
        return (best_boards[best_score_idx], scores[best_score_idx])
 
@@ -308,7 +312,7 @@ class Network:
 
        # print(time.time() - start)
 
-        # start = time.time()
+        start = time.time()
 
        all_scores = self.model.predict_on_batch(np.array(test_list))
 
@@ -319,10 +323,10 @@ class Network:
            from_idx += length
 
        means_splits = [tf.reduce_mean(scores) for scores in split_scores]
-        transformed_means_splits = [x if player == 1 else (1-x) for x in means_splits]
 
-        # print(time.time() - start)
-        return (means_splits, transformed_means_splits)
+        # print(time.time() - start)
+        # print("/"*50)
+        return means_splits
 
 
    def eval(self, episode_count, trained_eps = 0):
@@ -363,7 +367,6 @@ class Network:
            sys.stderr.write(
                "[EVAL ] Evaluating {eps} episode(s) with method '{method}'\n".format(eps=episodes, method=method))
 
-
            if method == 'pubeval':
                outcomes = []
                for i in range(1, episodes + 1):
@@ -454,10 +457,8 @@ class Network:
        :param trained_eps:
        :return:
        """
-        difference_in_vals = 0
-
        self.restore_model()
-
+        average_diffs = 0
        start_time = time.time()
 
        def print_time_estimate(eps_completed):
@@ -479,26 +480,26 @@ class Network:
            sys.stderr.write("[TRAIN] Episode {}".format(episode + trained_eps))
            # TODO decide which player should be here
-            player = 1
+            # player = 1
+            player = random.choice([-1, 1])
 
            prev_board = Board.initial_state
            i = 0
+            difference_in_values = 0
            while Board.outcome(prev_board) is None:
                i += 1
                self.global_step += 1
-
                cur_board, cur_board_value = self.make_move(prev_board,
                                                            (random.randrange(1, 7), random.randrange(1, 7)),
                                                            player)
 
-                difference_in_vals += abs((cur_board_value - self.eval_state(self.board_trans_func(prev_board, player))))
+                difference_in_values += abs(cur_board_value - self.eval_state(self.board_trans_func(prev_board, player)))
 
                if self.config['verbose']:
-                    print("Difference in values:", difference_in_vals)
+                    print("Difference in values:", difference_in_values)
                    print("Current board value :", cur_board_value)
                    print("Current board is :\n",cur_board)
 
-                # adjust weights
                if Board.outcome(cur_board) is None:
                    self.do_backprop(self.board_trans_func(prev_board, player), cur_board_value)
@@ -511,7 +512,11 @@ class Network:
            outcomes.append(Board.outcome(final_board)[1])
            final_score = np.array([Board.outcome(final_board)[1]])
            scaled_final_score = ((final_score + 2) / 4)
-
+
+            difference_in_values += abs(scaled_final_score - cur_board_value)
+
+            average_diffs += (difference_in_values[0][0] / (i+1))
+
            self.do_backprop(self.board_trans_func(prev_board, player), scaled_final_score.reshape(1,1))
 
            sys.stderr.write("\n")
@@ -524,8 +529,9 @@ class Network:
                print_time_estimate(episode)
 
        sys.stderr.write("[TRAIN] Saving model for final episode...\n")
+
        self.save_model(episode+trained_eps)
 
-        return outcomes, difference_in_vals[0][0]
+        return outcomes, average_diffs/len(outcomes)
diff --git a/network_test.py b/network_test.py
index 1bcb878..7325ff6 100644
--- a/network_test.py
+++ b/network_test.py
@@ -57,4 +57,11 @@ boards = {initial_state,
 
 # print(network.calculate_1_ply(Board.initial_state, [3,2], 1))
 
-network.play_against_network()
\ No newline at end of file
+
+diff = [0, 0]
+val = network.eval_state(Board.board_features_quack_fat(initial_state, 1))
+print(val)
+diff[0] += abs(-1 - val)
+diff[1] += 1
+
+print(diff[1])
\ No newline at end of file
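
A quick sanity check of the new 'tesauro-fat' encoding (an illustration, not part of the patch; it assumes board.py is importable and exposes Board.initial_state as above): the feature width of 726 registered in network.py follows from 2 players * (24 points * 15 unary slots + 1 bar value + 1 borne-off count) + 2 turn-indicator flags.

    from board import Board

    # 2 * (24*15 + 1 + 1) + 2 == 726, the width registered in network.py
    features = Board.board_features_tesauro_fat(Board.initial_state, 1)
    assert features.shape == (1, 726)
    print(features.shape)  # (1, 726)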