From 1f8485f54eacc5a9f2d81e7eabc5b90a140a3795 Mon Sep 17 00:00:00 2001
From: Alexander Munch-Hansen
Date: Sun, 6 May 2018 20:41:07 +0200
Subject: [PATCH] No longer use n_ply for move selection; it's far too slow

Added extra logging: the average difference in values between trainings is
now logged. Also fixed a bug with the length of the quack-norm feature
vector, and added the CLI argument --use-baseline; if set, the baseline
model will be used.
---
 board.py   | 54 ++++++++++++++++++++++++++++--------------------------
 main.py    | 31 +++++++++++++++++++++----------
 network.py | 42 +++++++++++++++++------------------------
 3 files changed, 66 insertions(+), 61 deletions(-)

diff --git a/board.py b/board.py
index 4bd586a..d32197c 100644
--- a/board.py
+++ b/board.py
@@ -62,7 +62,9 @@ class Board:
         negatives = [x if x < 0 else 0 for x in board]
         board[0] = board[0] / 2
         board[25] = board[25] / 2
-        board = [board[x] / 15 for x in range(1,25)]
+
+        board = [board[x] if x == 0 or x == 25 else board[x] / 15 for x in range(0, 26)]
+
         board.append(15 - sum(positives))
         board.append(-15 - sum(negatives))
         board += ([1, 0] if np.sign(player) > 0 else [0, 1])
@@ -100,31 +102,31 @@ class Board:

         return np.array(board_rep).reshape(1,198)

-    # @staticmethod
-    # def board_features_tesauro(board, cur_player):
-    #     features = []
-    #     for player in [-1,1]:
-    #         sum = 0.0
-    #         for board_range in range(1,25):
-    #             pin = board[board_range]
-    #             #print("PIIIN:",pin)
-    #             feature = [0.0]*4
-    #             if np.sign(pin) == np.sign(player):
-    #                 sum += abs(pin)
-    #                 for i in range(min(abs(pin), 3)):
-    #                     feature[i] = 1
-    #                 if (abs(pin) > 3):
-    #                     feature[3] = (abs(pin)-3)/2
-    #             features += feature
-    #         #print("SUUUM:",sum)
-    #         # Append the amount of men on the bar of the current player divided by 2
-    #         features.append((board[0] if np.sign(player) < 0 else board[25]) / 2.0)
-    #         # Calculate how many pieces there must be in the home state and divide it by 15
-    #         features.append((15 - sum) / 15)
-    #     features += ([1,0] if np.sign(cur_player) > 0 else [0,1])
-    #     test = np.array(features).reshape(1,-1)
-    #     #print("TEST:",test)
-    #     return test
+    @staticmethod
+    def board_features_tesauro_wrong(board, cur_player):
+        features = []
+        for player in [-1,1]:
+            sum = 0.0
+            for board_range in range(1,25):
+                pin = board[board_range]
+                #print("PIIIN:",pin)
+                feature = [0.0]*4
+                if np.sign(pin) == np.sign(player):
+                    sum += abs(pin)
+                    for i in range(min(abs(pin), 3)):
+                        feature[i] = 1
+                    if (abs(pin) > 3):
+                        feature[3] = (abs(pin)-3)/2
+                features += feature
+            #print("SUUUM:",sum)
+            # Append the amount of men on the bar of the current player divided by 2
+            features.append((board[0] if np.sign(player) < 0 else board[25]) / 2.0)
+            # Calculate how many pieces there must be in the home state and divide it by 15
+            features.append((15 - sum) / 15)
+        features += ([1,0] if np.sign(cur_player) > 0 else [0,1])
+        test = np.array(features).reshape(1,-1)
+        #print("TEST:",test)
+        return test
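For reference, the corrected comprehension changes the length of the quack-fat feature vector from 28 to 30 entries: 26 normalised board positions, two off-board piece counts, and a two-element player indicator. Below is a minimal standalone sketch of the fixed normalisation; the function name and the test value are illustrative assumptions, not part of the patch.

    import numpy as np

    def quack_fat_features(board, player):
        # Illustrative restatement of the fixed normalisation in the hunk above.
        # board: raw 26-entry board; index 0 and 25 are the bars, 1-24 the points.
        board = list(board)
        positives = [x if x > 0 else 0 for x in board]
        negatives = [x if x < 0 else 0 for x in board]
        board[0] = board[0] / 2
        board[25] = board[25] / 2
        # Normalise all 26 entries, leaving the two (already halved) bar entries alone.
        board = [board[x] if x == 0 or x == 25 else board[x] / 15 for x in range(0, 26)]
        board.append(15 - sum(positives))    # player 1 pieces no longer on the board
        board.append(-15 - sum(negatives))   # same count for player -1, kept negative
        board += ([1, 0] if np.sign(player) > 0 else [0, 1])
        return np.array(board).reshape(1, -1)

    assert quack_fat_features([0] * 26, 1).shape == (1, 30)

The old comprehension over range(1, 25) dropped the two bar entries and produced the 28-entry vector that the baseline model (see --use-baseline below) still expects.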
diff --git a/main.py b/main.py
index bcf891d..8916d3f 100644
--- a/main.py
+++ b/main.py
@@ -33,6 +33,8 @@ parser.add_argument('--list-models', action='store_true',
                     help='list all known models')
 parser.add_argument('--force-creation', action='store_true',
                     help='force model creation if model does not exist')
+parser.add_argument('--use-baseline', action='store_true',
+                    help='use the baseline model; note that it has size 28')

 args = parser.parse_args()
@@ -53,8 +55,9 @@ config = {
     'train_perpetually': args.train_perpetually,
     'model_storage_path': 'models',
     'bench_storage_path': 'bench',
-    'board_representation': 'quack',
-    'force_creation': args.force_creation
+    'board_representation': 'quack-fat',
+    'force_creation': args.force_creation,
+    'use_baseline': args.use_baseline
 }

 # Create models folder
@@ -72,18 +75,26 @@ if not os.path.isdir(log_path):


 # Define helper functions
-def log_train_outcome(outcome, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "train.log")):
+def log_train_outcome(outcome, diff_in_values, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "train.log")):
     format_vars = { 'trained_eps': trained_eps,
-                    'count': len(train_outcome),
-                    'sum': sum(train_outcome),
-                    'mean': sum(train_outcome) / len(train_outcome),
-                    'time': int(time.time())
+                    'count': len(outcome),
+                    'sum': sum(outcome),
+                    'mean': sum(outcome) / len(outcome),
+                    'time': int(time.time()),
+                    'average_diff_in_vals': diff_in_values/len(outcome)
     }
     with open(log_path, 'a+') as f:
-        f.write("{time};{trained_eps};{count};{sum};{mean}".format(**format_vars) + "\n")
+        f.write("{time};{trained_eps};{count};{sum};{mean};{average_diff_in_vals}".format(**format_vars) + "\n")

 def log_eval_outcomes(outcomes, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "eval.log")):
+    """
+    Log the given evaluation outcomes to the evaluation log file.
+    :param outcomes: List of evaluation outcomes to log.
+    :param trained_eps: Number of episodes the model has been trained for.
+    :param log_path: Path of the log file to append to.
+    :return: None
+    """
     for outcome in outcomes:
         scores = outcome[1]
         format_vars = { 'trained_eps': trained_eps,
@@ -137,9 +148,9 @@ if __name__ == "__main__":
         network = Network(config, config['model'])
         start_episode = network.episodes_trained
         while True:
-            train_outcome = network.train_model(episodes = episode_count, trained_eps = start_episode)
+            train_outcome, diff_in_values = network.train_model(episodes = episode_count, trained_eps = start_episode)
             start_episode += episode_count
-            log_train_outcome(train_outcome, trained_eps = start_episode)
+            log_train_outcome(train_outcome, diff_in_values, trained_eps = start_episode)
             if config['eval_after_train']:
                 eval_outcomes = network.eval(trained_eps = start_episode)
                 log_eval_outcomes(eval_outcomes, trained_eps = start_episode)
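With the change above, each line appended to train.log now carries six semicolon-separated fields: time, trained_eps, count, sum, mean and average_diff_in_vals. Below is a small sketch of how such a line could be read back; the helper name and dictionary keys are illustrative assumptions, not part of the patch.

    def read_train_log(path):
        # Parse lines in the format written by log_train_outcome above.
        entries = []
        with open(path) as f:
            for line in f:
                time, trained_eps, count, total, mean, avg_diff = line.strip().split(';')
                entries.append({
                    'time': int(time),
                    'trained_eps': int(trained_eps),
                    'count': int(count),
                    'sum': float(total),
                    'mean': float(mean),
                    'average_diff_in_vals': float(avg_diff),
                })
        return entries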
diff --git a/network.py b/network.py
index f4be4c0..f30e724 100644
--- a/network.py
+++ b/network.py
@@ -139,7 +139,7 @@ class Network:
         if os.path.isfile(episode_count_path):
             with open(episode_count_path, 'r') as f:
                 self.config['start_episode'] = int(f.read())
-        elif glob.glob(os.path.join(os.path.join(self.config['model_storage_path'], "baseline_model"), 'model.ckpt*.index')):
+        elif self.config['use_baseline'] and glob.glob(os.path.join(os.path.join(self.config['model_storage_path'], "baseline_model"), 'model.ckpt*.index')):
             checkpoint_path = os.path.join(self.config['model_storage_path'], "baseline_model")
             latest_checkpoint = tf.train.latest_checkpoint(checkpoint_path)
             print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
@@ -157,7 +157,7 @@
             exit()


-    #def make_move(self, sess, board, roll, player):
+    def make_move(self, sess, board, roll, player):
         """
         Find the best move given a board, roll and a player, by finding all possible states one can go to
         and then picking the best, by using the network to evaluate each state. The highest score is picked
@@ -169,14 +169,14 @@
         :param player: Current player
         :return: A pair of the best state to go to, together with the score of that state
         """
-        # legal_moves = Board.calculate_legal_states(board, player, roll)
-        # moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
-        # scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
-        # best_score_index = np.array(scores).argmax()
-        # best_move_pair = moves_and_scores[best_score_index]
-        # return best_move_pair
+        legal_moves = Board.calculate_legal_states(board, player, roll)
+        moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
+        scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
+        best_score_index = np.array(scores).argmax()
+        best_move_pair = moves_and_scores[best_score_index]
+        return best_move_pair

-    def make_move(self, sess, board, roll, player, n = 1):
+    def make_move_n_ply(self, sess, board, roll, player, n = 1):
         best_pair = self.calc_n_ply(n, sess, board, player, roll)
         return best_pair

@@ -201,13 +201,7 @@
         zero_ply_moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in init_legal_states]

         # pythons reverse is in place and I can't call [:15] on it, without applying it to an object like so. Fuck.
-        best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1))
-
-
-        # They're sorted from smallest to largest, therefore we wan't to reverse if the current player is 1, since
-        # player 1 wishes to maximize. It's not needed for player -1, since that player seeks to minimize.
-        if player == 1:
-            best_fifteen.reverse()
+        best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1), reverse=player==1)

         best_fifteen_boards = [x[0] for x in best_fifteen[:10]]

@@ -228,14 +222,9 @@
         zero_ply_moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in init_legal_states]

         # pythons reverse is in place and I can't call [:15] on it, without applying it to an object like so. Fuck.
-        sorted_moves_and_scores = sorted(zero_ply_moves_and_scores, key=itemgetter(1))
+        sorted_moves_and_scores = sorted(zero_ply_moves_and_scores, key=itemgetter(1), reverse=player==1)

-        # They're sorted from smallest to largest, therefore we wan't to reverse if the current player is 1, since
-        # player 1 wishes to maximize. It's not needed for player -1, since that player seeks to minimize.
-        if player == 1:
-            sorted_moves_and_scores.reverse()
-
         best_boards = [x[0] for x in sorted_moves_and_scores[:10]]

         best_move_score_pair = self.n_ply(n_init, sess, best_boards, player)
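Both hunks above replace the manual sort-then-reverse with a single sorted(..., reverse=player==1): scores sort ascending by default, so reversing for player 1 puts the highest-valued states first (player 1 maximises), while player -1 keeps the ascending order and takes the lowest-valued states (player -1 minimises). A toy illustration of that selection rule, with made-up moves and scores (not part of the patch):

    from operator import itemgetter

    # Hypothetical (move, score) pairs as produced by evaluating legal states.
    moves_and_scores = [('a', 0.2), ('b', 0.9), ('c', 0.5)]

    for player in (1, -1):
        ranked = sorted(moves_and_scores, key=itemgetter(1), reverse=player == 1)
        print(player, ranked[0][0])   # 1 -> b (highest score), -1 -> a (lowest score)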
@@ -365,7 +354,7 @@ class Network:
         all_rolls = gen_21_rolls()

         all_rolls_scores = []
-
+        count = 0
         # loop over boards
         for a_board in boards:
             a_board_scores = []
@@ -375,7 +364,7 @@ class Network:

                 # find all states we can get to, given the board and roll and the opposite player
                 all_rolls_boards = Board.calculate_legal_states(a_board, player*-1, roll)
-
+                count += len(all_rolls_boards)
                 # find scores for each board found above
                 spec_roll_scores = [self.eval_state(sess, self.board_trans_func(new_board, player*-1))
                                     for new_board in all_rolls_boards]
@@ -393,6 +382,7 @@ class Network:
             all_rolls_scores.append(sum(a_board_scores)/len(a_board_scores))

         # return all the average scores
+        print(count)
         return all_rolls_scores


@@ -508,6 +498,7 @@ class Network:

     def train_model(self, episodes=1000, save_step_size=100, trained_eps=0):
         with tf.Session() as sess:
+            difference_in_vals = 0
             writer = tf.summary.FileWriter("/tmp/log/tf", sess.graph)
             sess.run(tf.global_variables_initializer())
@@ -552,6 +543,7 @@ class Network:
                                                           (random.randrange(1, 7), random.randrange(1, 7)),
                                                           player)

+                    difference_in_vals += abs((cur_board_value - self.eval_state(sess, self.board_trans_func(prev_board, player))))

                     # adjust weights
@@ -590,6 +582,6 @@ class Network:

             writer.close()

-            return outcomes
+            return outcomes, difference_in_vals
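The difference_in_vals accumulator introduced in train_model sums, over every move of every episode, the absolute difference between the network's value of the current board and its value of the previous board; log_train_outcome then divides that sum by the number of episodes to obtain average_diff_in_vals. A toy sketch of the quantity being logged (the function name and example values are illustrative, not from the patch):

    def average_value_difference(state_values, episodes):
        # Sum |V(s_t) - V(s_{t-1})| over consecutive evaluated states, then
        # average per episode; this is the number written to train.log.
        total = sum(abs(cur - prev) for prev, cur in zip(state_values, state_values[1:]))
        return total / episodes

    print(round(average_value_difference([0.5, 0.55, 0.48, 0.6], episodes=1), 2))  # 0.24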