No longer use n_ply; it's far too slow.

Added extra logging: the average difference in values
between trainings is now logged.
Also fixed a bug with the length of quack-norm.
Also added a CLI argument, --use-baseline; if set, the baseline
model will be used.
Alexander Munch-Hansen 2018-05-06 20:41:07 +02:00
parent 1db469709a
commit 1f8485f54e
3 changed files with 66 additions and 61 deletions

board.py

@@ -62,7 +62,9 @@ class Board:
         negatives = [x if x < 0 else 0 for x in board]
         board[0] = board[0] / 2
         board[25] = board[25] / 2
-        board = [board[x] / 15 for x in range(1,25)]
+        board = [board[x] if x == 0 or x == 25 else board[x] / 15 for x in range(0, 26)]
         board.append(15 - sum(positives))
         board.append(-15 - sum(negatives))
         board += ([1, 0] if np.sign(player) > 0 else [0, 1])
@@ -100,31 +102,31 @@ class Board:
         return np.array(board_rep).reshape(1,198)

-    # @staticmethod
-    # def board_features_tesauro(board, cur_player):
-    #     features = []
-    #     for player in [-1,1]:
-    #         sum = 0.0
-    #         for board_range in range(1,25):
-    #             pin = board[board_range]
-    #             #print("PIIIN:",pin)
-    #             feature = [0.0]*4
-    #             if np.sign(pin) == np.sign(player):
-    #                 sum += abs(pin)
-    #                 for i in range(min(abs(pin), 3)):
-    #                     feature[i] = 1
-    #                 if (abs(pin) > 3):
-    #                     feature[3] = (abs(pin)-3)/2
-    #             features += feature
-    #         #print("SUUUM:",sum)
-    #         # Append the amount of men on the bar of the current player divided by 2
-    #         features.append((board[0] if np.sign(player) < 0 else board[25]) / 2.0)
-    #         # Calculate how many pieces there must be in the home state and divide it by 15
-    #         features.append((15 - sum) / 15)
-    #     features += ([1,0] if np.sign(cur_player) > 0 else [0,1])
-    #     test = np.array(features).reshape(1,-1)
-    #     #print("TEST:",test)
-    #     return test
+    @staticmethod
+    def board_features_tesauro_wrong(board, cur_player):
+        features = []
+        for player in [-1,1]:
+            sum = 0.0
+            for board_range in range(1,25):
+                pin = board[board_range]
+                feature = [0.0]*4
+                if np.sign(pin) == np.sign(player):
+                    sum += abs(pin)
+                    for i in range(min(abs(pin), 3)):
+                        feature[i] = 1
+                    if (abs(pin) > 3):
+                        feature[3] = (abs(pin)-3)/2
+                features += feature
+            # Append the amount of men on the bar of the current player divided by 2
+            features.append((board[0] if np.sign(player) < 0 else board[25]) / 2.0)
+            # Calculate how many pieces there must be in the home state and divide it by 15
+            features.append((15 - sum) / 15)
+        features += ([1,0] if np.sign(cur_player) > 0 else [0,1])
+        test = np.array(features).reshape(1,-1)
+        return test
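The quack-norm length bug fixed above: the old comprehension ran over range(1,25), so the two bar fields at indices 0 and 25 were silently dropped and the normalized board came out two entries short. The new comprehension keeps all 26 entries and only divides the 24 point fields by 15. A minimal standalone sketch of the fixed normalization, using a made-up board vector:

    # Hypothetical 26-entry board: bar fields at indices 0 and 25, points at 1..24.
    board = [2] + [0]*11 + [5, -5] + [0]*11 + [-2]
    board[0] = board[0] / 2
    board[25] = board[25] / 2
    board = [board[x] if x == 0 or x == 25 else board[x] / 15 for x in range(0, 26)]
    assert len(board) == 26  # the old range(1, 25) version yielded only 24 entries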

main.py

@@ -33,6 +33,8 @@ parser.add_argument('--list-models', action='store_true',
                     help='list all known models')
 parser.add_argument('--force-creation', action='store_true',
                     help='force model creation if model does not exist')
+parser.add_argument('--use-baseline', action='store_true',
+                    help='use the baseline model (note: it has size 28)')
 args = parser.parse_args()
@@ -53,8 +55,9 @@ config = {
     'train_perpetually': args.train_perpetually,
     'model_storage_path': 'models',
     'bench_storage_path': 'bench',
-    'board_representation': 'quack',
-    'force_creation': args.force_creation
+    'board_representation': 'quack-fat',
+    'force_creation': args.force_creation,
+    'use_baseline': args.use_baseline
 }

 # Create models folder
@@ -72,18 +75,26 @@ if not os.path.isdir(log_path):
 # Define helper functions
-def log_train_outcome(outcome, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "train.log")):
+def log_train_outcome(outcome, diff_in_values, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "train.log")):
     format_vars = { 'trained_eps': trained_eps,
-                    'count': len(train_outcome),
-                    'sum': sum(train_outcome),
-                    'mean': sum(train_outcome) / len(train_outcome),
-                    'time': int(time.time())
+                    'count': len(outcome),
+                    'sum': sum(outcome),
+                    'mean': sum(outcome) / len(outcome),
+                    'time': int(time.time()),
+                    'average_diff_in_vals': diff_in_values/len(outcome)
                   }
     with open(log_path, 'a+') as f:
-        f.write("{time};{trained_eps};{count};{sum};{mean}".format(**format_vars) + "\n")
+        f.write("{time};{trained_eps};{count};{sum};{mean};{average_diff_in_vals}".format(**format_vars) + "\n")
 def log_eval_outcomes(outcomes, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "eval.log")):
+    """
+    Log the outcomes of an evaluation run.
+
+    :param outcomes: List of evaluation outcomes; the second element of each holds the scores
+    :param trained_eps: Number of episodes the model has been trained for
+    :param log_path: Path of the log file to append to
+    :return: None
+    """
     for outcome in outcomes:
         scores = outcome[1]
         format_vars = { 'trained_eps': trained_eps,
@@ -137,9 +148,9 @@ if __name__ == "__main__":
         network = Network(config, config['model'])
         start_episode = network.episodes_trained
         while True:
-            train_outcome = network.train_model(episodes = episode_count, trained_eps = start_episode)
+            train_outcome, diff_in_values = network.train_model(episodes = episode_count, trained_eps = start_episode)
             start_episode += episode_count
-            log_train_outcome(train_outcome, trained_eps = start_episode)
+            log_train_outcome(train_outcome, diff_in_values, trained_eps = start_episode)
             if config['eval_after_train']:
                 eval_outcomes = network.eval(trained_eps = start_episode)
                 log_eval_outcomes(eval_outcomes, trained_eps = start_episode)
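After this change each train.log line holds six semicolon-separated fields: time;trained_eps;count;sum;mean;average_diff_in_vals. A small reader for that format, as a sketch (the function name is illustrative, not part of the commit):

    def read_train_log(path):
        """Parse train.log lines of the form time;trained_eps;count;sum;mean;average_diff_in_vals."""
        entries = []
        with open(path) as f:
            for line in f:
                t, eps, count, total, mean, avg_diff = line.strip().split(";")
                entries.append({'time': int(t), 'trained_eps': int(eps),
                                'count': int(count), 'sum': float(total),
                                'mean': float(mean), 'average_diff_in_vals': float(avg_diff)})
        return entries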

network.py

@@ -139,7 +139,7 @@ class Network:
         if os.path.isfile(episode_count_path):
             with open(episode_count_path, 'r') as f:
                 self.config['start_episode'] = int(f.read())
-        elif glob.glob(os.path.join(os.path.join(self.config['model_storage_path'], "baseline_model"), 'model.ckpt*.index')):
+        elif self.config['use_baseline'] and glob.glob(os.path.join(os.path.join(self.config['model_storage_path'], "baseline_model"), 'model.ckpt*.index')):
             checkpoint_path = os.path.join(self.config['model_storage_path'], "baseline_model")
             latest_checkpoint = tf.train.latest_checkpoint(checkpoint_path)
             print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
@@ -157,7 +157,7 @@ class Network:
             exit()

-    #def make_move(self, sess, board, roll, player):
+    def make_move(self, sess, board, roll, player):
         """
         Find the best move given a board, roll and a player, by finding all possible states one can go to
         and then picking the best, by using the network to evaluate each state. The highest score is picked
@@ -169,14 +169,14 @@ class Network:
         :param player: Current player
         :return: A pair of the best state to go to, together with the score of that state
         """
-        # legal_moves = Board.calculate_legal_states(board, player, roll)
-        # moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
-        # scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
-        # best_score_index = np.array(scores).argmax()
-        # best_move_pair = moves_and_scores[best_score_index]
-        # return best_move_pair
+        legal_moves = Board.calculate_legal_states(board, player, roll)
+        moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
+        scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
+        best_score_index = np.array(scores).argmax()
+        best_move_pair = moves_and_scores[best_score_index]
+        return best_move_pair

-    def make_move(self, sess, board, roll, player, n = 1):
+    def make_move_n_ply(self, sess, board, roll, player, n = 1):
         best_pair = self.calc_n_ply(n, sess, board, player, roll)
         return best_pair
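Restoring the 0-ply make_move is where the speed comes back: it calls the network once per legal move, while make_move_n_ply expands each kept candidate against all 21 distinct dice rolls. A back-of-the-envelope comparison with assumed branching numbers (only keep=10 is taken from the [:10] slice in the code; the rest are illustrative):

    moves_per_position = 20   # assumed typical number of legal moves
    rolls = 21                # distinct dice rolls
    keep = 10                 # boards kept after the 0-ply sort, per the [:10] slice

    zero_ply_evals = moves_per_position
    one_ply_evals = moves_per_position + keep * rolls * moves_per_position
    print(zero_ply_evals, one_ply_evals)  # 20 vs. 4220 network evaluations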
@@ -201,13 +201,7 @@ class Network:
         zero_ply_moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in init_legal_states]

         # Python's list.reverse is in place and can't be sliced directly, so sort in the desired order up front.
-        best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1))
-
-        # They're sorted from smallest to largest, so reverse when the current player is 1, since
-        # player 1 maximizes. Player -1 minimizes, so the ascending order already fits.
-        if player == 1:
-            best_fifteen.reverse()
+        best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1), reverse=player==1)

         best_fifteen_boards = [x[0] for x in best_fifteen[:10]]
@@ -228,14 +222,9 @@ class Network:
         zero_ply_moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in init_legal_states]

         # Python's list.reverse is in place and can't be sliced directly, so sort in the desired order up front.
-        sorted_moves_and_scores = sorted(zero_ply_moves_and_scores, key=itemgetter(1))
-
-        # They're sorted from smallest to largest, so reverse when the current player is 1, since
-        # player 1 maximizes. Player -1 minimizes, so the ascending order already fits.
-        if player == 1:
-            sorted_moves_and_scores.reverse()
+        sorted_moves_and_scores = sorted(zero_ply_moves_and_scores, key=itemgetter(1), reverse=player==1)

         best_boards = [x[0] for x in sorted_moves_and_scores[:10]]

         best_move_score_pair = self.n_ply(n_init, sess, best_boards, player)
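The two hunks above fold the conditional in-place reverse into the sort call itself. A quick standalone check that the forms agree (scores are made up; note that with tied scores the two variants may order equal items differently, since sorted is stable):

    from operator import itemgetter

    moves_and_scores = [('a', 0.2), ('b', 0.9), ('c', 0.5)]  # hypothetical (move, score) pairs
    for player in (1, -1):
        old = sorted(moves_and_scores, key=itemgetter(1))
        if player == 1:
            old.reverse()
        new = sorted(moves_and_scores, key=itemgetter(1), reverse=player==1)
        assert old == new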
@@ -365,7 +354,7 @@ class Network:
         all_rolls = gen_21_rolls()

         all_rolls_scores = []
-        count = 0
         # loop over boards
         for a_board in boards:
             a_board_scores = []
@@ -375,7 +364,7 @@ class Network:
                 # find all states we can get to, given the board and roll and the opposite player
                 all_rolls_boards = Board.calculate_legal_states(a_board, player*-1, roll)
-                count += len(all_rolls_boards)

                 # find scores for each board found above
                 spec_roll_scores = [self.eval_state(sess, self.board_trans_func(new_board, player*-1))
                                     for new_board in all_rolls_boards]
@@ -393,6 +382,7 @@ class Network:
             all_rolls_scores.append(sum(a_board_scores)/len(a_board_scores))

         # return all the average scores
-        print(count)
         return all_rolls_scores
@@ -508,6 +498,7 @@ class Network:
     def train_model(self, episodes=1000, save_step_size=100, trained_eps=0):
         with tf.Session() as sess:
+            difference_in_vals = 0
             writer = tf.summary.FileWriter("/tmp/log/tf", sess.graph)
             sess.run(tf.global_variables_initializer())
@@ -552,6 +543,7 @@ class Network:
                                                  (random.randrange(1, 7), random.randrange(1, 7)),
                                                  player)
+                    difference_in_vals += abs(cur_board_value - self.eval_state(sess, self.board_trans_func(prev_board, player)))

                     # adjust weights
@@ -590,6 +582,6 @@ class Network:
             writer.close()

-            return outcomes
+            return outcomes, difference_in_vals
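difference_in_vals accumulates |V(current board) - V(previous board)| over every move of training, and log_train_outcome writes it divided by the episode count; a shrinking average across trainings suggests the value estimates are settling. The same bookkeeping as a standalone sketch, with a stub evaluator standing in for eval_state plus board_trans_func (the episode-as-list-of-boards structure is assumed for illustration; in the commit the accumulation happens inline in train_model):

    def average_value_difference(episodes, value_of):
        """Mean absolute change in predicted value between consecutive board states."""
        difference_in_vals = 0.0
        for episode in episodes:  # each episode: the sequence of boards seen in one game
            for prev_board, cur_board in zip(episode, episode[1:]):
                difference_in_vals += abs(value_of(cur_board) - value_of(prev_board))
        return difference_in_vals / len(episodes)  # mirrors diff_in_values / len(outcome)

    # Toy usage: boards are plain numbers, the "network" just scales them.
    print(average_value_difference([[0, 1, 2], [3, 5]], value_of=lambda b: b * 0.1))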