No longer use n_ply by default; it's far too slow.

Added extra logging: the average difference in values between
trainings is now logged.
Also fixed a bug with the length of quack-norm.
Also added a CLI argument, --use-baseline; if set, the baseline model
will be used.
Alexander Munch-Hansen 2018-05-06 20:41:07 +02:00
parent 1db469709a
commit 1f8485f54e
3 changed files with 66 additions and 61 deletions


@@ -62,7 +62,9 @@ class Board:
negatives = [x if x < 0 else 0 for x in board]
board[0] = board[0] / 2
board[25] = board[25] / 2
board = [board[x] / 15 for x in range(1,25)]
board = [board[x] if x == 0 or x == 25 else board[x] / 15 for x in range(0, 26)]
board.append(15 - sum(positives))
board.append(-15 - sum(negatives))
board += ([1, 0] if np.sign(player) > 0 else [0, 1])
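For reference, here is a minimal standalone sketch of the corrected normalization, presumably the 'quack-fat' representation selected in main.py below. The starting position, the positives line and the length of 30 are assumptions mirrored from the symmetric code in this hunk, not statements from the repository:

import numpy as np

# Illustrative board: indices 1-24 are points, 0 and 25 are the bar slots.
board = [0,
         2, 0, 0, 0, 0, -5,
         0, -3, 0, 0, 0, 5,
         -5, 0, 0, 0, 3, 0,
         5, 0, 0, 0, 0, -2,
         0]
player = 1

positives = [x if x > 0 else 0 for x in board]
negatives = [x if x < 0 else 0 for x in board]
board[0] = board[0] / 2
board[25] = board[25] / 2
board = [board[x] if x == 0 or x == 25 else board[x] / 15 for x in range(0, 26)]
board.append(15 - sum(positives))    # checkers player 1 has borne off
board.append(-15 - sum(negatives))   # checkers player -1 has borne off (negated)
board += ([1, 0] if np.sign(player) > 0 else [0, 1])

assert len(board) == 30  # 26 board slots + 2 borne-off counts + 2 turn indicator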
@@ -100,31 +102,31 @@ class Board:
return np.array(board_rep).reshape(1,198)
# @staticmethod
# def board_features_tesauro(board, cur_player):
# features = []
# for player in [-1,1]:
# sum = 0.0
# for board_range in range(1,25):
# pin = board[board_range]
# #print("PIIIN:",pin)
# feature = [0.0]*4
# if np.sign(pin) == np.sign(player):
# sum += abs(pin)
# for i in range(min(abs(pin), 3)):
# feature[i] = 1
# if (abs(pin) > 3):
# feature[3] = (abs(pin)-3)/2
# features += feature
# #print("SUUUM:",sum)
# # Append the amount of men on the bar of the current player divided by 2
# features.append((board[0] if np.sign(player) < 0 else board[25]) / 2.0)
# # Calculate how many pieces there must be in the home state and divide it by 15
# features.append((15 - sum) / 15)
# features += ([1,0] if np.sign(cur_player) > 0 else [0,1])
# test = np.array(features).reshape(1,-1)
# #print("TEST:",test)
# return test
@staticmethod
def board_features_tesauro_wrong(board, cur_player):
features = []
for player in [-1,1]:
sum = 0.0
for board_range in range(1,25):
pin = board[board_range]
#print("PIIIN:",pin)
feature = [0.0]*4
if np.sign(pin) == np.sign(player):
sum += abs(pin)
for i in range(min(abs(pin), 3)):
feature[i] = 1
if (abs(pin) > 3):
feature[3] = (abs(pin)-3)/2
features += feature
#print("SUUUM:",sum)
# Append the amount of men on the bar of the current player divided by 2
features.append((board[0] if np.sign(player) < 0 else board[25]) / 2.0)
# Calculate how many pieces there must be in the home state and divide it by 15
features.append((15 - sum) / 15)
features += ([1,0] if np.sign(cur_player) > 0 else [0,1])
test = np.array(features).reshape(1,-1)
#print("TEST:",test)
return test
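As a quick arithmetic check of why this encoding matches the reshape(1,198) used earlier; the breakdown below is my reading of the loop, not something stated in the repository:

# Per player: 24 points x 4 truncated-unary slots, plus one bar feature
# and one borne-off feature; two players plus a 2-unit turn indicator.
per_player = 24 * 4 + 1 + 1   # = 98
assert 2 * per_player + 2 == 198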

main.py (31 changed lines)

@@ -33,6 +33,8 @@ parser.add_argument('--list-models', action='store_true',
help='list all known models')
parser.add_argument('--force-creation', action='store_true',
help='force model creation if model does not exist')
parser.add_argument('--use-baseline', action='store_true',
help='use the baseline model (note: it has size 28)')
args = parser.parse_args()
@@ -53,8 +55,9 @@ config = {
'train_perpetually': args.train_perpetually,
'model_storage_path': 'models',
'bench_storage_path': 'bench',
'board_representation': 'quack',
'force_creation': args.force_creation
'board_representation': 'quack-fat',
'force_creation': args.force_creation,
'use_baseline': args.use_baseline
}
# Create models folder
@@ -72,18 +75,26 @@ if not os.path.isdir(log_path):
# Define helper functions
def log_train_outcome(outcome, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "train.log")):
def log_train_outcome(outcome, diff_in_values, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "train.log")):
format_vars = { 'trained_eps': trained_eps,
'count': len(train_outcome),
'sum': sum(train_outcome),
'mean': sum(train_outcome) / len(train_outcome),
'time': int(time.time())
'count': len(outcome),
'sum': sum(outcome),
'mean': sum(outcome) / len(outcome),
'time': int(time.time()),
'average_diff_in_vals': diff_in_values/len(outcome)
}
with open(log_path, 'a+') as f:
f.write("{time};{trained_eps};{count};{sum};{mean}".format(**format_vars) + "\n")
f.write("{time};{trained_eps};{count};{sum};{mean};{average_diff_in_vals}".format(**format_vars) + "\n")
def log_eval_outcomes(outcomes, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "eval.log")):
"""
:param outcomes:
:param average_diff_in_value:
:param trained_eps:
:param log_path:
:return:
"""
for outcome in outcomes:
scores = outcome[1]
format_vars = { 'trained_eps': trained_eps,
@@ -137,9 +148,9 @@ if __name__ == "__main__":
network = Network(config, config['model'])
start_episode = network.episodes_trained
while True:
train_outcome = network.train_model(episodes = episode_count, trained_eps = start_episode)
train_outcome, diff_in_values = network.train_model(episodes = episode_count, trained_eps = start_episode)
start_episode += episode_count
log_train_outcome(train_outcome, trained_eps = start_episode)
log_train_outcome(train_outcome, diff_in_values, trained_eps = start_episode)
if config['eval_after_train']:
eval_outcomes = network.eval(trained_eps = start_episode)
log_eval_outcomes(eval_outcomes, trained_eps = start_episode)


@@ -139,7 +139,7 @@ class Network:
if os.path.isfile(episode_count_path):
with open(episode_count_path, 'r') as f:
self.config['start_episode'] = int(f.read())
elif glob.glob(os.path.join(os.path.join(self.config['model_storage_path'], "baseline_model"), 'model.ckpt*.index')):
elif self.config['use_baseline'] and glob.glob(os.path.join(os.path.join(self.config['model_storage_path'], "baseline_model"), 'model.ckpt*.index')):
checkpoint_path = os.path.join(self.config['model_storage_path'], "baseline_model")
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_path)
print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
@@ -157,7 +157,7 @@ class Network:
exit()
#def make_move(self, sess, board, roll, player):
def make_move(self, sess, board, roll, player):
"""
Find the best move given a board, roll and a player, by finding all possible states one can go to
and then picking the best, by using the network to evaluate each state. The highest score is picked
@@ -169,14 +169,14 @@
:param player: Current player
:return: A pair of the best state to go to, together with the score of that state
"""
# legal_moves = Board.calculate_legal_states(board, player, roll)
# moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
# scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
# best_score_index = np.array(scores).argmax()
# best_move_pair = moves_and_scores[best_score_index]
# return best_move_pair
legal_moves = Board.calculate_legal_states(board, player, roll)
moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
best_score_index = np.array(scores).argmax()
best_move_pair = moves_and_scores[best_score_index]
return best_move_pair
def make_move(self, sess, board, roll, player, n = 1):
def make_move_n_ply(self, sess, board, roll, player, n = 1):
best_pair = self.calc_n_ply(n, sess, board, player, roll)
return best_pair
@@ -201,13 +201,7 @@ class Network:
zero_ply_moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in init_legal_states]
# Python's reverse is in place and I can't call [:15] on it without applying it to an object like so.
best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1))
# They're sorted from smallest to largest, therefore we want to reverse if the current player is 1, since
# player 1 wishes to maximize. It's not needed for player -1, since that player seeks to minimize.
if player == 1:
best_fifteen.reverse()
best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1), reverse=player==1)
best_fifteen_boards = [x[0] for x in best_fifteen[:10]]
@@ -228,14 +222,9 @@ class Network:
zero_ply_moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in init_legal_states]
# Python's reverse is in place and I can't call [:15] on it without applying it to an object like so.
sorted_moves_and_scores = sorted(zero_ply_moves_and_scores, key=itemgetter(1))
sorted_moves_and_scores = sorted(zero_ply_moves_and_scores, key=itemgetter(1), reverse=player==1)
# They're sorted from smallest to largest, therefore we want to reverse if the current player is 1, since
# player 1 wishes to maximize. It's not needed for player -1, since that player seeks to minimize.
if player == 1:
sorted_moves_and_scores.reverse()
best_boards = [x[0] for x in sorted_moves_and_scores[:10]]
best_move_score_pair = self.n_ply(n_init, sess, best_boards, player)
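A tiny self-contained check of the simplification used in both hunks above, namely that sorting ascending and then reversing for the maximizing player is equivalent to passing reverse=player==1; the sample scores are made up:

from operator import itemgetter

pairs = [("a", 0.2), ("b", 0.9), ("c", 0.5)]
player = 1

ascending = sorted(pairs, key=itemgetter(1))
if player == 1:
    ascending.reverse()

assert ascending == sorted(pairs, key=itemgetter(1), reverse=player == 1)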
@@ -365,7 +354,7 @@ class Network:
all_rolls = gen_21_rolls()
all_rolls_scores = []
count = 0
# loop over boards
for a_board in boards:
a_board_scores = []
@@ -375,7 +364,7 @@ class Network:
# find all states we can get to, given the board and roll and the opposite player
all_rolls_boards = Board.calculate_legal_states(a_board, player*-1, roll)
count += len(all_rolls_boards)
# find scores for each board found above
spec_roll_scores = [self.eval_state(sess, self.board_trans_func(new_board, player*-1))
for new_board in all_rolls_boards]
@@ -393,6 +382,7 @@ class Network:
all_rolls_scores.append(sum(a_board_scores)/len(a_board_scores))
# return all the average scores
print(count)
return all_rolls_scores
@@ -508,6 +498,7 @@ class Network:
def train_model(self, episodes=1000, save_step_size=100, trained_eps=0):
with tf.Session() as sess:
difference_in_vals = 0
writer = tf.summary.FileWriter("/tmp/log/tf", sess.graph)
sess.run(tf.global_variables_initializer())
@@ -552,6 +543,7 @@ class Network:
(random.randrange(1, 7), random.randrange(1, 7)),
player)
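# Track the absolute change in the network's value estimate between consecutive
# boards; log_train_outcome divides the accumulated total by the episode count.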
difference_in_vals += abs((cur_board_value - self.eval_state(sess, self.board_trans_func(prev_board, player))))
# adjust weights
@@ -590,6 +582,6 @@ class Network:
writer.close()
return outcomes
return outcomes, difference_in_vals