No longer use n_ply; it is far too slow.
Added extra logging: the average difference in values between trainings is now logged. Also fixed a bug with the length of quack-norm, and added the CLI argument --use-baseline; when set, the baseline model is used.
This commit is contained in:
parent 1db469709a
commit 1f8485f54e
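For context on the new metric mentioned in the commit message: during training the commit accumulates the absolute change in the network's value estimate for each position and, when logging, divides that total by the number of episodes. A minimal sketch of that computation, using a hypothetical list of (previous value, current value) pairs instead of the repository's eval_state calls:

# Minimal sketch of the logged metric (hypothetical inputs, not the commit's literal code).
def average_value_difference(value_pairs, episodes):
    # value_pairs: (previous value estimate, current value estimate) per training step
    total_diff = sum(abs(cur - prev) for prev, cur in value_pairs)
    # The commit divides the accumulated difference by the number of episodes.
    return total_diff / episodes

# Example: three steps over two episodes.
print(average_value_difference([(0.52, 0.55), (0.55, 0.48), (0.30, 0.31)], episodes=2))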
board.py (54 changed lines)
@@ -62,7 +62,9 @@ class Board:
        negatives = [x if x < 0 else 0 for x in board]
        board[0] = board[0] / 2
        board[25] = board[25] / 2
        board = [board[x] / 15 for x in range(1,25)]
        board = [board[x] if x == 0 or 25 else board[x] / 15 for x in range(0, 26)]

        board.append(15 - sum(positives))
        board.append(-15 - sum(negatives))
        board += ([1, 0] if np.sign(player) > 0 else [0, 1])
@@ -100,31 +102,31 @@ class Board:
        return np.array(board_rep).reshape(1,198)

    # @staticmethod
    # def board_features_tesauro(board, cur_player):
    #     features = []
    #     for player in [-1,1]:
    #         sum = 0.0
    #         for board_range in range(1,25):
    #             pin = board[board_range]
    #             #print("PIIIN:",pin)
    #             feature = [0.0]*4
    #             if np.sign(pin) == np.sign(player):
    #                 sum += abs(pin)
    #                 for i in range(min(abs(pin), 3)):
    #                     feature[i] = 1
    #                 if (abs(pin) > 3):
    #                     feature[3] = (abs(pin)-3)/2
    #             features += feature
    #         #print("SUUUM:",sum)
    #         # Append the amount of men on the bar of the current player divided by 2
    #         features.append((board[0] if np.sign(player) < 0 else board[25]) / 2.0)
    #         # Calculate how many pieces there must be in the home state and divide it by 15
    #         features.append((15 - sum) / 15)
    #     features += ([1,0] if np.sign(cur_player) > 0 else [0,1])
    #     test = np.array(features).reshape(1,-1)
    #     #print("TEST:",test)
    #     return test

    @staticmethod
    def board_features_tesauro_wrong(board, cur_player):
        features = []
        for player in [-1,1]:
            sum = 0.0
            for board_range in range(1,25):
                pin = board[board_range]
                #print("PIIIN:",pin)
                feature = [0.0]*4
                if np.sign(pin) == np.sign(player):
                    sum += abs(pin)
                    for i in range(min(abs(pin), 3)):
                        feature[i] = 1
                    if (abs(pin) > 3):
                        feature[3] = (abs(pin)-3)/2
                features += feature
            #print("SUUUM:",sum)
            # Append the amount of men on the bar of the current player divided by 2
            features.append((board[0] if np.sign(player) < 0 else board[25]) / 2.0)
            # Calculate how many pieces there must be in the home state and divide it by 15
            features.append((15 - sum) / 15)
        features += ([1,0] if np.sign(cur_player) > 0 else [0,1])
        test = np.array(features).reshape(1,-1)
        #print("TEST:",test)
        return test
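Worth noting about the quack-norm change in the first board.py hunk above: in Python the condition `x == 0 or 25` is always truthy (it parses as `(x == 0) or 25`), so the new comprehension never actually divides by 15. A sketch of what the normalization presumably intends, with the bar points at indices 0 and 25 halved and the 24 regular points scaled by 15 (an assumption based on the surrounding lines):

def normalize_quack_board(board):
    # Presumed intent: halve the bar points (indices 0 and 25), scale the rest by 15.
    return [board[x] / 2 if x in (0, 25) else board[x] / 15 for x in range(26)]

# Example: a 26-slot board with two checkers on the bar and five on point 6.
example = [2] + [0] * 25
example[6] = 5
print(normalize_quack_board(example))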
main.py (31 changed lines)
@@ -33,6 +33,8 @@ parser.add_argument('--list-models', action='store_true',
                    help='list all known models')
parser.add_argument('--force-creation', action='store_true',
                    help='force model creation if model does not exist')
parser.add_argument('--use-baseline', action='store_true',
                    help='use the baseline model, note, has size 28')

args = parser.parse_args()
@@ -53,8 +55,9 @@ config = {
    'train_perpetually': args.train_perpetually,
    'model_storage_path': 'models',
    'bench_storage_path': 'bench',
    'board_representation': 'quack',
    'force_creation': args.force_creation
    'board_representation': 'quack-fat',
    'force_creation': args.force_creation,
    'use_baseline': args.use_baseline
}

# Create models folder
@@ -72,18 +75,26 @@ if not os.path.isdir(log_path):

# Define helper functions
def log_train_outcome(outcome, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "train.log")):
def log_train_outcome(outcome, diff_in_values, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "train.log")):
    format_vars = { 'trained_eps': trained_eps,
                    'count': len(train_outcome),
                    'sum': sum(train_outcome),
                    'mean': sum(train_outcome) / len(train_outcome),
                    'time': int(time.time())
                    'count': len(outcome),
                    'sum': sum(outcome),
                    'mean': sum(outcome) / len(outcome),
                    'time': int(time.time()),
                    'average_diff_in_vals': diff_in_values/len(outcome)
                  }
    with open(log_path, 'a+') as f:
        f.write("{time};{trained_eps};{count};{sum};{mean}".format(**format_vars) + "\n")
        f.write("{time};{trained_eps};{count};{sum};{mean};{average_diff_in_vals}".format(**format_vars) + "\n")


def log_eval_outcomes(outcomes, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "eval.log")):
    """
    :param outcomes:
    :param average_diff_in_value:
    :param trained_eps:
    :param log_path:
    :return:
    """
    for outcome in outcomes:
        scores = outcome[1]
        format_vars = { 'trained_eps': trained_eps,
@@ -137,9 +148,9 @@ if __name__ == "__main__":
        network = Network(config, config['model'])
        start_episode = network.episodes_trained
        while True:
            train_outcome = network.train_model(episodes = episode_count, trained_eps = start_episode)
            train_outcome, diff_in_values = network.train_model(episodes = episode_count, trained_eps = start_episode)
            start_episode += episode_count
            log_train_outcome(train_outcome, trained_eps = start_episode)
            log_train_outcome(train_outcome, diff_in_values, trained_eps = start_episode)
            if config['eval_after_train']:
                eval_outcomes = network.eval(trained_eps = start_episode)
                log_eval_outcomes(eval_outcomes, trained_eps = start_episode)
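For reference, a train.log line written after this change carries six semicolon-separated fields in the order shown in the format string above. A small sketch of a reader for that format (the helper parse_train_log_line is hypothetical and not part of the repository):

def parse_train_log_line(line):
    # Field order taken from the new format string:
    # time;trained_eps;count;sum;mean;average_diff_in_vals
    time_s, trained_eps, count, total, mean, avg_diff = line.strip().split(";")
    return {
        "time": int(time_s),
        "trained_eps": int(trained_eps),
        "count": int(count),
        "sum": float(total),
        "mean": float(mean),
        "average_diff_in_vals": float(avg_diff),
    }

# Example line in the new format.
print(parse_train_log_line("1521883200;1000;100;37;0.37;0.0123"))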
network.py (42 changed lines)
@@ -139,7 +139,7 @@ class Network:
        if os.path.isfile(episode_count_path):
            with open(episode_count_path, 'r') as f:
                self.config['start_episode'] = int(f.read())
        elif glob.glob(os.path.join(os.path.join(self.config['model_storage_path'], "baseline_model"), 'model.ckpt*.index')):
        elif self.config['use_baseline'] and glob.glob(os.path.join(os.path.join(self.config['model_storage_path'], "baseline_model"), 'model.ckpt*.index')):
            checkpoint_path = os.path.join(self.config['model_storage_path'], "baseline_model")
            latest_checkpoint = tf.train.latest_checkpoint(checkpoint_path)
            print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
@@ -157,7 +157,7 @@ class Network:
            exit()

    #def make_move(self, sess, board, roll, player):
    def make_move(self, sess, board, roll, player):
        """
        Find the best move given a board, roll and a player, by finding all possible states one can go to
        and then picking the best, by using the network to evaluate each state. The highest score is picked
@@ -169,14 +169,14 @@ class Network:
        :param player: Current player
        :return: A pair of the best state to go to, together with the score of that state
        """
        # legal_moves = Board.calculate_legal_states(board, player, roll)
        # moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
        # scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
        # best_score_index = np.array(scores).argmax()
        # best_move_pair = moves_and_scores[best_score_index]
        # return best_move_pair
        legal_moves = Board.calculate_legal_states(board, player, roll)
        moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
        scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
        best_score_index = np.array(scores).argmax()
        best_move_pair = moves_and_scores[best_score_index]
        return best_move_pair

    def make_move(self, sess, board, roll, player, n = 1):
    def make_move_n_ply(self, sess, board, roll, player, n = 1):
        best_pair = self.calc_n_ply(n, sess, board, player, roll)
        return best_pair
@@ -201,13 +201,7 @@ class Network:
        zero_ply_moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in init_legal_states]

        # pythons reverse is in place and I can't call [:15] on it, without applying it to an object like so. Fuck.
        best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1))

        # They're sorted from smallest to largest, therefore we wan't to reverse if the current player is 1, since
        # player 1 wishes to maximize. It's not needed for player -1, since that player seeks to minimize.
        if player == 1:
            best_fifteen.reverse()
        best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1), reverse=player==1)

        best_fifteen_boards = [x[0] for x in best_fifteen[:10]]
@@ -228,14 +222,9 @@ class Network:
        zero_ply_moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in init_legal_states]

        # pythons reverse is in place and I can't call [:15] on it, without applying it to an object like so. Fuck.
        sorted_moves_and_scores = sorted(zero_ply_moves_and_scores, key=itemgetter(1))
        sorted_moves_and_scores = sorted(zero_ply_moves_and_scores, key=itemgetter(1), reverse=player==1)

        # They're sorted from smallest to largest, therefore we wan't to reverse if the current player is 1, since
        # player 1 wishes to maximize. It's not needed for player -1, since that player seeks to minimize.
        if player == 1:
            sorted_moves_and_scores.reverse()

        best_boards = [x[0] for x in sorted_moves_and_scores[:10]]

        best_move_score_pair = self.n_ply(n_init, sess, best_boards, player)
@@ -365,7 +354,7 @@ class Network:
        all_rolls = gen_21_rolls()

        all_rolls_scores = []

        count = 0
        # loop over boards
        for a_board in boards:
            a_board_scores = []
@@ -375,7 +364,7 @@ class Network:

                # find all states we can get to, given the board and roll and the opposite player
                all_rolls_boards = Board.calculate_legal_states(a_board, player*-1, roll)

                count += len(all_rolls_boards)
                # find scores for each board found above
                spec_roll_scores = [self.eval_state(sess, self.board_trans_func(new_board, player*-1))
                                    for new_board in all_rolls_boards]
@@ -393,6 +382,7 @@ class Network:
            all_rolls_scores.append(sum(a_board_scores)/len(a_board_scores))

        # return all the average scores
        print(count)
        return all_rolls_scores
@@ -508,6 +498,7 @@ class Network:

    def train_model(self, episodes=1000, save_step_size=100, trained_eps=0):
        with tf.Session() as sess:
            difference_in_vals = 0
            writer = tf.summary.FileWriter("/tmp/log/tf", sess.graph)

            sess.run(tf.global_variables_initializer())
@@ -552,6 +543,7 @@ class Network:
                                                  (random.randrange(1, 7), random.randrange(1, 7)),
                                                  player)

                difference_in_vals += abs((cur_board_value - self.eval_state(sess, self.board_trans_func(prev_board, player))))


                # adjust weights
@@ -590,6 +582,6 @@ class Network:

            writer.close()

            return outcomes
            return outcomes, difference_in_vals
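One of the cleanups above replaces the sort-then-reverse pattern with a single sorted(..., reverse=player == 1) call. A short sketch with made-up (move, score) pairs showing the two forms agree when scores are distinct:

from operator import itemgetter

# Made-up (move, score) pairs; the real code scores boards with the network.
moves_and_scores = [("a", 0.3), ("b", 0.9), ("c", 0.6)]
player = 1

# Old form: ascending sort, reversed in place when player 1 (the maximizer) moves.
old = sorted(moves_and_scores, key=itemgetter(1))
if player == 1:
    old.reverse()

# New form from this commit: the sort direction is chosen directly.
new = sorted(moves_and_scores, key=itemgetter(1), reverse=player == 1)

assert old == new
print(new)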