# backgammon/network.py

import tensorflow as tf
import numpy as np
from board import Board
import os
import time
import sys
import random
from eval import Eval
import glob
from operator import itemgetter


class Network:
    # board_features_quack has size 28
    # board_features_quack_fat has size 30
    # board_features_tesauro has size 198
    board_reps = {
        'quack-fat'  : (30, Board.board_features_quack_fat),
        'quack'      : (28, Board.board_features_quack),
        'tesauro'    : (198, Board.board_features_tesauro),
        'quack-norm' : (30, Board.board_features_quack_norm)
    }

    def custom_tanh(self, x, name=None):
        return tf.scalar_mul(tf.constant(2.00), tf.tanh(x, name))

    def __init__(self, config, name):
        self.config = config
        self.checkpoint_path = os.path.join(config['model_storage_path'], config['model'])

        self.name = name

        # Set board representation from config
        self.input_size, self.board_trans_func = Network.board_reps[
            self.config['board_representation']
        ]
        self.output_size = 1
        self.hidden_size = 40

        self.max_learning_rate = 0.1
        self.min_learning_rate = 0.001

        self.global_step = tf.Variable(0, trainable=False, name="global_step")

        self.learning_rate = tf.maximum(self.min_learning_rate,
                                        tf.train.exponential_decay(self.max_learning_rate,
                                                                   self.global_step, 50000,
                                                                   0.96,
                                                                   staircase=True),
                                        name="learning_rate")
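
        # With these settings, the schedule computed by exponential_decay (staircase mode) amounts to:
        #
        #   learning_rate = max(0.001, 0.1 * 0.96 ** (global_step // 50000))
        #
        # i.e. the rate drops by 4% every 50000 global steps and is clamped below at min_learning_rate.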

        # Restore trained episode count for model
        episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
        if os.path.isfile(episode_count_path):
            with open(episode_count_path, 'r') as f:
                self.episodes_trained = int(f.read())
        else:
            self.episodes_trained = 0

        self.x = tf.placeholder('float', [1, self.input_size], name='input')
        self.value_next = tf.placeholder('float', [1, self.output_size], name="value_next")

        xavier_init = tf.contrib.layers.xavier_initializer()

        W_1 = tf.get_variable("w_1", (self.input_size, self.hidden_size),
                              initializer=xavier_init)
        W_2 = tf.get_variable("w_2", (self.hidden_size, self.output_size),
                              initializer=xavier_init)

        b_1 = tf.get_variable("b_1", (self.hidden_size,),
                              initializer=tf.zeros_initializer)
        b_2 = tf.get_variable("b_2", (self.output_size,),
                              initializer=tf.zeros_initializer)

        value_after_input = tf.sigmoid(tf.matmul(self.x, W_1) + b_1, name='hidden_layer')

        self.value = tf.sigmoid(tf.matmul(value_after_input, W_2) + b_2, name='output_layer')

        # TODO: Alexander thinks that self.value will be computed twice (instead of once)
        difference_in_values = tf.reshape(tf.subtract(self.value_next, self.value, name='difference_in_values'), [])
        tf.summary.scalar("difference_in_values", tf.abs(difference_in_values))

        trainable_vars = tf.trainable_variables()
        gradients = tf.gradients(self.value, trainable_vars)

        apply_gradients = []

        global_step_op = self.global_step.assign_add(1)

        with tf.variable_scope('apply_gradients'):
            for gradient, trainable_var in zip(gradients, trainable_vars):
                backprop_calc = self.learning_rate * difference_in_values * gradient
                grad_apply = trainable_var.assign_add(backprop_calc)
                apply_gradients.append(grad_apply)

            with tf.control_dependencies([global_step_op]):
                self.training_op = tf.group(*apply_gradients, name='training_op')
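
        # The assignments above implement a TD(0)-style update: for each weight w,
        #
        #   w <- w + learning_rate * (V(s') - V(s)) * dV(s)/dw
        #
        # where V(s') is fed in through self.value_next and V(s) is the network output self.value.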

        self.saver = tf.train.Saver(max_to_keep=1)

    def eval_state(self, sess, state):
        return sess.run(self.value, feed_dict={self.x: state})
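
    # Note (an assumption based on the placeholder shape above): `state` should be the [1, input_size]
    # feature vector produced by self.board_trans_func, e.g. Board.board_features_quack_fat(board, player)
    # when the 'quack-fat' representation is configured.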

    def save_model(self, sess, episode_count, global_step):
        self.saver.save(sess, os.path.join(self.checkpoint_path, 'model.ckpt'), global_step=global_step)
        with open(os.path.join(self.checkpoint_path, "episodes_trained"), 'w+') as f:
            print("[NETWK] ({name}) Saving model to:".format(name=self.name),
                  os.path.join(self.checkpoint_path, 'model.ckpt'))
            f.write(str(episode_count) + "\n")

    def restore_model(self, sess):
        """
        Restore a model for a session, so that a trained model can either be trained further or
        used for evaluation.
        :param sess: Current session
        :return: Nothing. Restoring the model for the network is a side effect.
        """
        if glob.glob(os.path.join(self.checkpoint_path, 'model.ckpt*.index')):
            latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_path)
            print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
                  str(latest_checkpoint))
            self.saver.restore(sess, latest_checkpoint)

            variables_names = [v.name for v in tf.trainable_variables()]
            values = sess.run(variables_names)
            for k, v in zip(variables_names, values):
                print("Variable: ", k)
                print("Shape: ", v.shape)
                print(v)

            # Restore trained episode count for model
            episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
            if os.path.isfile(episode_count_path):
                with open(episode_count_path, 'r') as f:
                    self.config['start_episode'] = int(f.read())

        elif glob.glob(os.path.join(os.path.join(self.config['model_storage_path'], "baseline_model"), 'model.ckpt*.index')):
            checkpoint_path = os.path.join(self.config['model_storage_path'], "baseline_model")
            latest_checkpoint = tf.train.latest_checkpoint(checkpoint_path)
            print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
                  str(latest_checkpoint))
            self.saver.restore(sess, latest_checkpoint)

            variables_names = [v.name for v in tf.trainable_variables()]
            values = sess.run(variables_names)
            for k, v in zip(variables_names, values):
                print("Variable: ", k)
                print("Shape: ", v.shape)
                print(v)

        elif not self.config['force_creation']:
            print("You need to have baseline_model inside models")
            exit()

    def make_move(self, sess, board, roll, player):
        """
        Find the best move given a board, roll and player by generating all states one can go to
        and evaluating each of them with the network. The state with the highest score is picked
        for the 1-player, and the state with the highest (1 - score) is picked for the -1-player.
        :param sess:
        :param board: Current board
        :param roll: Current roll
        :param player: Current player
        :return: A pair of the best state to go to, together with the score of that state
        """
        legal_moves = Board.calculate_legal_states(board, player, roll)
        moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
        scores = [x[1] if np.sign(player) > 0 else 1 - x[1] for x in moves_and_scores]

        best_score_index = np.array(scores).argmax()
        best_move_pair = moves_and_scores[best_score_index]
        return best_move_pair
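
    # A minimal usage sketch (hypothetical variable names; mirrors how train_model and eval call this below):
    #
    #   roll = (random.randrange(1, 7), random.randrange(1, 7))
    #   best_board, best_score = network.make_move(sess, Board.initial_state, roll, 1)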

    def calculate_2_ply(self, sess, board, roll, player):
        """
        Find the best move based on a 2-ply look-ahead. First the moves are scored with a single ply, and then
        an exhaustive search is performed over the 15 best moves from that single ply.
        :param sess:
        :param board:
        :param roll: The original roll
        :param player: The current player
        :return: Best possible move based on 2-ply look-ahead
        """

        # find all legal states from the given board and the given roll
        init_legal_states = Board.calculate_legal_states(board, player, roll)

        # find all values for the above boards
        zero_ply_moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in init_legal_states]

        # list.reverse() works in place and can't be sliced with [:15] directly, so sort first and reverse afterwards.
        best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1))

        # The pairs are sorted from smallest to largest score, so we want to reverse if the current player is 1,
        # since player 1 wishes to maximize. It's not needed for player -1, since that player seeks to minimize.
        if player == 1:
            best_fifteen.reverse()

        best_fifteen_boards = [x[0] for x in best_fifteen[:15]]

        all_rolls_scores = self.do_ply(sess, best_fifteen_boards, player)

        best_score_index = np.array(all_rolls_scores).argmax()
        best_board = best_fifteen_boards[best_score_index]

        return [best_board, max(all_rolls_scores)]
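
    # With at most 15 candidate boards and 21 distinct rolls, a 2-ply decision makes do_ply below
    # expand and score the legal follow-up states for roughly 15 * 21 board/roll combinations.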

    def n_ply(self, n_init, sess, boards_init, player_init):

        def ply(n, boards, player):

            def calculate_possible_states(board):
                possible_rolls = [ (1, 1), (1, 2), (1, 3), (1, 4), (1, 5),
                                   (1, 6), (2, 2), (2, 3), (2, 4), (2, 5),
                                   (2, 6), (3, 3), (3, 4), (3, 5), (3, 6),
                                   (4, 4), (4, 5), (4, 6), (5, 5), (5, 6),
                                   (6, 6) ]
                return [ Board.calculate_legal_states(board, player, roll)
                         for roll
                         in possible_rolls ]

            def find_best_state_score(boards):
                score_pairs = [ (board, self.eval_state(sess, self.board_trans_func(board, player)))
                                for board
                                in boards ]
                scores = [ pair[1]
                           for pair
                           in score_pairs ]
                best_score_pair = score_pairs[np.array(scores).argmax()]
                return best_score_pair

            def average_score(boards):
                return sum(boards) / len(boards)

            def average_ply_score(board):
                states_for_rolls = calculate_possible_states(board)
                best_state_score_for_each_roll = [
                    find_best_state_score(states)
                    for states
                    in states_for_rolls ]
                best_score_for_each_roll = [ x[1]
                                             for x
                                             in best_state_score_for_each_roll ]
                average_score_var = average_score(best_score_for_each_roll)
                return average_score_var

            if n == 1:
                # base case: score each board by the average of its best scores over all rolls
                average_score_pairs = [ (board, average_ply_score(board))
                                        for board
                                        in boards ]
                return average_score_pairs
            elif n > 1:

                def average_for_score_pairs(score_pairs):
                    scores = [ pair[1]
                               for pair
                               in score_pairs ]
                    return sum(scores) / len(scores)

                def average_plain(scores):
                    return sum(scores) / len(scores)

                print("+" * 20)
                print(n)
                print(type(boards))
                print(boards)

                possible_states_for_boards = [
                    (board, calculate_possible_states(board))
                    for board
                    in boards ]

                average_score_pairs = [
                    (inner_boards[0], average_plain([ average_for_score_pairs(ply(n - 1, inner_board, player * -1))
                                                      for inner_board
                                                      in inner_boards[1] ]))
                    for inner_boards
                    in possible_states_for_boards ]
                return average_score_pairs
            else:
                assert False

        if n_init < 1:
            print("Unexpected argument n = {}".format(n_init))
            exit()

        boards_with_scores = ply(n_init, boards_init, -1 * player_init)
        print(boards_with_scores)

        scores = [ ( pair[1] if player_init == 1 else (1 - pair[1]) )
                   for pair
                   in boards_with_scores ]
        best_score_pair = boards_with_scores[np.array(scores).argmax()]
        return best_score_pair[0]
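
    # A hypothetical call (variable names are illustrative): pick the best of a set of candidate boards
    # with a 2-ply deep look-ahead for player 1:
    #
    #   best_board = network.n_ply(2, sess, candidate_boards, 1)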

    def do_ply(self, sess, boards, player):
        """
        Calculates a single extra ply, resulting in a larger search space for our best move.
        This is somewhat hardcoded to only do a single ply, since it calls max on all scores rather than
        allowing the function to search deeper, which could result in an even larger search space. If we
        wish to have more than 2-ply, this should be fixed so the method can be extended to 3-ply.
        :param sess:
        :param boards: The boards to try all rolls on
        :param player: The player of the previous ply
        :return: An array of scores where each index describes one of the boards which was given as a
                 parameter to this function.
        """

        def gen_21_rolls():
            """
            Calculate all distinct rolls, [[1, 1], [1, 2], ...]
            :return: All 21 distinct rolls
            """
            a = []
            for x in range(1, 7):
                for y in range(1, 7):
                    if [x, y] not in a and [y, x] not in a:
                        a.append([x, y])

            return a
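
        # An equivalent standard-library sketch (not what the code above does, just an illustration):
        #
        #   import itertools
        #   all_rolls = [list(r) for r in itertools.combinations_with_replacement(range(1, 7), 2)]
        #
        # which also yields the 21 distinct (unordered) rolls.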

        all_rolls = gen_21_rolls()

        all_rolls_scores = []

        # loop over boards
        for a_board in boards:
            a_board_scores = []

            # loop over all rolls, for each board
            for roll in all_rolls:

                # find all states we can get to, given the board, the roll and the opposite player
                all_rolls_boards = Board.calculate_legal_states(a_board, player * -1, roll)

                # find scores for each board found above
                spec_roll_scores = [self.eval_state(sess, self.board_trans_func(new_board, player * -1))
                                    for new_board in all_rolls_boards]

                # if the original player is the -1 player, then we need to find (1 - value)
                spec_roll_scores = [x if player == 1 else (1 - x) for x in spec_roll_scores]

                # find the best score
                best_score = max(spec_roll_scores)

                # append the best score to a_board_scores, where we keep track of the best score for each roll
                a_board_scores.append(best_score)

            # save the expected average of board scores
            all_rolls_scores.append(sum(a_board_scores) / len(a_board_scores))

        # return all the average scores
        return all_rolls_scores

    def eval(self, episode_count, trained_eps=0, tf_session=None):
        """
        Used to evaluate a model. Can play against either pubeval, a benchmark model playing at an intermediate
        level, or dumbeval, a model which has been given random weights, so it plays deterministically but
        essentially at random.
        :param episode_count: The number of episodes to run
        :param trained_eps: The number of episodes the model we want to evaluate has trained
        :param tf_session:
        :return: outcomes: The outcomes of the evaluation session
        """

        def do_eval(sess, method, episodes=1000, trained_eps=0):
            """
            Do the actual evaluation
            :param sess:
            :param method: Either pubeval or dumbeval
            :param episodes: Number of episodes to use in the evaluation
            :param trained_eps:
            :return: outcomes : Described above
            """

            start_time = time.time()

            def print_time_estimate(eps_completed):
                cur_time = time.time()
                time_diff = cur_time - start_time
                eps_per_sec = eps_completed / time_diff
                secs_per_ep = time_diff / eps_completed
                eps_remaining = (episodes - eps_completed)
                sys.stderr.write(
                    "[EVAL ] Averaging {per_sec} episodes per second\n".format(per_sec=round(eps_per_sec, 2)))
                sys.stderr.write(
                    "[EVAL ] {eps_remaining} episodes remaining; approx. {time_remaining} seconds remaining\n".format(
                        eps_remaining=eps_remaining, time_remaining=int(eps_remaining * secs_per_ep)))

            sys.stderr.write(
                "[EVAL ] Evaluating {eps} episode(s) with method '{method}'\n".format(eps=episodes, method=method))

            if method == 'pubeval':
                outcomes = []
                for i in range(1, episodes + 1):
                    sys.stderr.write("[EVAL ] Episode {}".format(i))
                    board = Board.initial_state
                    while Board.outcome(board) is None:
                        roll = (random.randrange(1, 7), random.randrange(1, 7))
                        board = (self.make_move(sess, board, roll, 1))[0]

                        roll = (random.randrange(1, 7), random.randrange(1, 7))
                        board = Eval.make_pubeval_move(board, -1, roll)[0][0:26]

                    sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1]))
                    outcomes.append(Board.outcome(board)[1])
                    sys.stderr.write("\n")

                    if i % 10 == 0:
                        print_time_estimate(i)

                return outcomes

            elif method == 'dumbeval':
                outcomes = []
                for i in range(1, episodes + 1):
                    sys.stderr.write("[EVAL ] Episode {}".format(i))
                    board = Board.initial_state
                    while Board.outcome(board) is None:
                        roll = (random.randrange(1, 7), random.randrange(1, 7))
                        board = (self.make_move(sess, board, roll, 1))[0]

                        roll = (random.randrange(1, 7), random.randrange(1, 7))
                        board = Eval.make_dumbeval_move(board, -1, roll)[0][0:26]

                    sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1]))
                    outcomes.append(Board.outcome(board)[1])
                    sys.stderr.write("\n")

                    if i % 10 == 0:
                        print_time_estimate(i)

                return outcomes

            else:
                sys.stderr.write("[EVAL ] Evaluation method '{}' is not defined\n".format(method))
                return [0]

        if tf_session is None:
            with tf.Session() as session:
                session.run(tf.global_variables_initializer())
                self.restore_model(session)
                outcomes = [ (method, do_eval(session,
                                              method,
                                              episode_count,
                                              trained_eps=trained_eps))
                             for method
                             in self.config['eval_methods'] ]
                return outcomes
        else:
            outcomes = [ (method, do_eval(tf_session,
                                          method,
                                          episode_count,
                                          trained_eps=trained_eps))
                         for method
                         in self.config['eval_methods'] ]
            return outcomes

    def train_model(self, episodes=1000, save_step_size=100, trained_eps=0):
        with tf.Session() as sess:
            writer = tf.summary.FileWriter("/tmp/log/tf", sess.graph)

            sess.run(tf.global_variables_initializer())
            self.restore_model(sess)

            variables_names = [v.name for v in tf.trainable_variables()]
            values = sess.run(variables_names)
            for k, v in zip(variables_names, values):
                print("Variable: ", k)
                print("Shape: ", v.shape)
                print(v)

            start_time = time.time()

            def print_time_estimate(eps_completed):
                cur_time = time.time()
                time_diff = cur_time - start_time
                eps_per_sec = eps_completed / time_diff
                secs_per_ep = time_diff / eps_completed
                eps_remaining = (episodes - eps_completed)
                sys.stderr.write(
                    "[TRAIN] Averaging {per_sec} episodes per second\n".format(per_sec=round(eps_per_sec, 2)))
                sys.stderr.write(
                    "[TRAIN] {eps_remaining} episodes remaining; approx. {time_remaining} seconds remaining\n".format(
                        eps_remaining=eps_remaining, time_remaining=int(eps_remaining * secs_per_ep)))

            sys.stderr.write("[TRAIN] Training {} episodes and save_step_size {}\n".format(episodes, save_step_size))

            outcomes = []
            for episode in range(1, episodes + 1):
                sys.stderr.write("[TRAIN] Episode {}".format(episode + trained_eps))

                # TODO decide which player should be here
                player = 1
                prev_board = Board.initial_state

                i = 0
                while Board.outcome(prev_board) is None:
                    i += 1

                    cur_board, cur_board_value = self.make_move(sess,
                                                                prev_board,
                                                                (random.randrange(1, 7), random.randrange(1, 7)),
                                                                player)

                    # adjust weights
                    sess.run(self.training_op,
                             feed_dict={self.x: self.board_trans_func(prev_board, player),
                                        self.value_next: cur_board_value})

                    player *= -1

                    prev_board = cur_board

                final_board = prev_board
                sys.stderr.write("\t outcome {}\t turns {}".format(Board.outcome(final_board)[1], i))
                outcomes.append(Board.outcome(final_board)[1])

                final_score = np.array([Board.outcome(final_board)[1]])
                scaled_final_score = ((final_score + 2) / 4)
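
                # Scaling assumption: outcome scores are taken to lie in [-2, 2] (e.g. gammons counting double),
                # so (score + 2) / 4 maps the final reward into the sigmoid output range [0, 1].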

                with tf.name_scope("final"):
                    merged = tf.summary.merge_all()
                    global_step, summary, _ = sess.run([self.global_step, merged, self.training_op],
                                                       feed_dict={self.x: self.board_trans_func(prev_board, player),
                                                                  self.value_next: scaled_final_score.reshape((1, 1))})
                    writer.add_summary(summary, episode + trained_eps)

                sys.stderr.write("\n")

                if episode % min(save_step_size, episodes) == 0:
                    sys.stderr.write("[TRAIN] Saving model...\n")
                    self.save_model(sess, episode + trained_eps, global_step)

                if episode % 50 == 0:
                    print_time_estimate(episode)

            sys.stderr.write("[TRAIN] Saving model for final episode...\n")
            self.save_model(sess, episode + trained_eps, global_step)

            writer.close()

            return outcomes
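
# A rough usage sketch (assuming a config dict with the keys referenced above, e.g. 'model',
# 'model_storage_path', 'board_representation', 'eval_methods', 'force_creation'):
#
#   network = Network(config, "p1")
#   training_outcomes = network.train_model(episodes=1000)
#   eval_outcomes = network.eval(episode_count=250)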