# backgammon/network.py

import glob
import os
import random
import sys
import time
from operator import itemgetter

import numpy as np
import tensorflow as tf

from board import Board
from cup import Cup
from eval import Eval


class Network:
    # board_features_quack has size 28
    # board_features_quack_fat has size 30
    # board_features_tesauro has size 198
    board_reps = {
        'quack-fat': (30, Board.board_features_quack_fat),
        'quack': (28, Board.board_features_quack),
        'tesauro': (198, Board.board_features_tesauro)
    }

    def custom_tanh(self, x, name=None):
        # Scales tanh from (-1, 1) to (-2, 2), matching the [-2, 2] range of
        # backgammon outcomes.
        return tf.scalar_mul(tf.constant(2.00), tf.tanh(x, name))

    def __init__(self, config, name):
        self.config = config
        self.checkpoint_path = os.path.join(config['model_storage_path'], config['model'])
        self.name = name

        # Set board representation from config
        self.input_size, self.board_trans_func = Network.board_reps[
            self.config['board_representation']
        ]
        self.output_size = 1
        self.hidden_size = 40
        self.max_learning_rate = 0.1
        self.min_learning_rate = 0.001

        self.global_step = tf.Variable(0, trainable=False, name="global_step")
        self.learning_rate = tf.maximum(self.min_learning_rate,
                                        tf.train.exponential_decay(self.max_learning_rate,
                                                                   self.global_step, 50000,
                                                                   0.96,
                                                                   staircase=True),
                                        name="learning_rate")
        # Restore trained episode count for model
        episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
        if os.path.isfile(episode_count_path):
            with open(episode_count_path, 'r') as f:
                self.episodes_trained = int(f.read())
        else:
            self.episodes_trained = 0

        self.x = tf.placeholder(tf.float32, [1, self.input_size], name='input')
        self.value_next = tf.placeholder(tf.float32, [1, self.output_size], name="value_next")

        xavier_init = tf.contrib.layers.xavier_initializer()
        W_1 = tf.get_variable("w_1", (self.input_size, self.hidden_size),
                              initializer=xavier_init)
        W_2 = tf.get_variable("w_2", (self.hidden_size, self.output_size),
                              initializer=xavier_init)
        b_1 = tf.get_variable("b_1", (self.hidden_size,),
                              initializer=tf.zeros_initializer)
        b_2 = tf.get_variable("b_2", (self.output_size,),
                              initializer=tf.zeros_initializer)

        value_after_input = tf.sigmoid(tf.matmul(self.x, W_1) + b_1, name='hidden_layer')
        self.value = tf.sigmoid(tf.matmul(value_after_input, W_2) + b_2, name='output_layer')

        # The temporal-difference error: the next state's value minus the
        # current value, reshaped from a (1, 1) tensor to a scalar.
        # TODO: Alexander thinks that self.value will be computed twice (instead of once)
        difference_in_values = tf.reshape(tf.subtract(self.value_next, self.value, name='difference_in_values'), [])
        tf.summary.scalar("difference_in_values", tf.abs(difference_in_values))

        trainable_vars = tf.trainable_variables()
        gradients = tf.gradients(self.value, trainable_vars)

        apply_gradients = []

        global_step_op = self.global_step.assign_add(1)

        with tf.variable_scope('apply_gradients'):
            for gradient, trainable_var in zip(gradients, trainable_vars):
                # TD(0) weight update: Δw_t = α(V_{t+1} - V_t)∇_w V_t
                backprop_calc = self.learning_rate * difference_in_values * gradient
                grad_apply = trainable_var.assign_add(backprop_calc)
                apply_gradients.append(grad_apply)

        with tf.control_dependencies([global_step_op]):
            self.training_op = tf.group(*apply_gradients, name='training_op')

        self.saver = tf.train.Saver(max_to_keep=1)

    def eval_state(self, sess, state):
        # `state` is fed straight into the (1, input_size) input placeholder,
        # so it must be the feature vector produced by self.board_trans_func.
        return sess.run(self.value, feed_dict={self.x: state})

    def save_model(self, sess, episode_count, global_step):
        print("[NETWK] ({name}) Saving model to:".format(name=self.name),
              os.path.join(self.checkpoint_path, 'model.ckpt'))
        self.saver.save(sess, os.path.join(self.checkpoint_path, 'model.ckpt'), global_step=global_step)
        with open(os.path.join(self.checkpoint_path, "episodes_trained"), 'w') as f:
            f.write(str(episode_count) + "\n")

    def restore_model(self, sess):
        if glob.glob(os.path.join(self.checkpoint_path, 'model.ckpt*.index')):
            latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_path)
            print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
                  str(latest_checkpoint))
            self.saver.restore(sess, latest_checkpoint)

            variables_names = [v.name for v in tf.trainable_variables()]
            values = sess.run(variables_names)
            for k, v in zip(variables_names, values):
                print("Variable: ", k)
                print("Shape: ", v.shape)
                print(v)

        # Restore trained episode count for model
        episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
        if os.path.isfile(episode_count_path):
            with open(episode_count_path, 'r') as f:
                self.config['start_episode'] = int(f.read())

    def make_move(self, sess, board, roll, player):
        # Pick the legal resulting state the network scores best for `player`.
        legal_moves = Board.calculate_legal_states(board, player, roll)
        moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
        # The net estimates player 1's (scaled) outcome, so player -1 flips it.
        scores = [x[1] if np.sign(player) > 0 else 1 - x[1] for x in moves_and_scores]
        best_score_index = np.array(scores).argmax()
        best_move_pair = moves_and_scores[best_score_index]
        return best_move_pair
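
    # For example (sketch): from the opening position with a roll of (3, 1),
    # make_move(sess, Board.initial_state, (3, 1), 1) returns the (board,
    # value) pair the network rates highest for player 1.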

    def gen_21_rolls(self):
        # Generate the 21 distinct dice rolls (unordered pairs of 1-6).
        a = []
        for x in range(1, 7):
            for y in range(1, 7):
                if [x, y] not in a and [y, x] not in a:
                    a.append([x, y])
        return a
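
    # Note: the 21 distinct rolls are not equally likely -- a double such as
    # [3, 3] has probability 1/36, while a mixed roll such as [2, 5] has 2/36.
    # calculate_2_ply below averages the 21 rolls uniformly, which slightly
    # over-weights doubles relative to the true expectation.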

    def calculate_2_ply(self, sess, board, roll, player):
        """
        Find the best move based on a 2-ply look-ahead. First the moves are scored with a single ply, and then an
        exhaustive search is performed on the best 15 boards from that ply.
        :param sess: Current TensorFlow session
        :param board: The current board
        :param roll: The original roll
        :param player: The current player
        :return: Best possible move based on 2-ply look-ahead
        """
        init_legal_states = Board.calculate_legal_states(board, player, roll)
        zero_ply_moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in init_legal_states]

        # Sort descending by score and keep the 15 best boards
        best_fifteen = sorted(zero_ply_moves_and_scores, key=itemgetter(1), reverse=True)
        best_fifteen_boards = [x[0] for x in best_fifteen[:15]]

        all_rolls = self.gen_21_rolls()

        all_rolls_scores = []
        for a_board in best_fifteen_boards:
            a_board_scores = []
            for opp_roll in all_rolls:
                opp_boards = Board.calculate_legal_states(a_board, player * -1, opp_roll)
                spec_roll_scores = [self.eval_state(sess, self.board_trans_func(new_board, player * -1))
                                    for new_board in opp_boards]
                best_score = max(spec_roll_scores)
                a_board_scores.append(best_score)
            all_rolls_scores.append(sum(a_board_scores) / len(a_board_scores))

        best_score_index = np.array(all_rolls_scores).argmax()
        best_board = best_fifteen_boards[best_score_index]

        return [best_board, max(all_rolls_scores)]

    def eval(self, episode_count, trained_eps=0, tf_session=None):
        def do_eval(sess, method, episodes=1000, trained_eps=0):
            start_time = time.time()

            def print_time_estimate(eps_completed):
                cur_time = time.time()
                time_diff = cur_time - start_time
                eps_per_sec = eps_completed / time_diff
                secs_per_ep = time_diff / eps_completed
                eps_remaining = (episodes - eps_completed)
                sys.stderr.write(
                    "[EVAL ] Averaging {per_sec} episodes per second\n".format(per_sec=round(eps_per_sec, 2)))
                sys.stderr.write(
                    "[EVAL ] {eps_remaining} episodes remaining; approx. {time_remaining} seconds remaining\n".format(
                        eps_remaining=eps_remaining, time_remaining=int(eps_remaining * secs_per_ep)))

            sys.stderr.write(
                "[EVAL ] Evaluating {eps} episode(s) with method '{method}'\n".format(eps=episodes, method=method))

            if method in ('pubeval', 'dumbeval'):
                # The two methods differ only in which static evaluator plays
                # the opponent's moves.
                opponent_move = Eval.make_pubeval_move if method == 'pubeval' else Eval.make_dumbeval_move
                outcomes = []
                for i in range(1, episodes + 1):
                    sys.stderr.write("[EVAL ] Episode {}".format(i))
                    board = Board.initial_state
                    while Board.outcome(board) is None:
                        roll = (random.randrange(1, 7), random.randrange(1, 7))
                        board = (self.make_move(sess, board, roll, 1))[0]

                        roll = (random.randrange(1, 7), random.randrange(1, 7))
                        board = opponent_move(board, -1, roll)[0][0:26]

                    sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1]))
                    outcomes.append(Board.outcome(board)[1])
                    sys.stderr.write("\n")

                    if i % 10 == 0:
                        print_time_estimate(i)

                return outcomes
            else:
                sys.stderr.write("[EVAL ] Evaluation method '{}' is not defined\n".format(method))
                return [0]

        if tf_session is None:
            with tf.Session() as session:
                session.run(tf.global_variables_initializer())
                self.restore_model(session)
                outcomes = [(method, do_eval(session,
                                             method,
                                             episode_count,
                                             trained_eps=trained_eps))
                            for method
                            in self.config['eval_methods']]
                return outcomes
        else:
            outcomes = [(method, do_eval(tf_session,
                                         method,
                                         episode_count,
                                         trained_eps=trained_eps))
                        for method
                        in self.config['eval_methods']]
            return outcomes

    def train_model(self, episodes=1000, save_step_size=100, trained_eps=0):
        with tf.Session() as sess:
            writer = tf.summary.FileWriter("/tmp/log/tf", sess.graph)

            sess.run(tf.global_variables_initializer())
            self.restore_model(sess)

            variables_names = [v.name for v in tf.trainable_variables()]
            values = sess.run(variables_names)
            for k, v in zip(variables_names, values):
                print("Variable: ", k)
                print("Shape: ", v.shape)
                print(v)

            start_time = time.time()

            def print_time_estimate(eps_completed):
                cur_time = time.time()
                time_diff = cur_time - start_time
                eps_per_sec = eps_completed / time_diff
                secs_per_ep = time_diff / eps_completed
                eps_remaining = (episodes - eps_completed)
                sys.stderr.write(
                    "[TRAIN] Averaging {per_sec} episodes per second\n".format(per_sec=round(eps_per_sec, 2)))
                sys.stderr.write(
                    "[TRAIN] {eps_remaining} episodes remaining; approx. {time_remaining} seconds remaining\n".format(
                        eps_remaining=eps_remaining, time_remaining=int(eps_remaining * secs_per_ep)))

            sys.stderr.write("[TRAIN] Training {} episodes and save_step_size {}\n".format(episodes, save_step_size))

            # Build the summary op once; calling tf.summary.merge_all() inside
            # the episode loop would add new ops to the graph every episode.
            with tf.name_scope("final"):
                merged = tf.summary.merge_all()

            outcomes = []
            for episode in range(1, episodes + 1):
                sys.stderr.write("[TRAIN] Episode {}".format(episode + trained_eps))
                # TODO decide which player should be here
                player = 1
                prev_board = Board.initial_state
                i = 0
                while Board.outcome(prev_board) is None:
                    i += 1

                    cur_board, cur_board_value = self.make_move(sess,
                                                                prev_board,
                                                                (random.randrange(1, 7), random.randrange(1, 7)),
                                                                player)

                    # adjust weights
                    sess.run(self.training_op,
                             feed_dict={self.x: self.board_trans_func(prev_board, player),
                                        self.value_next: cur_board_value})

                    player *= -1
                    prev_board = cur_board

                final_board = prev_board
                sys.stderr.write("\t outcome {}\t turns {}".format(Board.outcome(final_board)[1], i))
                outcomes.append(Board.outcome(final_board)[1])
                final_score = np.array([Board.outcome(final_board)[1]])
                # Map the outcome from [-2, 2] onto [0, 1] to match the
                # sigmoid output range.
                scaled_final_score = ((final_score + 2) / 4)

                # Run one last update against the actual outcome.
                global_step, summary, _ = sess.run([self.global_step, merged, self.training_op],
                                                   feed_dict={self.x: self.board_trans_func(prev_board, player),
                                                              self.value_next: scaled_final_score.reshape((1, 1))})

                writer.add_summary(summary, episode + trained_eps)

                sys.stderr.write("\n")

                if episode % min(save_step_size, episodes) == 0:
                    sys.stderr.write("[TRAIN] Saving model...\n")
                    self.save_model(sess, episode + trained_eps, global_step)

                if episode % 50 == 0:
                    print_time_estimate(episode)

            sys.stderr.write("[TRAIN] Saving model for final episode...\n")
            self.save_model(sess, episode + trained_eps, global_step)

            writer.close()

            return outcomes
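

if __name__ == '__main__':
    # Minimal usage sketch. The config keys below mirror the ones this class
    # reads ('model_storage_path', 'model', 'board_representation',
    # 'eval_methods'); the concrete values here are assumptions for
    # illustration, not defaults shipped with the project.
    config = {
        'model_storage_path': 'models',
        'model': 'default',
        'board_representation': 'quack-fat',
        'eval_methods': ['pubeval'],
    }
    network = Network(config, 'default')
    network.train_model(episodes=100, save_step_size=50)
    print(network.eval(episode_count=100))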