Alexander Munch-Hansen
926a331df0
again and it is possible to play against the AI. There is no flag for this yet, so this has to be added.
672 lines
25 KiB
Python
import tensorflow as tf
import numpy as np
from board import Board
import os
import time
import sys
import random
from eval import Eval
import glob
from operator import itemgetter
import tensorflow.contrib.eager as tfe
from player import Player


class Network:
    # board_features_quack has size 28
    # board_features_quack_fat has size 30
    # board_features_tesauro has size 198

    board_reps = {
        'quack-fat'   : (30,  Board.board_features_quack_fat),
        'quack'       : (28,  Board.board_features_quack),
        'tesauro'     : (198, Board.board_features_tesauro),
        'quack-norm'  : (30,  Board.board_features_quack_norm),
        'tesauro-poop': (198, Board.board_features_tesauro_wrong)
    }

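    # Example (sketch): with config['board_representation'] == 'tesauro', the lookup in
    # __init__ below yields (198, Board.board_features_tesauro), i.e. the network's input
    # size and the function used to turn a board into a feature vector.
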
    def custom_tanh(self, x, name=None):
        return tf.scalar_mul(tf.constant(2.00), tf.tanh(x, name))

    def __init__(self, config, name):
        """
        :param config: Configuration dictionary (model name, storage path, ply, board representation, etc.)
        :param name: Name used to identify this network in log output
        """

        move_options = {
            '1': self.make_move_1_ply,
            '0': self.make_move_0_ply
        }

        tf.enable_eager_execution()

        xavier_init = tf.contrib.layers.xavier_initializer()

        self.config = config
        self.checkpoint_path = os.path.join(config['model_storage_path'], config['model'])

        self.name = name

        self.make_move = move_options[
            self.config['ply']
        ]

        # Set board representation from config
        self.input_size, self.board_trans_func = Network.board_reps[
            self.config['board_representation']
        ]
        self.output_size = 1
        self.hidden_size = 40
        self.max_learning_rate = 0.1
        self.min_learning_rate = 0.001

        # Restore trained episode count for model
        episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
        if os.path.isfile(episode_count_path):
            with open(episode_count_path, 'r') as f:
                self.episodes_trained = int(f.read())
        else:
            self.episodes_trained = 0

        global_step_path = os.path.join(self.checkpoint_path, "global_step")
        if os.path.isfile(global_step_path):
            with open(global_step_path, 'r') as f:
                self.global_step = int(f.read())
        else:
            self.global_step = 0

        self.model = tf.keras.Sequential([
            tf.keras.layers.Dense(40, activation="sigmoid", kernel_initializer=xavier_init,
                                  input_shape=(1, self.input_size)),
            tf.keras.layers.Dense(1, activation="sigmoid", kernel_initializer=xavier_init)
        ])

    def exp_decay(self, max_lr, global_step, decay_rate, decay_steps):
        """
        Calculates the exponential decay of a learning rate
        :param max_lr: The learning rate that the network starts at
        :param global_step: The global step
        :param decay_rate: The rate at which the learning rate should decay
        :param decay_steps: The number of steps between each decay
        :return: The result of the exponential decay applied to the learning rate
        """
        res = max_lr * decay_rate**(global_step // decay_steps)
        return res

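    # Worked example (sketch): with the values used in do_backprop (max_lr=0.1,
    # decay_rate=0.96, decay_steps=50000) the learning rate is 0.1 for the first
    # 50,000 global steps, 0.1 * 0.96 = 0.096 for the next 50,000, then
    # 0.1 * 0.96**2 ≈ 0.0922, until tf.maximum clips it at min_learning_rate.
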
    def do_backprop(self, prev_state, value_next):
        """
        Performs the temporal-difference backpropagation step on the model
        :param prev_state: The previous state of the game; its value is recalculated here
        :param value_next: The value of the current move
        :return: Nothing, the update is applied directly to the model of the network
        """
        self.learning_rate = tf.maximum(self.min_learning_rate,
                                        self.exp_decay(self.max_learning_rate, self.global_step, 0.96, 50000),
                                        name="learning_rate")

        with tf.GradientTape() as tape:
            value = self.model(prev_state.reshape(1, -1))
        grads = tape.gradient(value, self.model.variables)

        difference_in_values = tf.reshape(tf.subtract(value_next, value, name='difference_in_values'), [])
        tf.summary.scalar("difference_in_values", tf.abs(difference_in_values))

        with tf.variable_scope('apply_gradients'):
            for grad, train_var in zip(grads, self.model.variables):
                backprop_calc = self.learning_rate * difference_in_values * grad
                train_var.assign_add(backprop_calc)

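    # The update above is a TD(0)-style rule (sketch of the maths):
    #     w <- w + alpha * (V(s') - V(s)) * dV(s)/dw
    # where alpha is the decayed learning rate, V(s) is the network's value for the
    # previous state and V(s') is the value passed in as value_next.
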
    def print_variables(self):
        """
        Prints all the variables of the model
        :return: Nothing; the variables are printed as a side effect
        """
        variables = self.model.variables
        for k in variables:
            print(k)

    def eval_state(self, state):
        """
        Evaluates a single state
        :param state: The (already transformed) state to evaluate
        :return: The model's value for the given state
        """
        return self.model(state.reshape(1, -1))

    def save_model(self, episode_count):
        """
        Saves the model of the network; the global step is taken from self.global_step
        :param episode_count: The number of episodes the model has been trained for
        :return: Nothing; the model and its counters are written to the checkpoint path
        """
        tfe.Saver(self.model.variables).save(os.path.join(self.checkpoint_path, 'model.ckpt'))
        #self.saver.save(sess, os.path.join(self.checkpoint_path, 'model.ckpt'), global_step=global_step)

        with open(os.path.join(self.checkpoint_path, "episodes_trained"), 'w+') as f:
            print("[NETWK] ({name}) Saving model to:".format(name=self.name),
                  os.path.join(self.checkpoint_path, 'model.ckpt'))
            f.write(str(episode_count) + "\n")

        with open(os.path.join(self.checkpoint_path, "global_step"), 'w+') as f:
            print("[NETWK] ({name}) Saving global step to:".format(name=self.name),
                  os.path.join(self.checkpoint_path, 'model.ckpt'))
            f.write(str(self.global_step) + "\n")

        if self.config['verbose']:
            self.print_variables()

    def calc_vals(self, states):
        """
        Calculate a score for each state in states
        :param states: A batch of states. The states have to be transformed before being given to this function.
        :return: The model's score for each of the given states
        """
        values = self.model.predict_on_batch(states)
        return values

    def restore_model(self):
        """
        Restore a model, such that a trained model can either be trained further or
        used for evaluation

        :return: Nothing. It's a side-effect that a model gets restored for the network.
        """

        if glob.glob(os.path.join(self.checkpoint_path, 'model.ckpt*.index')):

            latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_path)
            print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
                  str(latest_checkpoint))
            tfe.Saver(self.model.variables).restore(latest_checkpoint)

            # variables_names = [v.name for v in self.model.variables]

            # Restore trained episode count for model
            episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
            if os.path.isfile(episode_count_path):
                with open(episode_count_path, 'r') as f:
                    self.config['start_episode'] = int(f.read())

            global_step_path = os.path.join(self.checkpoint_path, "global_step")
            if os.path.isfile(global_step_path):
                with open(global_step_path, 'r') as f:
                    self.config['global_step'] = int(f.read())

        if self.config['verbose']:
            self.print_variables()

    def make_move_0_ply(self, board, roll, player):
        """
        Find the best move given a board, a roll and a player, by generating all possible states one can go to
        and then picking the best, using the network to evaluate each state. This is 0-ply, i.e. no look-ahead.
        The state with the highest score is picked for the 1-player, and the state with the highest (1 - score)
        is picked for the -1-player.

        :param board: Current board
        :param roll: Current roll
        :param player: Current player
        :return: A pair of the best state to go to, together with the score of that state
        """
        legal_moves = list(Board.calculate_legal_states(board, player, roll))
        legal_states = np.array([self.board_trans_func(move, player)[0] for move in legal_moves])

        scores = self.model.predict_on_batch(legal_states)
        transformed_scores = [x if np.sign(player) > 0 else 1 - x for x in scores]

        best_score_idx = np.argmax(np.array(transformed_scores))
        best_move = legal_moves[best_score_idx]
        best_score = scores[best_score_idx]

        return [best_move, best_score]

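    # Usage sketch (the roll is made up for illustration):
    #   best_board, score = network.make_move_0_ply(Board.initial_state, (3, 4), 1)
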
    def make_move_1_ply(self, board, roll, player):
        """
        Return the best board and best score based on a 1-ply look-ahead.
        :param board: Current board
        :param roll: Current roll
        :param player: Current player
        :return: A pair of the best state to go to, together with the score of that state
        """
        # start = time.time()
        best_pair = self.calculate_1_ply(board, roll, player)
        # print(time.time() - start)
        return best_pair

    def calculate_1_ply(self, board, roll, player):
        """
        Find the best move based on a 1-ply look-ahead. First the 10 best moves are picked with a 0-ply
        evaluation, and then all opponent moves and scores are found for them. The expected score is then
        calculated for each of the boards kept from the 0-ply.
        :param board: Current board
        :param roll: The original roll
        :param player: The current player
        :return: Best possible move based on 1-ply look-ahead
        """

        # find all legal states from the given board and the given roll
        init_legal_states = Board.calculate_legal_states(board, player, roll)

        legal_states = np.array([self.board_trans_func(state, player)[0] for state in init_legal_states])

        scores = self.calc_vals(legal_states)
        scores = [score.numpy() for score in scores]

        moves_and_scores = list(zip(init_legal_states, scores))

        sorted_moves_and_scores = sorted(moves_and_scores, key=itemgetter(1), reverse=player==1)

        best_boards = [x[0] for x in sorted_moves_and_scores[:10]]

        scores, trans_scores = self.do_ply(best_boards, player)

        best_score_idx = np.array(trans_scores).argmax()

        return [best_boards[best_score_idx], scores[best_score_idx]]

    def do_ply(self, boards, player):
        """
        Calculates a single extra ply, resulting in a larger search space for our best move.
        This is somewhat hardcoded to only do a single ply, seeing that it reduces the resulting scores
        to a single mean per board, rather than allowing the function to search deeper, which could result
        in an even larger search space. If we wish to have more than 2-ply, this should be fixed, so we
        could extend this method to allow for 3-ply.

        :param boards: The boards to try all rolls on
        :param player: The player of the previous ply
        :return: An array of scores where each index describes one of the boards which was given as param
                 to this function.
        """

        def gen_21_rolls():
            """
            Calculate all possible rolls, [[1,1], [1,2] ..]
            :return: All possible rolls
            """
            a = []
            for x in range(1, 7):
                for y in range(1, 7):
                    if not [x, y] in a and not [y, x] in a:
                        a.append([x, y])

            return a

        all_rolls = gen_21_rolls()

        # start = time.time()

        list_of_moves = []

        # Prepping of data
        for idx, board in enumerate(boards):
            all_board_moves = []
            for roll in all_rolls:
                all_states = list(Board.calculate_legal_states(board, player*-1, roll))
                for state in all_states:
                    state = np.array(self.board_trans_func(state, player*-1)[0])
                    all_board_moves.append(state)
            list_of_moves.append(np.array(all_board_moves))

        # print(time.time() - start)
        # start = time.time()

        # Running data through networks
        all_scores = [self.model.predict_on_batch(board) for board in list_of_moves]
        scores_means = [tf.reduce_mean(score) for score in all_scores]

        transformed_means = [x if player == 1 else (1-x) for x in scores_means]

        # print(time.time() - start)
        return [scores_means, transformed_means]

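    # In other words, for every candidate board the ply above scores every state the
    # opponent could reach with any of the 21 distinct rolls and averages those scores
    # into a single value, which is used as an estimate of that board's expected value.
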
    def calc_n_ply(self, n_init, sess, board, player, roll):
        """
        Calculate the best move based on an n-ply look-ahead. The sess argument is a leftover from the
        session-based implementation and is not used by the eager model.
        :param n_init: Number of plies to look ahead
        :param sess: Unused (kept from the session-based implementation)
        :param board: Current board
        :param player: Current player
        :param roll: Current roll
        :return: The best move/score pair based on the n-ply look-ahead
        """

        # find all legal states from the given board and the given roll
        init_legal_states = Board.calculate_legal_states(board, player, roll)

        # find all values for the above boards
        zero_ply_moves_and_scores = [(move, self.eval_state(self.board_trans_func(move, player)))
                                     for move in init_legal_states]

        # sorted() is used here since list.reverse() works in place and the result could not be sliced directly
        sorted_moves_and_scores = sorted(zero_ply_moves_and_scores, key=itemgetter(1), reverse=player==1)

        best_boards = [x[0] for x in sorted_moves_and_scores[:10]]

        best_move_score_pair = self.n_ply(n_init, sess, best_boards, player)

        return best_move_score_pair

    def n_ply(self, n_init, sess, boards_init, player_init):
        """
        Recursive n-ply look-ahead over a set of candidate boards. Like calc_n_ply, the sess argument is a
        leftover from the session-based implementation and is not used by the eager model.
        :param n_init: Number of plies to look ahead
        :param sess: Unused (kept from the session-based implementation)
        :param boards_init: The candidate boards to evaluate
        :param player_init: The player of the previous ply
        :return: The best board/score pair found by the look-ahead
        """
        def ply(n, boards, player):
            def calculate_possible_states(board):
                possible_rolls = [ (1, 1), (1, 2), (1, 3), (1, 4), (1, 5),
                                   (1, 6), (2, 2), (2, 3), (2, 4), (2, 5),
                                   (2, 6), (3, 3), (3, 4), (3, 5), (3, 6),
                                   (4, 4), (4, 5), (4, 6), (5, 5), (5, 6),
                                   (6, 6) ]

                # for roll in possible_rolls:
                #     print(len(Board.calculate_legal_states(board, player, roll)))

                return [ Board.calculate_legal_states(board, player, roll)
                         for roll
                         in possible_rolls ]

            def find_best_state_score(boards):
                score_pairs = [ (board, self.eval_state(self.board_trans_func(board, player)))
                                for board
                                in boards ]
                scores = [ pair[1]
                           for pair
                           in score_pairs ]
                best_score_pair = score_pairs[np.array(scores).argmax()]

                return best_score_pair

            def average_score(boards):
                return sum(boards)/len(boards)

            def average_ply_score(board):
                states_for_rolls = calculate_possible_states(board)

                best_state_score_for_each_roll = [
                    find_best_state_score(states)
                    for states
                    in states_for_rolls ]
                best_score_for_each_roll = [ x[1]
                                             for x
                                             in best_state_score_for_each_roll ]

                average_score_var = average_score(best_score_for_each_roll)
                return average_score_var

            if n == 1:
                average_score_pairs = [ (board, average_ply_score(board))
                                        for board
                                        in boards ]
                return average_score_pairs
            elif n > 1:  # n != 1
                def average_for_score_pairs(score_pairs):
                    scores = [ pair[1]
                               for pair
                               in score_pairs ]
                    return sum(scores)/len(scores)

                def average_plain(scores):
                    return sum(scores)/len(scores)

                print("+"*20)
                print(n)
                print(type(boards))
                print(boards)
                possible_states_for_boards = [
                    (board, calculate_possible_states(board))
                    for board
                    in boards ]

                average_score_pairs = [
                    (inner_boards[0], average_plain([ average_for_score_pairs(ply(n - 1, inner_board, player * -1 if n == 1 else player))
                                                      for inner_board
                                                      in inner_boards[1] ]))
                    for inner_boards
                    in possible_states_for_boards ]

                return average_score_pairs

            else:
                assert False

        if n_init < 1: print("Unexpected argument n = {}".format(n_init)); exit()

        boards_with_scores = ply(n_init, boards_init, -1 * player_init)
        #print("Boards with scores:",boards_with_scores)
        scores = [ ( pair[1] if player_init == 1 else (1 - pair[1]) )
                   for pair
                   in boards_with_scores ]
        #print("All the scores:",scores)
        best_score_pair = boards_with_scores[np.array(scores).argmax()]
        return best_score_pair

    def eval(self, episode_count, trained_eps = 0):
        """
        Used to evaluate a model. Can either use pubeval, a model playing at an intermediate level, or dumbeval,
        a model which has been given random weights, so it acts deterministically random.

        :param episode_count: The number of episodes to run
        :param trained_eps: The number of episodes the model we want to evaluate has been trained for
        :return: outcomes: The outcomes of the evaluation session
        """

        def do_eval(method, episodes = 1000, trained_eps = 0):
            """
            Do the actual evaluation

            :param method: Either pubeval or dumbeval
            :param episodes: Number of episodes to use in the evaluation
            :param trained_eps: The number of episodes the evaluated model has been trained for
            :return: outcomes : Described above
            """

            start_time = time.time()

            def print_time_estimate(eps_completed):
                cur_time = time.time()
                time_diff = cur_time - start_time
                eps_per_sec = eps_completed / time_diff
                secs_per_ep = time_diff / eps_completed
                eps_remaining = (episodes - eps_completed)
                sys.stderr.write(
                    "[EVAL ] Averaging {per_sec} episodes per second\n".format(per_sec=round(eps_per_sec, 2)))
                sys.stderr.write(
                    "[EVAL ] {eps_remaining} episodes remaining; approx. {time_remaining} seconds remaining\n".format(
                        eps_remaining=eps_remaining, time_remaining=int(eps_remaining * secs_per_ep)))

            sys.stderr.write(
                "[EVAL ] Evaluating {eps} episode(s) with method '{method}'\n".format(eps=episodes, method=method))

            if method == 'pubeval':
                outcomes = []
                for i in range(1, episodes + 1):
                    sys.stderr.write("[EVAL ] Episode {}".format(i))
                    board = Board.initial_state
                    while Board.outcome(board) is None:
                        roll = (random.randrange(1, 7), random.randrange(1, 7))

                        board = (self.make_move(board, roll, 1))[0]

                        roll = (random.randrange(1, 7), random.randrange(1, 7))

                        board = Eval.make_pubeval_move(board, -1, roll)[0][0:26]

                    sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1]))
                    outcomes.append(Board.outcome(board)[1])
                    sys.stderr.write("\n")

                    if i % 10 == 0:
                        print_time_estimate(i)

                return outcomes

            elif method == 'dumbeval':
                outcomes = []
                for i in range(1, episodes + 1):
                    sys.stderr.write("[EVAL ] Episode {}".format(i))
                    board = Board.initial_state
                    while Board.outcome(board) is None:
                        roll = (random.randrange(1, 7), random.randrange(1, 7))

                        board = (self.make_move(board, roll, 1))[0]

                        roll = (random.randrange(1, 7), random.randrange(1, 7))

                        board = Eval.make_dumbeval_move(board, -1, roll)[0][0:26]

                    sys.stderr.write("\t outcome {}".format(Board.outcome(board)[1]))
                    outcomes.append(Board.outcome(board)[1])
                    sys.stderr.write("\n")

                    if i % 10 == 0:
                        print_time_estimate(i)

                return outcomes

            else:
                sys.stderr.write("[EVAL ] Evaluation method '{}' is not defined\n".format(method))
                return [0]

        outcomes = [ (method, do_eval(method,
                                      episode_count,
                                      trained_eps = trained_eps))
                     for method
                     in self.config['eval_methods'] ]
        return outcomes

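    # Usage sketch (assuming config['eval_methods'] contains e.g. ['pubeval', 'dumbeval']):
    #   results = network.eval(episode_count=500)
    #   # -> [('pubeval', [outcome, ...]), ('dumbeval', [outcome, ...])]
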
    def play_against_network(self):
        """Play a game in the terminal against the restored network; the human plays as the -1 player."""
        self.restore_model()
        human_player = Player(-1)
        cur_player = 1
        player = 1
        board = Board.initial_state
        i = 0
        while Board.outcome(board) is None:
            print(Board.pretty(board))
            roll = (random.randrange(1, 7), random.randrange(1, 7))
            print("Bot rolled:", roll)

            board, _ = self.make_move(board, roll, player)
            print(Board.pretty(board))
            roll = (random.randrange(1, 7), random.randrange(1, 7))
            print("You rolled:", roll)
            board = human_player.make_human_move(board, roll)
        print("DONE "*10)
        print(Board.pretty(board))

    def train_model(self, episodes=1000, save_step_size=100, trained_eps=0):
        """
        Train the model by playing episodes of self-play and applying TD backpropagation after every move.

        :param episodes: Number of episodes to train for
        :param save_step_size: Number of episodes between each save of the model
        :param trained_eps: Number of episodes the model has already been trained for
        :return: The outcomes of the played episodes and the accumulated difference in values
        """

        difference_in_vals = 0

        self.restore_model()

        start_time = time.time()

        def print_time_estimate(eps_completed):
            cur_time = time.time()
            time_diff = cur_time - start_time
            eps_per_sec = eps_completed / time_diff
            secs_per_ep = time_diff / eps_completed
            eps_remaining = (episodes - eps_completed)
            sys.stderr.write(
                "[TRAIN] Averaging {per_sec} episodes per second\n".format(per_sec=round(eps_per_sec, 2)))
            sys.stderr.write(
                "[TRAIN] {eps_remaining} episodes remaining; approx. {time_remaining} seconds remaining\n".format(
                    eps_remaining=eps_remaining, time_remaining=int(eps_remaining * secs_per_ep)))

        sys.stderr.write("[TRAIN] Training {} episodes and save_step_size {}\n".format(episodes, save_step_size))
        outcomes = []
        for episode in range(1, episodes + 1):

            sys.stderr.write("[TRAIN] Episode {}".format(episode + trained_eps))
            # TODO decide which player should be here

            player = 1
            prev_board = Board.initial_state
            i = 0
            while Board.outcome(prev_board) is None:
                i += 1
                self.global_step += 1

                cur_board, cur_board_value = self.make_move(prev_board,
                                                            (random.randrange(1, 7), random.randrange(1, 7)),
                                                            player)

                difference_in_vals += abs((cur_board_value - self.eval_state(self.board_trans_func(prev_board, player))))

                if self.config['verbose']:
                    print("Difference in values:", difference_in_vals)
                    print("Current board value :", cur_board_value)
                    print("Current board is :\n", cur_board)

                # adjust weights
                if Board.outcome(cur_board) is None:
                    self.do_backprop(self.board_trans_func(prev_board, player), cur_board_value)
                player *= -1

                prev_board = cur_board

            final_board = prev_board
            sys.stderr.write("\t outcome {}\t turns {}".format(Board.outcome(final_board)[1], i))
            outcomes.append(Board.outcome(final_board)[1])
            final_score = np.array([Board.outcome(final_board)[1]])
            scaled_final_score = ((final_score + 2) / 4)

            self.do_backprop(self.board_trans_func(prev_board, player), scaled_final_score.reshape(1, 1))

            sys.stderr.write("\n")

            if episode % min(save_step_size, episodes) == 0:
                sys.stderr.write("[TRAIN] Saving model...\n")
                self.save_model(episode + trained_eps)

            if episode % 50 == 0:
                print_time_estimate(episode)

        sys.stderr.write("[TRAIN] Saving model for final episode...\n")
        self.save_model(episode + trained_eps)

        return outcomes, difference_in_vals[0][0]
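
# Usage sketch (hypothetical config values; in this project the config is normally assembled
# elsewhere, e.g. by the command-line entry point, so the values below are only illustrative):
#   config = {
#       'model': 'default', 'model_storage_path': 'models', 'ply': '0',
#       'board_representation': 'quack-fat', 'verbose': False,
#       'eval_methods': ['pubeval'],
#   }
#   network = Network(config, 'p1')
#   outcomes, value_diff = network.train_model(episodes=200, save_step_size=100)
#   results = network.eval(episode_count=100)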