Getting closer to functionality. We're capable of evaluating moves,
and a rework of global_step has begun: we now use episode_count to
calculate the learning-rate decay, which has been implemented as an
exp_decay function.
Alexander Munch-Hansen 2018-05-09 22:22:12 +02:00
parent 9a2d87516e
commit cb7e7b519c
4 changed files with 82 additions and 51 deletions
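
For context, the learning-rate schedule this commit moves to is a staircase exponential decay driven by the number of trained episodes instead of a TensorFlow global step. A minimal standalone sketch of that schedule, using the same names as the diff (the clamp against min_learning_rate mirrors the tf.maximum call in do_backprop):

def exp_decay(max_lr, epi_counter, decay_rate, decay_steps):
    # Staircase decay: the rate drops by a factor of decay_rate
    # once every decay_steps episodes (integer division makes the steps).
    return max_lr * decay_rate ** (epi_counter // decay_steps)

# With max_lr=0.1, decay_rate=0.96, decay_steps=50000 the rate stays at 0.1
# for the first 50000 episodes, then becomes 0.1*0.96, then 0.1*0.96**2, ...
learning_rate = max(0.001, exp_decay(0.1, 120000, 0.96, 50000))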

View File

@@ -43,7 +43,10 @@ class Network:
         self.hidden_size = 40
         self.max_learning_rate = 0.1
         self.min_learning_rate = 0.001
+        self.global_step = "lol"
+        self.global_step = tf.train.get_or_create_global_step()
+        #tf.train.get_or_create_global_step()
 
         # Restore trained episode count for model
         episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
         if os.path.isfile(episode_count_path):
@@ -62,47 +65,48 @@ class Network:
+    def exp_decay(self, max_lr, epi_counter, decay_rate, decay_steps):
+        res = max_lr * decay_rate**(epi_counter // decay_steps)
+        return res
+
     def do_backprop(self, prev_state, value_next):
         self.learning_rate = tf.maximum(self.min_learning_rate,
-                                        tf.train.exponential_decay(self.max_learning_rate,
-                                                                   self.global_step, 50000,
-                                                                   0.96,
-                                                                   staircase=True),
+                                        self.exp_decay(self.max_learning_rate, self.episodes_trained, 0.96, 50000),
                                         name="learning_rate")
+        # self.learning_rate = 0.1
+        print(tf.train.get_global_step())
 
         with tf.GradientTape() as tape:
-            value = self.model(np.array(input).reshape(1, -1))
+            value = self.model(prev_state.reshape(1,-1))
             grads = tape.gradient(value, self.model.variables)
 
             difference_in_values = tf.reshape(tf.subtract(value_next, value, name='difference_in_values'), [])
             tf.summary.scalar("difference_in_values", tf.abs(difference_in_values))
 
-            global_step_op = self.global_step.assign_add(1)
+            # global_step_op = self.global_step.assign_add(1)
 
             with tf.variable_scope('apply_gradients'):
                 for grad, train_var in zip(grads, self.model.variables):
                     backprop_calc = self.learning_rate * difference_in_values * grad
                     train_var.assign_add(backprop_calc)
+        print(self.episodes_trained)
 
     def eval_state(self, sess, state):
         return sess.run(self.value, feed_dict={self.x: state})
 
     def save_model(self, episode_count, global_step):
-        tfe.Saver(self.model.variables).save("./tmp_ckpt", global_step=global_step)
+        tfe.Saver(self.model.variables).save(os.path.join(self.checkpoint_path, 'model.ckpt'), global_step=self.global_step)
         #self.saver.save(sess, os.path.join(self.checkpoint_path, 'model.ckpt'), global_step=global_step)
-        #with open(os.path.join(self.checkpoint_path, "episodes_trained"), 'w+') as f:
-        #    print("[NETWK] ({name}) Saving model to:".format(name=self.name),
-        #          os.path.join(self.checkpoint_path, 'model.ckpt'))
-        #    f.write(str(episode_count) + "\n")
+        with open(os.path.join(self.checkpoint_path, "episodes_trained"), 'w+') as f:
+            print("[NETWK] ({name}) Saving model to:".format(name=self.name),
+                  os.path.join(self.checkpoint_path, 'model.ckpt'))
+            f.write(str(episode_count) + "\n")
 
     def calc_vals(self, states):
         values = self.model.predict_on_batch(states)
+        self.save_model(0, 432)
         return values
@@ -120,9 +124,9 @@ class Network:
             latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_path)
             print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
                   str(latest_checkpoint))
-            tfe.Saver(model.variables).restore(latest_checkpoint)
+            tfe.Saver(self.model.variables).restore(latest_checkpoint)
 
-            variables_names = [v.name for v in self.model.variables]
+            # variables_names = [v.name for v in self.model.variables]
 
             # Restore trained episode count for model
@@ -130,11 +134,11 @@ class Network:
             if os.path.isfile(episode_count_path):
                 with open(episode_count_path, 'r') as f:
                     self.config['start_episode'] = int(f.read())
-        else:
-            latest_checkpoint = tf.train.latest_checkpoint("./")
-            print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
-                  str(latest_checkpoint))
-            tfe.Saver(self.model.variables).restore(latest_checkpoint)
+        # else:
+        #     latest_checkpoint = tf.train.latest_checkpoint("./")
+        #     print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
+        #           str(latest_checkpoint))
+        #     tfe.Saver(self.model.variables).restore(latest_checkpoint)
 
         #variables_names = [v.name for v in self.model.variables]
@@ -143,9 +147,9 @@ class Network:
         #if os.path.isfile(episode_count_path):
         #    with open(episode_count_path, 'r') as f:
         #        self.config['start_episode'] = int(f.read())
 
-        tf.train.get_or_create_global_step()
 
-    def make_move(self, sess, board, roll, player):
+    def make_move(self, board, roll, player):
         """
         Find the best move given a board, roll and a player, by finding all possible states one can go to
         and then picking the best, by using the network to evaluate each state. The highest score is picked
@@ -157,12 +161,19 @@ class Network:
         :param player: Current player
         :return: A pair of the best state to go to, together with the score of that state
         """
-        legal_moves = Board.calculate_legal_states(board, player, roll)
-        moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
-        scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
-        best_score_index = np.array(scores).argmax()
-        best_move_pair = moves_and_scores[best_score_index]
-        return best_move_pair
+        legal_states = list(Board.calculate_legal_states(board, player, roll))
+        legal_states = [list(tmp) for tmp in legal_states]
+        legal_states = np.array([Board.board_features_quack_fat(tmp, player)[0] for tmp in legal_states])
+        legal_moves = [self.board_trans_func(board, player) for board in Board.calculate_legal_states(board, player, roll)]
+
+        scores = self.model.predict_on_batch(legal_states)
+        transformed_scores = [x if np.sign(player) > 0 else 1 - x for x in scores]
+
+        best_score_idx = np.argmax(np.array(transformed_scores))
+        best_move = legal_moves[best_score_idx]
+        best_score = scores[best_score_idx]
+
+        self.episodes_trained += 1
+
+        return [best_move, best_score]
 
     def make_move_n_ply(self, sess, board, roll, player, n = 1):
         best_pair = self.calc_n_ply(n, sess, board, player, roll)
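
The make_move rewrite above switches to batch evaluation: every legal follow-up state is featurized, scored in a single predict_on_batch call, and the scores are flipped for player -1 so that argmax always picks the move that is best from the mover's point of view. A rough standalone sketch of that selection step, assuming a Keras-style model with predict_on_batch and a featurize function playing the role of Board.board_features_quack_fat (both names here are stand-ins, not the project's API):

import numpy as np

def pick_best_state(model, featurize, legal_states, player):
    # Score all candidate states in one batched forward pass.
    features = np.array([featurize(state, player) for state in legal_states])
    scores = model.predict_on_batch(features).reshape(-1)

    # The network estimates the win probability for player +1,
    # so flip the scores when choosing on behalf of player -1.
    view = scores if np.sign(player) > 0 else 1 - scores

    best = int(np.argmax(view))
    return legal_states[best], scores[best]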

View File

@@ -9,7 +9,7 @@ from board import Board
 import main
 
 config = main.config.copy()
-config['model'] = "tesauro_blah"
+config['model'] = "eager_testings"
 config['force_creation'] = True
 config['board_representation'] = 'quack-fat'
 network = Network(config, config['model'])
@@ -75,10 +75,18 @@ def calculate_possible_states(board):
 #print(network.calculate_1_ply(session, Board.initial_state, [2,4], 1))
 
 board = network.board_trans_func(Board.initial_state, 1)
+#print(board)
 
-input = [0, 2, 0, 0, 0, 0, -5, 0, -3, 0, 0, 0, 5, -5, 0, 0, 0, 3, 0, 5, 0, 0, 0, 0, -2, 0, 0, 0, 1, 0]
-all_input = np.array([input for _ in range(20)])
-print(network.calc_vals(all_input))
+pair = network.make_move(Board.initial_state, [3,2], 1)
+print(pair[1])
+
+network.do_backprop(board, 0.9)
+network.save_model(2, 342)
+
+# all_input = np.array([input for _ in range(20)])
+# print(network.calc_vals(all_input))
 
 #print(" "*10 + "network_test")

View File

@@ -1,6 +1,7 @@
 import time
 import numpy as np
 import tensorflow as tf
+from board import Board
 import tensorflow.contrib.eager as tfe
@@ -23,12 +24,14 @@ model = tf.keras.Sequential([
 #tfe.Saver(model.variables).restore(tf.train.latest_checkpoint("./"))
 
-input = [0, 2, 0, 0, 0, 0, -5, 0, -3, 0, 0, 0, 5, -5, 0, 0, 0, 3, 0, 5, 0, 0, 0, 0, -2, 0, 0, 0, 1, 0]
+input = [0, 2, 0, 0, 0, 0, -5, 0, -3, 0, 0, 0, 5, -5, 0, 0, 0, 3, 0, 5, 0, 0, 0, 0, -2, 0]
 
-all_input = np.array([input for _ in range(20)])
-single_in = np.array(input).reshape(1,-1)
+all_input = np.array([Board.board_features_quack_fat(input, 1) for _ in range(20)])
+single_in = Board.board_features_quack_fat(input, 1)
 
 start = time.time()
@@ -48,10 +51,10 @@ print(time.time() - start)
 print("-"*30)
 
 with tf.GradientTape() as tape:
-    val = model(np.array(input).reshape(1,-1))
+    val = model(single_in)
     grads = tape.gradient(val, model.variables)
 
-    grads = [0.1*val-np.random.uniform(-1,1)+grad for grad, trainable_var in zip(grads, model.variables)]
+    # grads = [0.1*val-np.random.uniform(-1,1)+grad for grad, trainable_var in zip(grads, model.variables)]
 
     # print(model.variables[0][0])
 weights_before = model.weights[0]
@@ -60,14 +63,20 @@ start = time.time()
 #[trainable_var.assign_add(0.1*val-0.3+grad) for grad, trainable_var in zip(grads, model.variables)]
 
 start = time.time()
-#for gradient, trainable_var in zip(grads, model.variables):
-#    backprop_calc = 0.1 * (val - np.random.uniform(-1, 1)) * gradient
-#    trainable_var.assign_add(backprop_calc)
+for gradient, trainable_var in zip(grads, model.variables):
+    backprop_calc = 0.1 * (0.9 - val) * gradient
+    trainable_var.assign_add(backprop_calc)
 
-opt.apply_gradients(zip(grads, model.variables))
+# opt.apply_gradients(zip(grads, model.variables))
 
 print(time.time() - start)
 
-print(model(np.array(input).reshape(1,-1)))
+print(model(single_in))
 
-tfe.Saver(model.variables).save("./tmp_ckpt")
+vals = model.predict_on_batch(all_input)
+vals = list(vals)
+vals[3] = 4
+print(vals)
+print(np.argmax(np.array(vals)))
+
+# tfe.Saver(model.variables).save("./tmp_ckpt")
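
The now-uncommented loop above is the same manual TD-style update that Network.do_backprop performs: each trainable variable is nudged by learning_rate * (target - value) * gradient rather than going through an optimizer. A minimal eager-mode sketch of that update, written against the current tf.keras API instead of tf.contrib.eager (the 0.9 target and 0.1 learning rate are simply the constants used in the test above):

import numpy as np
import tensorflow as tf

def td_update(model, state, target_value=0.9, learning_rate=0.1):
    # Gradient of the scalar value estimate with respect to the weights.
    with tf.GradientTape() as tape:
        value = model(np.asarray(state, dtype=np.float32).reshape(1, -1))
    grads = tape.gradient(value, model.trainable_variables)

    # Manual TD(0)-style step: move each weight so the prediction
    # drifts toward the target, scaled by the prediction error.
    error = target_value - float(value)
    for grad, var in zip(grads, model.trainable_variables):
        var.assign_add(learning_rate * error * grad)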

View File

@@ -35,14 +35,15 @@ class Everything:
         trainable_vars = tf.trainable_variables()
         gradients = tf.gradients(self.value, trainable_vars)
 
+        difference_in_values = tf.reshape(tf.subtract(0.9, self.value, name='difference_in_values'), [])
 
         with tf.variable_scope('apply_gradients'):
             for gradient, trainable_var in zip(gradients, trainable_vars):
-                backprop_calc = self.learning_rate * difference_in_values * gradient
+                backprop_calc = 0.1 * difference_in_values * gradient
                 grad_apply = trainable_var.assign_add(backprop_calc)
                 apply_gradients.append(grad_apply)
 
-        with tf.control_dependencies([global_step_op]):
-            self.training_op = tf.group(*apply_gradients, name='training_op')
+        self.training_op = tf.group(*apply_gradients, name='training_op')
@@ -56,7 +57,9 @@ class Everything:
             val = sess.run(self.value, feed_dict={self.input: input.reshape(1,-1)})
             print(time.time() - start)
             print(val)
+            sess.run(self.training_op, feed_dict={self.input: input.reshape(1,-1)})
+            val = sess.run(self.value, feed_dict={self.input: input.reshape(1, -1)})
+            print(val)
 
 everything = Everything()
 everything.eval()