Getting closer to functionality. We're now capable of evaluating moves, and a rework of global_step has begun: episode_count is now used to calculate the exponentially decayed learning rate, which has been implemented as an exp_decay function.
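For context, exp_decay follows the same staircase schedule as tf.train.exponential_decay but is driven by the trained-episode counter instead of the TensorFlow global step. A minimal sketch (illustrative only; the constants are the ones that appear in the diff below):

    def exp_decay(max_lr, episode_count, decay_rate=0.96, decay_steps=50000):
        # the learning rate is multiplied by decay_rate once every decay_steps episodes
        return max_lr * decay_rate ** (episode_count // decay_steps)

    # e.g. exp_decay(0.1, 120000) == 0.1 * 0.96**2 ≈ 0.0922,
    # which is then floored at min_learning_rate via tf.maximum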
parent 9a2d87516e
commit cb7e7b519c
network.py (73 changed lines)
@@ -43,7 +43,10 @@ class Network:
         self.hidden_size = 40
         self.max_learning_rate = 0.1
         self.min_learning_rate = 0.001
-        self.global_step = "lol"
+
+        self.global_step = tf.train.get_or_create_global_step()
+
+        #tf.train.get_or_create_global_step()
         # Restore trained episode count for model
         episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
         if os.path.isfile(episode_count_path):
@@ -62,47 +65,48 @@ class Network:
 
 
+    def exp_decay(self, max_lr, epi_counter, decay_rate, decay_steps):
+        res = max_lr * decay_rate**(epi_counter // decay_steps)
+        return res
+
     def do_backprop(self, prev_state, value_next):
 
         self.learning_rate = tf.maximum(self.min_learning_rate,
-                                        tf.train.exponential_decay(self.max_learning_rate,
-                                                                   self.global_step, 50000,
-                                                                   0.96,
-                                                                   staircase=True),
-                                        name="learning_rate")
+                                        self.exp_decay(self.max_learning_rate, self.episodes_trained, 0.96, 50000),
+                                        name="learning_rate")
 
+        # self.learning_rate = 0.1
+        print(tf.train.get_global_step())
 
         with tf.GradientTape() as tape:
-            value = self.model(np.array(input).reshape(1, -1))
+            value = self.model(prev_state.reshape(1,-1))
         grads = tape.gradient(value, self.model.variables)
 
         difference_in_values = tf.reshape(tf.subtract(value_next, value, name='difference_in_values'), [])
         tf.summary.scalar("difference_in_values", tf.abs(difference_in_values))
 
-        global_step_op = self.global_step.assign_add(1)
+        # global_step_op = self.global_step.assign_add(1)
 
         with tf.variable_scope('apply_gradients'):
             for grad, train_var in zip(grads, self.model.variables):
                 backprop_calc = self.learning_rate * difference_in_values * grad
                 train_var.assign_add(backprop_calc)
 
+        print(self.episodes_trained)
 
     def eval_state(self, sess, state):
         return sess.run(self.value, feed_dict={self.x: state})
 
     def save_model(self, episode_count, global_step):
-        tfe.Saver(self.model.variables).save("./tmp_ckpt", global_step=global_step)
+        tfe.Saver(self.model.variables).save(os.path.join(self.checkpoint_path, 'model.ckpt'), global_step=self.global_step)
         #self.saver.save(sess, os.path.join(self.checkpoint_path, 'model.ckpt'), global_step=global_step)
-        #with open(os.path.join(self.checkpoint_path, "episodes_trained"), 'w+') as f:
-        # print("[NETWK] ({name}) Saving model to:".format(name=self.name),
-        # os.path.join(self.checkpoint_path, 'model.ckpt'))
-        # f.write(str(episode_count) + "\n")
+        with open(os.path.join(self.checkpoint_path, "episodes_trained"), 'w+') as f:
+            print("[NETWK] ({name}) Saving model to:".format(name=self.name),
+                  os.path.join(self.checkpoint_path, 'model.ckpt'))
+            f.write(str(episode_count) + "\n")
 
     def calc_vals(self, states):
         values = self.model.predict_on_batch(states)
-        self.save_model(0, 432)
         return values
 
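For reference, a minimal sketch of the eager-mode update pattern that do_backprop applies after this change (assuming eager execution is enabled and the model is a Keras network with a single scalar output; the helper name td_update is illustrative, not part of the commit):

    import tensorflow as tf

    def td_update(model, prev_state, value_next, learning_rate):
        # evaluate the previous state and record gradients of the value w.r.t. the weights
        with tf.GradientTape() as tape:
            value = model(prev_state.reshape(1, -1))
        grads = tape.gradient(value, model.variables)
        # scalar TD error: how far the current estimate is from the next value
        delta = tf.reshape(tf.subtract(value_next, value), [])
        # nudge each weight in the direction that moves value toward value_next
        for grad, var in zip(grads, model.variables):
            var.assign_add(learning_rate * delta * grad)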
|
@@ -120,9 +124,9 @@ class Network:
             latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_path)
             print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
                   str(latest_checkpoint))
-            tfe.Saver(model.variables).restore(latest_checkpoint)
+            tfe.Saver(self.model.variables).restore(latest_checkpoint)
 
-            variables_names = [v.name for v in self.model.variables]
+            # variables_names = [v.name for v in self.model.variables]
 
             # Restore trained episode count for model
@@ -130,11 +134,11 @@ class Network:
             if os.path.isfile(episode_count_path):
                 with open(episode_count_path, 'r') as f:
                     self.config['start_episode'] = int(f.read())
-        else:
-            latest_checkpoint = tf.train.latest_checkpoint("./")
-            print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
-                  str(latest_checkpoint))
-            tfe.Saver(self.model.variables).restore(latest_checkpoint)
+        # else:
+        # latest_checkpoint = tf.train.latest_checkpoint("./")
+        # print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
+        # str(latest_checkpoint))
+        # tfe.Saver(self.model.variables).restore(latest_checkpoint)
 
             #variables_names = [v.name for v in self.model.variables]
 
@@ -143,9 +147,9 @@ class Network:
         #if os.path.isfile(episode_count_path):
         # with open(episode_count_path, 'r') as f:
         # self.config['start_episode'] = int(f.read())
+        tf.train.get_or_create_global_step()
 
-    def make_move(self, sess, board, roll, player):
+    def make_move(self, board, roll, player):
         """
         Find the best move given a board, roll and a player, by finding all possible states one can go to
         and then picking the best, by using the network to evaluate each state. The highest score is picked
@@ -157,12 +161,19 @@ class Network:
         :param player: Current player
         :return: A pair of the best state to go to, together with the score of that state
         """
-        legal_moves = Board.calculate_legal_states(board, player, roll)
-        moves_and_scores = [(move, self.eval_state(sess, self.board_trans_func(move, player))) for move in legal_moves]
-        scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
-        best_score_index = np.array(scores).argmax()
-        best_move_pair = moves_and_scores[best_score_index]
-        return best_move_pair
+        legal_states = list(Board.calculate_legal_states(board, player, roll))
+        legal_states = [list(tmp) for tmp in legal_states]
+        legal_states = np.array([Board.board_features_quack_fat(tmp, player)[0] for tmp in legal_states])
+        legal_moves = [self.board_trans_func(board, player) for board in Board.calculate_legal_states(board, player, roll)]
+
+        scores = self.model.predict_on_batch(legal_states)
+        transformed_scores = [x if np.sign(player) > 0 else 1 - x for x in scores]
+
+        best_score_idx = np.argmax(np.array(transformed_scores))
+        best_move = legal_moves[best_score_idx]
+        best_score = scores[best_score_idx]
+
+        self.episodes_trained += 1
+
+        return [best_move, best_score]
 
     def make_move_n_ply(self, sess, board, roll, player, n = 1):
         best_pair = self.calc_n_ply(n, sess, board, player, roll)
@@ -9,7 +9,7 @@ from board import Board
 import main
 
 config = main.config.copy()
-config['model'] = "tesauro_blah"
+config['model'] = "eager_testings"
 config['force_creation'] = True
 config['board_representation'] = 'quack-fat'
 network = Network(config, config['model'])
@@ -75,10 +75,18 @@ def calculate_possible_states(board):
 #print(network.calculate_1_ply(session, Board.initial_state, [2,4], 1))
 
 board = network.board_trans_func(Board.initial_state, 1)
+#print(board)
 
-input = [0, 2, 0, 0, 0, 0, -5, 0, -3, 0, 0, 0, 5, -5, 0, 0, 0, 3, 0, 5, 0, 0, 0, 0, -2, 0, 0, 0, 1, 0]
-all_input = np.array([input for _ in range(20)])
-print(network.calc_vals(all_input))
+pair = network.make_move(Board.initial_state, [3,2], 1)
+print(pair[1])
+
+network.do_backprop(board, 0.9)
+
+network.save_model(2, 342)
+
+# all_input = np.array([input for _ in range(20)])
+# print(network.calc_vals(all_input))
 
 #print(" "*10 + "network_test")
@@ -1,6 +1,7 @@
 import time
 import numpy as np
 import tensorflow as tf
+from board import Board
 import tensorflow.contrib.eager as tfe
 
 
@@ -23,12 +24,14 @@ model = tf.keras.Sequential([
 
 #tfe.Saver(model.variables).restore(tf.train.latest_checkpoint("./"))
 
-input = [0, 2, 0, 0, 0, 0, -5, 0, -3, 0, 0, 0, 5, -5, 0, 0, 0, 3, 0, 5, 0, 0, 0, 0, -2, 0, 0, 0, 1, 0]
+input = [0, 2, 0, 0, 0, 0, -5, 0, -3, 0, 0, 0, 5, -5, 0, 0, 0, 3, 0, 5, 0, 0, 0, 0, -2, 0]
 
-all_input = np.array([input for _ in range(20)])
+all_input = np.array([Board.board_features_quack_fat(input, 1) for _ in range(20)])
 
-single_in = np.array(input).reshape(1,-1)
+single_in = Board.board_features_quack_fat(input, 1)
 
 
 start = time.time()
@@ -48,10 +51,10 @@ print(time.time() - start)
 
 print("-"*30)
 with tf.GradientTape() as tape:
-    val = model(np.array(input).reshape(1,-1))
+    val = model(single_in)
 grads = tape.gradient(val, model.variables)
 
-grads = [0.1*val-np.random.uniform(-1,1)+grad for grad, trainable_var in zip(grads, model.variables)]
+# grads = [0.1*val-np.random.uniform(-1,1)+grad for grad, trainable_var in zip(grads, model.variables)]
 
 # print(model.variables[0][0])
 weights_before = model.weights[0]
@@ -60,14 +63,20 @@ start = time.time()
 #[trainable_var.assign_add(0.1*val-0.3+grad) for grad, trainable_var in zip(grads, model.variables)]
 
 start = time.time()
-#for gradient, trainable_var in zip(grads, model.variables):
-#    backprop_calc = 0.1 * (val - np.random.uniform(-1, 1)) * gradient
-#    trainable_var.assign_add(backprop_calc)
+for gradient, trainable_var in zip(grads, model.variables):
+    backprop_calc = 0.1 * (0.9 - val) * gradient
+    trainable_var.assign_add(backprop_calc)
 
-opt.apply_gradients(zip(grads, model.variables))
+# opt.apply_gradients(zip(grads, model.variables))
 
 print(time.time() - start)
 
-print(model(np.array(input).reshape(1,-1)))
+print(model(single_in))
 
-tfe.Saver(model.variables).save("./tmp_ckpt")
+vals = model.predict_on_batch(all_input)
+vals = list(vals)
+vals[3] = 4
+print(vals)
+print(np.argmax(np.array(vals)))
+
+# tfe.Saver(model.variables).save("./tmp_ckpt")
@@ -35,15 +35,16 @@ class Everything:
         trainable_vars = tf.trainable_variables()
         gradients = tf.gradients(self.value, trainable_vars)
 
+        difference_in_values = tf.reshape(tf.subtract(0.9, self.value, name='difference_in_values'), [])
+
         with tf.variable_scope('apply_gradients'):
             for gradient, trainable_var in zip(gradients, trainable_vars):
-                backprop_calc = self.learning_rate * difference_in_values * gradient
+                backprop_calc = 0.1 * difference_in_values * gradient
                 grad_apply = trainable_var.assign_add(backprop_calc)
                 apply_gradients.append(grad_apply)
 
-        with tf.control_dependencies([global_step_op]):
         self.training_op = tf.group(*apply_gradients, name='training_op')
 
@@ -56,7 +57,9 @@ class Everything:
         val = sess.run(self.value, feed_dict={self.input: input.reshape(1,-1)})
         print(time.time() - start)
         print(val)
+        sess.run(self.training_op, feed_dict={self.input: input.reshape(1,-1)})
+        val = sess.run(self.value, feed_dict={self.input: input.reshape(1, -1)})
+        print(val)
 
 everything = Everything()
 everything.eval()