backgammon/network.py

126 lines
6.0 KiB
Python
Raw Normal View History

2018-03-04 16:35:36 +00:00
import tensorflow as tf
from cup import Cup
import numpy as np
from board import Board
import os
2018-03-08 15:27:16 +00:00
class Network:
    """Single-hidden-layer value network driven through a TensorFlow session.

    The graph maps a ``(1, input_size)`` input through one hidden layer of
    ``hidden_size`` units to a ``(1, output_size)`` value; both layers use
    the ``2 * tanh`` activation from :meth:`custom_tanh`.

    Training does not use a TensorFlow optimizer. Instead ``training_op``
    adds ``learning_rate * (value_next - value) * d(value)/d(w)`` to every
    trainable variable ``w`` (the update from Tesauro's TD formulation with
    the eligibility trace left out).
    """

    hidden_size = 40
    input_size = 26
    output_size = 1
    # TODO: Can't remember the best learning_rate, look this up.
    learning_rate = 0.1

    # TODO: Actually compile tensorflow properly. The stop-gap that was
    # considered is lowering the log level:
    #     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

    def custom_tanh(self, x, name=None):
        """Return ``2 * tanh(x)``, i.e. an activation with range (-2, 2).

        Args:
            x: Tensor the activation is applied to.
            name: Optional name passed to the inner ``tf.tanh`` op.

        Returns:
            The tensor ``2 * tanh(x)``.
        """
        # The scale has to be a constant. It used to be created with
        # ``tf.Variable(2.00, tf.float32)``, but the second positional
        # argument of ``tf.Variable`` is ``trainable``, so the scale ended up
        # as a trainable variable that the training op modified.
        scale = tf.constant(2.0, dtype=tf.float32)
        return tf.scalar_mul(scale, tf.tanh(x, name))

    def __init__(self, session, config=None):
        """Build the graph and initialise all variables in ``session``.

        Args:
            session: TensorFlow session used for every ``run`` call.
            config: Dict that must contain ``'model_path'`` (directory used
                for checkpoints). If ``<model_path>/episodes_trained``
                exists, ``config['start_episode']`` is set from it.

        Raises:
            TypeError: If ``config`` is ``None``.
            KeyError: If ``config`` has no ``'model_path'`` entry.
        """
        if config is None:
            # Subscripting None below would raise a TypeError anyway; fail
            # with a message that says what is actually missing.
            raise TypeError("config must be a dict containing 'model_path'")

        self.config = config
        self.session = session
        self.checkpoint_path = config['model_path']

        # Restore trained episode count for model.
        episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
        if os.path.isfile(episode_count_path):
            with open(episode_count_path, 'r') as f:
                self.config['start_episode'] = int(f.read())

        # Placeholders: the network input and the value of the next state.
        self.x = tf.placeholder('float', [1, Network.input_size], name='x')
        self.value_next = tf.placeholder('float', [1, Network.output_size], name="value_next")

        xavier_init = tf.contrib.layers.xavier_initializer()
        W_1 = tf.Variable(xavier_init((Network.input_size, Network.hidden_size)))
        W_2 = tf.Variable(xavier_init((Network.hidden_size, Network.output_size)))

        # NOTE(review): the biases are constant zero tensors, not variables,
        # so they are never trained. Left as-is because turning them into
        # variables would break restoring existing checkpoints.
        b_1 = tf.zeros(Network.hidden_size,)
        b_2 = tf.zeros(Network.output_size,)

        value_after_input = self.custom_tanh(tf.matmul(self.x, W_1) + b_1, name='hidden_layer')
        self.value = self.custom_tanh(tf.matmul(value_after_input, W_2) + b_2, name='output_layer')

        # tf.reduce_sum sums its input, so this yields a scalar difference
        # between the two values even if they become lists when the input
        # format changes.
        difference_in_values = tf.reduce_sum(self.value_next - self.value, name='difference')

        # Hand-built backprop: compute the gradient of the value with respect
        # to every trainable variable, scale it, and add it to the variable.
        # With the eligibility trace left out this is
        #     delta_w_t = alpha * (V_{t+1} - V_t) * grad_w(V_t)
        trainable_vars = tf.trainable_variables()
        gradients = tf.gradients(self.value, trainable_vars)
        apply_gradients = []
        with tf.variable_scope('apply_gradients'):
            for gradient, trainable_var in zip(gradients, trainable_vars):
                if gradient is None:
                    # tf.gradients returns None for variables that do not
                    # influence self.value; there is nothing to apply.
                    continue
                backprop_calc = Network.learning_rate * difference_in_values * gradient
                apply_gradients.append(trainable_var.assign_add(backprop_calc))

        # Group the per-variable updates into a single operation.
        self.training_op = tf.group(*apply_gradients, name='training_op')
        self.saver = tf.train.Saver(max_to_keep=1)
        self.session.run(tf.global_variables_initializer())

    def eval_state(self, state):
        """Run ``state`` through the network and return its value.

        Args:
            state: Board representation with ``input_size`` entries; it is
                reshaped to ``(1, input_size)`` before being fed.

        Returns:
            Whatever ``session.run`` returns for the ``value`` tensor.
        """
        state = np.array(state).reshape((1, Network.input_size))
        return self.session.run(self.value, feed_dict={self.x: state})

    def save_model(self, episode_count):
        """Save a checkpoint and record how many episodes were trained.

        Args:
            episode_count: Number written to ``episodes_trained`` in the
                checkpoint directory (followed by a newline).
        """
        # Make sure the target directory exists before anything is written.
        os.makedirs(self.checkpoint_path, exist_ok=True)
        self.saver.save(self.session, os.path.join(self.checkpoint_path, 'model.ckpt'))
        with open(os.path.join(self.checkpoint_path, "episodes_trained"), 'w') as f:
            f.write(str(episode_count) + "\n")

    def restore_model(self):
        """Restore the latest checkpoint from ``checkpoint_path``, if any.

        Does nothing when no checkpoint is found.
        """
        # checkpoint_path is a directory, so the previous os.path.isfile()
        # guard was never true for a valid checkpoint directory. Ask
        # TensorFlow for the latest checkpoint and only restore when one exists.
        latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_path)
        if latest_checkpoint is not None:
            self.saver.restore(self.session, latest_checkpoint)

    # NOTE: There is a circular dependency here; something needs to be rewritten.
    def train(self, board, v_next):
        """Apply one training step for ``board`` towards ``v_next``.

        Args:
            board: Board representation with ``input_size`` entries; it is
                reshaped to ``(1, input_size)``.
            v_next: Value of the following state; it is reshaped to
                ``(1, output_size)``.
        """
        board = np.array(board).reshape((1, Network.input_size))
        v_next = np.array(v_next).reshape((1, Network.output_size))
        self.session.run(self.training_op, feed_dict={self.x: board, self.value_next: v_next})

        # Sketch of the intended training loop:
        #
        #   while game isn't done:
        #       x_next = g.next_move()
        #       value_next = network.eval_state(x_next)
        #       self.session.run(self.training_op,
        #                        feed_dict={self.x: x, self.value_next: value_next})
        #       x = x_next
        #
        # - Take a turn, which finds the best state and picks it based on
        #   the current network.
        # - Save the current state.
        # - Run the training operation, which does the backprop based on the
        #   state after the turn and the state saved before it.
        # - Save the current state again, so backprop can continue from the
        #   "previous" turn.
        #
        # NOTE: We need a method that takes a single turn, or at least picks
        # the next best move, so we know how to evaluate according to
        # TD-learning. Right now the game just continues in a while loop with
        # nothing to stop it!