Potentially functioning network

2018-03-04 17:35:36 +01:00 · 2018-03-04 17:35:36 +01:00 · d3fe3c918c
commit d3fe3c918c
parent 275689c002
6 changed files with 291 additions and 13 deletions
--- a/board.py
+++ b/board.py
@ -216,12 +216,12 @@ class Board:
        temp = []
        for x in board:
            if x > 0:
-                temp.append(" {}".format(white(x)))
+                temp.append(" {}".format(x))
            elif x < 0:
-                temp.append("{}".format(black(x)))
+                temp.append("{}".format(x))
            else: temp.append("  ")
                
-        return """\033[0;47m
+        return """
  13  14  15  16  17  18               19  20  21  22  23  24
 +--------------------------------------------------------------------------+
 | {11}| {10}| {9}| {8}| {7}| {6}| bar -1: {24} | {5}| {4}| {3}| {2}| {1}| {0}| end -1: TODO|
@ -229,7 +229,7 @@ class Board:
 | {12}| {13}| {14}| {15}| {16}| {17}| bar  1: {25} | {18}| {19}| {20}| {21}| {22}| {23}| end  1: TODO|
 +--------------------------------------------------------------------------+
  12  11  10   9   8   7                6   5   4   3   2   1 
- \033[0m""".format(*temp)
+ """.format(*temp)

    @staticmethod
    def do_move(board, player, move):
--- a/bot.py
+++ b/bot.py
@ -1,4 +1,7 @@
 from cup import Cup
+import tensorflow as tf
+from network import Network
+import numpy as np
 from board import Board
 import random

@ -7,6 +10,11 @@ class Bot:
    def __init__(self, sym):
        self.cup = Cup()
        self.sym = sym
+        self.graph = tf.Graph()
+        with self.graph.as_default():
+            self.session = tf.Session()
+            self.network = Network(self.session)
+            

    def roll(self):
        print("{} rolled: ".format(self.sym))
@ -18,11 +26,23 @@ class Bot:
    def switch(self,cur):
        return -1 if cur == 1 else 1

+    def get_session(self):
+        """Return the TensorFlow session stored on this bot (``self.session``)."""
+        return self.session
+        
    def get_sym(self):
        return self.sym

+    def get_network(self):
+        """Return the ``Network`` instance stored on this bot (``self.network``)."""
+        return self.network
+
    def make_move(self, board, sym, roll):
+        """Pick the legal successor state the network values highest.
+
+        Returns a two-element list: the chosen state and the largest value
+        the network produced for any candidate.
+        """
        # print(Board.pretty(board))
        legal_moves = Board.calculate_legal_states(board, sym, roll)
+        candidates = list(legal_moves)
+        # Evaluate each candidate as a (1, 26) input row.
+        values = [
+            self.network.eval_state(np.array(state).reshape(1,26))
+            for state in candidates
+        ]
+        best_index = np.array(values).argmax()
+        print("Found the best state, being:", best_index)
+        return [candidates[best_index], max(values)]
        
-        return random.choice(list(legal_moves))
+#        return random.choice(list(legal_moves))
--- a/game.py
+++ b/game.py
@ -2,19 +2,73 @@ import time
 from human import Human
 from board import Board
 from bot import Bot
-from network import Network
+from restore_bot import Restore_bot
+import tensorflow as tf
+import numpy as np
+import random
+
 from cup import Cup

 class Game:
    def __init__(self):
        self.board = Board.initial_state
+#        self.session = tf.Session()
+#        self.restored_network = Network(self.session)
+#        self.network = Network(self.session)
+#        self.restored_network.restore_model()
+        
        self.p1 = Bot(1)
-        self.p2 = Bot(-1)
+        self.p2 = Restore_bot(-1)
        self.cup = Cup()

    def roll(self):
        return self.cup.roll()

+    def roll_and_find_best_for_bot(self):
+        """Roll the dice and let ``p1`` choose a move for that roll.
+
+        ``self.board`` is replaced by the first element of whatever
+        ``p1.make_move`` returns; the full return value of ``make_move`` is
+        handed back to the caller unchanged.
+        """
+        dice = self.roll()
+        bot = self.p1
+        result = bot.make_move(self.board, bot.get_sym(), dice)
+        self.board = result[0]
+        return result
+    
+    def next_round(self):
+        """Roll the dice, print the roll and let ``p2`` move.
+
+        Whatever ``p2.make_move`` returns becomes the new ``self.board`` and
+        is also returned.
+        """
+        dice = self.roll()
+        print(dice)
+        opponent = self.p2
+        new_board = opponent.make_move(self.board, opponent.get_sym(), dice)
+        self.board = new_board
+        return new_board
+
+    def board_state(self):
+        """Return the current value of ``self.board``."""
+        return self.board
+
+    def train_model(self, episodes=100):
+        """Train ``p1``'s network by playing ``episodes`` games against ``p2``.
+
+        Each episode restarts from ``Board.initial_state``. After every move
+        by ``p1`` the network is trained on the previous state with the value
+        returned alongside the chosen move as target. When the game is over,
+        the last state is trained towards the second element of
+        ``Board.outcome(self.board)``. The model is saved on every tenth
+        episode (0, 10, 20, ...), and the list of outcomes is printed at the
+        end.
+
+        Args:
+            episodes: number of games to play; defaults to 100.
+        """
+        network = self.p1.get_network()
+        outcomes = []
+        for episode in range(episodes):
+            self.board = Board.initial_state
+            x = self.board
+            while Board.outcome(self.board) is None:
+                x_next, v_next = self.roll_and_find_best_for_bot()
+                network.train(x, v_next)
+                x = x_next
+                # p1's move may have ended the game; p2 must not be asked to
+                # move on a finished board.
+                if Board.outcome(self.board) is not None:
+                    break
+                self.next_round()
+            result = Board.outcome(self.board)[1]
+            print("Outcome:", result)
+            outcomes.append(result)
+            # Final update: train the last state towards the actual outcome.
+            network.train(x, np.array([result]).reshape((1,1)))
+            print("trained an episode")
+            if episode % 10 == 0:
+                print("Saving ....")
+                network.save_model()
+        print(outcomes)
+                
+    
+    def next_round_test(self):
+        """Debug helper: print the board before and after one ``next_round`` call."""
+        separator = "--------------------------------"
+        print(self.board)
+        print()
+        self.next_round()
+        print(separator)
+        print(self.board)
+        print(separator)
+
+
    def play(self):
        count = 0
        while Board.outcome(self.board) == None:
@ -24,11 +78,11 @@ class Game:

            roll = self.roll()

-            #print("type of board: ", type(self.board))
+            print("type of board: ", type(self.board))
            print("Board:",self.board)
            print("{} rolled: {}".format(self.p1.get_sym(), roll))

-            self.board = self.p1.make_move(self.board, self.p1.get_sym(), roll)
+            self.board = (self.p1.make_move(self.board, self.p1.get_sym(), roll))[0]

            print(self.board)

@ -47,6 +101,16 @@ class Game:
            print_winner = "-1: Black " + str(Board.outcome(self.board))
        print("The winner is {}!".format(print_winner))
        print("Final board:",Board.pretty(self.board))
+        return count

+highest = 0
+
+#for i in range(100000):
+#    try:
 g = Game()
-g.play()
+g.train_model()
+#count = g.play()
+ #       highest = max(highest,count)
+ #   except KeyboardInterrupt:
+ #       break
+#print("\nHighest amount of turns is:",highest)
--- a/network.py
+++ b/network.py
@ -1,5 +1,124 @@
+import tensorflow as tf
+from cup import Cup
+import numpy as np
+from board import Board
+#from game import Game
+import os
+
+class Config():
+    """Hyperparameters and paths read by ``Network.__init__``."""
+    # Number of units in the hidden layer.
+    hidden_size = 40
+    # Width of the input placeholder; board states are reshaped to (1, 26).
+    input_size = 26
+    # Width of the network output.
+    output_size = 1
+    # TODO: the best learning rate is not known yet; look this up.
+    learning_rate = 0.3
+    # Prefix used when saving, and directory searched when restoring, checkpoints.
+    checkpoint_path = "/tmp/"
+
+    
 class Network:

+    # TODO: Actually compile tensorflow properly
+    os.environ["TF_CPP_MIN_LOG_LEVEL"]="2"
+    
+    def __init__(self, session):
+        """Build the value network and its training op, then initialise variables.
+
+        The graph consists of an input placeholder ``x`` of shape
+        ``[1, input_size]``, a sigmoid hidden layer, and an output
+        ``2 * tanh(...)``. Instead of a stock optimizer, the training op adds
+        ``learning_rate * (value_next - value) * gradient`` to every
+        trainable variable.
+
+        Args:
+            session: session used to run the initializer here and stored as
+                ``self.session`` for later use.
+        """
+        self.session = session
+        self.config = Config
+        input_size           = self.config.input_size
+        hidden_size          = self.config.hidden_size
+        output_size          = self.config.output_size
+        learning_rate        = self.config.learning_rate
+        self.checkpoint_path = self.config.checkpoint_path
+
+        # input = x
+        self.x = tf.placeholder('float', [1,input_size], name='x')
+        # Target value for the update; fed by train().
+        self.value_next = tf.placeholder('float', [1,output_size], name="value_next")
+
+        xavier_init = tf.contrib.layers.xavier_initializer()
+        
+        W_1 = tf.Variable(xavier_init((input_size, hidden_size)))
+        W_2 = tf.Variable(xavier_init((hidden_size, output_size)))
+
+        # NOTE(review): the biases are created with tf.zeros rather than
+        # tf.Variable, so presumably they are not part of
+        # tf.trainable_variables() and are never updated -- confirm this is
+        # intended.
+        b_1 = tf.zeros(hidden_size,)
+        b_2 = tf.zeros(output_size,)
+
+        value_after_input = tf.sigmoid(tf.matmul(self.x, W_1) + b_1, name='hidden_layer')
+
+        # TODO: Remember to make this tanh * 2
+      #  self.value = tf.layers.dense(input=value_after_input, units=hidden_size, \
+      #                               activation=self.custom_tanh, kernel_initializer=xavier_init())
+        # Output is 2 * tanh(...), i.e. it lies in (-2, 2).
+        self.value = 2*tf.nn.tanh(tf.matmul(value_after_input, W_2) + b_2, name='output_layer')
+
+        # tf.reduce_sum basically finds the sum of its input, so this gives the difference between the two values, in case they should be lists, which they might be if our input changes
+        difference_in_values = tf.reduce_sum(self.value_next - self.value, name='difference')
+        
+        trainable_vars = tf.trainable_variables()
+        gradients = tf.gradients(self.value, trainable_vars)
+
+        # One assign_add op per trainable variable; grouped below.
+        apply_gradients = []
+        
+        with tf.variable_scope('apply_gradients'):
+            for gradient, trainable_var in zip(gradients, trainable_vars):
+                # Hopefully this is Δw_t = α(V_t+1 - V_t)▿_wV_t.
+                backprop_calc = learning_rate * difference_in_values * gradient
+                grad_apply = trainable_var.assign_add(backprop_calc)
+                apply_gradients.append(grad_apply)
+            
+            # Running training_op applies all of the updates above.
+            self.training_op = tf.group(*apply_gradients, name='training_op')
+            
+        self.saver = tf.train.Saver(max_to_keep=1)
+        self.session.run(tf.global_variables_initializer())
+
+        
    def eval_state(self, state):
        # Run state through a network
-        print("Network is evaluating")
+
+        # Remember to create placeholders for everything because wtf tensorflow and graphs
+
+        # Remember to create the dense layers
+
+        # Figure out a way of giving a layer a custom activation function (we want something which gives [-2,2]). Naively tanh*2, however I feel this is wrong.
+
+        # tf.group groups a bunch of actions. So: calculate the gradients for the different weights by using tf.trainable_variables() to find all variables and tf.gradients(current_value, trainable_variables) to find all the gradients. We can then loop through these and calculate the trace for each gradient/variable pair (note: zip can be used to combine the two lists found before), and then calculate the overall change in weights based on the formula listed in Tesauro (learning_rate * difference_in_values * trace). This calculation can be assigned to a tf variable and put in a list, and the list can then be grouped into a single operation, essentially building our own backprop function.
+        # Grouping them is done by tf.group(*the_gradients_from_before_we_want_to_apply, name="training_op")
+
+        # If we leave out the eligibility trace to begin with, we only have
+        # to implement learning_rate * (difference_in_values) * gradients (the aforementioned calculation).
+
+        
+      #  print("Network is evaluating")
+        val = self.session.run(self.value, feed_dict={self.x: state})
+
+        return val
+
+
+
+    def save_model(self):
+        """Save the session's variables to ``model.ckpt`` under ``self.checkpoint_path``.
+
+        The path is built with ``os.path.join`` so the checkpoint lands inside
+        the directory whether or not ``checkpoint_path`` ends with a path
+        separator.
+        """
+        self.saver.save(self.session, os.path.join(self.checkpoint_path, 'model.ckpt'))
+    
+    def restore_model(self):
+        """Restore variables from the newest checkpoint in ``self.checkpoint_path``.
+
+        Raises:
+            ValueError: if no checkpoint is found in that directory.
+        """
+        latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_path)
+        if latest_checkpoint is None:
+            # latest_checkpoint returns None when nothing has been saved yet;
+            # fail here with a message that names the directory.
+            raise ValueError(
+                "No checkpoint found in {!r}; train and save a model first".format(
+                    self.checkpoint_path))
+        self.saver.restore(self.session, latest_checkpoint)
+
+    # TODO: There is a circular dependency here (see the commented-out `from game import Game` import); something needs to be restructured.
+    def train(self, x, v_next):
+        """Run one training step for state ``x`` towards target ``v_next``.
+
+        Args:
+            x: board state; anything ``np.array`` can convert that holds
+                exactly ``config.input_size`` values.
+            v_next: target fed to the ``value_next`` placeholder.
+        """
+        # Use the configured input width instead of a hard-coded 26 so the
+        # reshape stays in step with the placeholder built in __init__.
+        x = np.array(x).reshape((1, self.config.input_size))
+        self.session.run(self.training_op, feed_dict={self.x: x, self.value_next: v_next})
+                
+
+            # while game isn't done:
+                #x_next = g.next_move()
+                #value_next = network.eval_state(x_next)
+                #self.session.run(self.training_op, feed_dict={self.x: x, self.value_next: value_next})
+                #x = x_next
+
+
+
+
+            
+                # take turn, which finds the best state and picks it, based on the current network
+                # save current state
+                # run the training operation (session.run(self.training_op, {x: x, value_next: value_next})), i.e. something which does the backprop based on the state after having taken a turn (found before) and the state we saved in the beginning; from now on we'll save it at the end of the turn
+                # save the current state again, so we can continue running backprop based on the "previous" turn.
+
+        # NOTE: We need to make a method so that we can take a single turn, or at least just pick the next best move, so we know how to evaluate according to TD-learning. Right now, our game just continues in a while loop with nothing to stop it!
+        
--- a/network_test.py
+++ b/network_test.py
@ -0,0 +1,32 @@
+from network import Network
+import tensorflow as tf
+import random
+import numpy as np
+
+# Manual check: evaluate the same board with a freshly initialised network
+# and with a network restored from the latest checkpoint, and print both.
+session = tf.Session()
+# Separate graph for the restored network.
+graph_lol = tf.Graph()
+
+
+
+# Network.__init__ runs the global variables initializer, so this network
+# starts from freshly initialised weights.
+network = Network(session)
+
+# 26 values reshaped to the (1, 26) row the network's input placeholder expects.
+initial_state = np.array(( 0,
+                  2, 0, 0, 0, 0, -5,
+                  0, -3, 0, 0, 0, 5,
+                  -5, 0, 0, 0, 3, 0,
+                  5, 0, 0, 0, 0, -2,
+                  0 )).reshape((1,26))
+    
+
+
+
+#print(x.shape)
+with graph_lol.as_default():
+    # Build the second network inside graph_lol and load saved weights into it.
+    session_2 = tf.Session(graph = graph_lol)
+    network_2 = Network(session_2)
+    network_2.restore_model()
+    print(network_2.eval_state(initial_state))
+    
+# Evaluation from the network that was not restored, for comparison.
+print(network.eval_state(initial_state))
+
+
--- a/restore_bot.py
+++ b/restore_bot.py
@ -0,0 +1,43 @@
+from cup import Cup
+import numpy as np
+import tensorflow as tf
+from network import Network
+from board import Board
+import random
+
+class Restore_bot:
+    """Bot whose network is restored from the latest checkpoint at construction."""
+
+    def __init__(self, sym):
+        """Create the bot for player ``sym`` and restore its network.
+
+        The network lives in its own graph and session, both kept on the
+        instance.
+        """
+        self.cup = Cup()
+        self.sym = sym
+
+        graph = tf.Graph()
+        self.graph = graph
+        with graph.as_default():
+            session = tf.Session(graph = graph)
+            self.session = session
+            network = Network(session)
+            self.network = network
+            network.restore_model()
+
+    def roll(self):
+        """Roll this bot's cup, print who rolled and the result, and return it."""
+        print("{} rolled: ".format(self.sym))
+        result = self.cup.roll()
+        print(result)
+        return result
+
+    def switch(self,cur):
+        """Return -1 when ``cur`` equals 1, otherwise 1."""
+        if cur == 1:
+            return -1
+        return 1
+
+    def get_sym(self):
+        """Return the symbol this bot was created with."""
+        return self.sym
+
+    def make_move(self, board, sym, roll):
+        """Return the legal successor state the network values highest."""
+        # print(Board.pretty(board))
+        candidates = list(Board.calculate_legal_states(board, sym, roll))
+        # Evaluate each candidate as a (1, 26) input row.
+        values = [
+            self.network.eval_state(np.array(state).reshape(1,26))
+            for state in candidates
+        ]
+        best_index = np.array(values).argmax()
+        print("Found the best state, being:", best_index)
+        return candidates[best_index]