From 85ec8d8e4e61e840b38bfafe89c7bfb69ede633e Mon Sep 17 00:00:00 2001
From: Alexander Munch-Hansen
Date: Tue, 20 Mar 2018 17:29:29 +0100
Subject: [PATCH] Added tesauro + sigmoid

---
 board.py   | 36 ++++++++++++++++++++++++++++++++++--
 network.py | 35 +++++++++++++++++++++++++----------
 2 files changed, 59 insertions(+), 12 deletions(-)

diff --git a/board.py b/board.py
index bfa7998..cc500f3 100644
--- a/board.py
+++ b/board.py
@@ -34,8 +34,40 @@ class Board:
         board.append(15 - sum(positives))
         board.append(-15 - sum(negatives))
         return tuple(board)
-
-
+
+    # The original tesauro also takes in the player, so [1,0] for one of them and [0,1] for the other
+    # Not sure if this should be included
+    @staticmethod
+    def map_to_tesauro(board):
+        features = []
+        for i in range(1,25):
+            idx = list(board)[i]
+            place = [0]*8
+            if (idx != 0):
+                if idx > 0:
+                    for j in range(min(int(idx),3)):
+                        place[j] = 1.
+                    if idx > 3:
+                        place[3] += (idx-3)/2
+                else:
+                    for j in range(min(abs(int(idx)),3)):
+                        place[j+4] = 1.
+                    if idx < -3:
+                        place[3+4] += (abs(idx)-3)/2
+            features += place
+
+        nega_hits = list(board)[0]/2
+        posi_hits = list(board)[25]/2
+        positives = [x if x > 0 else 0 for x in board]
+        negatives = [x if x < 0 else 0 for x in board]
+        posi_home = ((15 - sum(positives))/15)
+        nega_home = ((-15 - sum(negatives))/15)
+        features.append(nega_hits)
+        features.append(posi_hits)
+        features.append(posi_home)
+        features.append(nega_home)
+#        print(features)
+        return features
 
 
     @staticmethod
diff --git a/network.py b/network.py
index 62b1d17..069826a 100644
--- a/network.py
+++ b/network.py
@@ -10,7 +10,7 @@ from eval import Eval
 
 class Network:
     hidden_size = 40
-    input_size = 26
+    input_size = 196
     output_size = 1
     # Can't remember the best learning_rate, look this up
     learning_rate = 0.1
@@ -43,17 +43,20 @@ class Network:
         b_2 = tf.get_variable("b_2", (Network.output_size,), initializer=tf.zeros_initializer)
 
 
-        value_after_input = self.custom_tanh(tf.matmul(self.x, W_1) + b_1, name='hidden_layer')
+        value_after_input = tf.sigmoid(tf.matmul(self.x, W_1) + b_1, name='hidden_layer')
 
-        self.value = self.custom_tanh(tf.matmul(value_after_input, W_2) + b_2, name='output_layer')
+
+
+
+        self.value = tf.sigmoid(tf.matmul(value_after_input, W_2) + b_2, name='output_layer')
 
         # tf.reduce_sum basically finds the sum of its input, so this gives the
         # difference between the two values, in case they should be lists, which
         # they might be if our input changes
         # TODO: Alexander thinks that self.value will be computed twice (instead of once)
-        difference_in_values = tf.reduce_sum(self.value_next - self.value, name='difference')
-
+        difference_in_values = tf.reduce_sum(tf.subtract(self.value_next, self.value, name='difference'))
+
 
         trainable_vars = tf.trainable_variables()
         gradients = tf.gradients(self.value, trainable_vars)
@@ -140,7 +143,7 @@ class Network:
     # Have a circular dependency, #fuck, need to rewrite something
     def adjust_weights(self, board, v_next):
         # print("lol")
-        board = np.array(board).reshape((1,26))
+        board = np.array(board).reshape((1,-1))
         self.session.run(self.training_op, feed_dict = { self.x: board,
                                                          self.value_next: v_next })
@@ -156,7 +159,7 @@ class Network:
     def make_move(self, board, roll):
         # print(Board.pretty(board))
         legal_moves = Board.calculate_legal_states(board, 1, roll)
-        moves_and_scores = [ (move, self.eval_state(np.array(move).reshape(1,26))) for move in legal_moves ]
+        moves_and_scores = [ (move, self.eval_state(np.array(Board.map_to_tesauro(move)).reshape(1,-1))) for move in legal_moves ]
         scores = [ x[1] for x in moves_and_scores ]
         best_score_index = np.array(scores).argmax()
         best_move_pair = moves_and_scores[best_score_index]
@@ -181,20 +184,31 @@ class Network:
         outcomes = []
 
         for episode in range(1, episodes + 1):
             sys.stderr.write("[TRAIN] Episode {}".format(episode + trained_eps))
 
+#            print("greerggeregr"*10000)
             # TODO decide which player should be here
             player = 1
             roll = (random.randrange(1,7), random.randrange(1,7))
+
+            def tesaurofi(board):
+                return Board.map_to_tesauro(board)
+
             prev_board, _ = self.make_move(Board.flip(Board.initial_state) if player == -1 else Board.initial_state, roll)
+
             if player == -1: prev_board = Board.flip(prev_board)
-
+
+            # print("board:",prev_board)
+            # print(len(prev_board))
+
             # find the best move here, make this move, then change turn as the
             # first thing inside of the while loop and then call
             # best_move_and_score to get V_t+1
 
             # i = 0
             while Board.outcome(prev_board) is None:
+                #print(prev_board)
+                # print("-"*30)
                 # print(i)
                 # print(roll)
 
@@ -206,10 +220,11 @@ class Network:
 
                 roll = (random.randrange(1,7), random.randrange(1,7))
 
                 cur_board, cur_board_value = self.make_move(Board.flip(prev_board) if player == -1 else prev_board, roll)
+                #print("pls",cur_board_value)
                 if player == -1: cur_board = Board.flip(cur_board)
 
-                self.adjust_weights(prev_board, cur_board_value)
+                self.adjust_weights(tesaurofi(prev_board), cur_board_value)
 
                 prev_board = cur_board
 
@@ -217,7 +232,7 @@ class Network:
             sys.stderr.write("\t outcome {}".format(Board.outcome(final_board)[1]))
             outcomes.append(Board.outcome(final_board)[1])
             final_score = np.array([ Board.outcome(final_board)[1] ])
-            self.adjust_weights(prev_board, final_score.reshape((1, 1)))
+            self.adjust_weights(tesaurofi(prev_board), final_score.reshape((1, 1)))
 
             sys.stderr.write("\n")
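
Note on the encoding above: a minimal standalone sketch of the Tesauro-style feature
mapping this patch introduces, assuming the 26-entry board tuple used by board.py
(signed checker counts on indices 1-24, hit checkers on indices 0 and 25). The helper
name tesauro_features and the example position are illustrative only, not part of the
patch.

    # Sketch only: mirrors Board.map_to_tesauro under the assumptions stated above.
    def tesauro_features(board):
        features = []
        for point in board[1:25]:
            place = [0.0] * 8                     # 4 units per player per point
            if point > 0:                         # player 1's checkers
                for j in range(min(int(point), 3)):
                    place[j] = 1.0
                if point > 3:
                    place[3] = (point - 3) / 2    # 4th unit encodes the excess over 3
            elif point < 0:                       # player -1's checkers
                for j in range(min(-int(point), 3)):
                    place[j + 4] = 1.0
                if point < -3:
                    place[7] = (-point - 3) / 2
            features += place

        # bar and borne-off checkers, scaled as in the patch
        features.append(board[0] / 2)
        features.append(board[25] / 2)
        features.append((15 - sum(x for x in board if x > 0)) / 15)
        features.append((-15 - sum(x for x in board if x < 0)) / 15)
        return features                           # 24*8 + 4 = 196 values, matching input_size

    # Example: the standard backgammon starting position in the assumed layout
    initial = (0,
               2, 0, 0, 0, 0, -5,
               0, -3, 0, 0, 0, 5,
               -5, 0, 0, 0, 3, 0,
               5, 0, 0, 0, 0, -2,
               0)
    assert len(tesauro_features(initial)) == 196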