# backgammon/network.py

import tensorflow as tf
from cup import Cup
import numpy as np
from board import Board
import os
import time
import sys
import random
from eval import Eval
import glob

class Network:
    # Supported board representations, keyed by the name given in
    # config['board_representation']. Each entry is a pair of
    # (input vector size, Board feature function); __init__ unpacks the
    # selected pair into self.input_size and self.board_trans_func.
    #
    # board_features_quack has size 28
    # board_features_quack_fat has size 30
    # board_features_tesauro has size 198

    board_reps = {
        'quack-fat' : (30, Board.board_features_quack_fat),
        'quack'     : (28, Board.board_features_quack),
        'tesauro'   : (198, Board.board_features_tesauro)
    }

    def custom_tanh(self, x, name=None):
        """Return ``2 * tanh(x)``, i.e. tanh stretched to the range (-2, 2).

        Args:
            x: Tensor to apply the activation to.
            name: Optional op name; it is given to the inner tanh op, not to
                the scaling op.
        """
        scale = tf.constant(2.00)
        squashed = tf.tanh(x, name)
        return tf.scalar_mul(scale, squashed)

    def __init__(self, config, name):
        """Build the TensorFlow graph for the value network.

        The graph is a single-hidden-layer network with sigmoid activations
        plus a hand-written TD update op (self.training_op).

        Args:
            config: Settings mapping. This constructor reads the keys
                'model_storage_path', 'model' and 'board_representation'.
            name: Label stored on the instance and used in log messages.

        Side effects:
            Reads the "episodes_trained" file under the checkpoint path when
            it exists, and adds variables, placeholders and ops to the
            default TensorFlow graph.
        """
        self.config = config
        # Directory that holds this model's checkpoints and episode counter.
        self.checkpoint_path = os.path.join(config['model_storage_path'], config['model'])

        self.name = name

        # Set board representation from config
        self.input_size, self.board_trans_func = Network.board_reps[
            self.config['board_representation']
        ]
        self.output_size = 1
        self.hidden_size = 40
        # Can't remember the best learning_rate, look this up
        # NOTE: only self.learning_rate feeds the update ops built below; the
        # max/min values are referenced solely by the commented-out decay
        # schedule further down.
        self.max_learning_rate = 0.1
        self.min_learning_rate = 0.001
        self.learning_rate = 0.01

        # Counts training steps; incremented whenever training_op runs.
        self.global_step = tf.Variable(0, trainable=False, name="global_step")
        # self.learning_rate = tf.maximum(self.min_learning_rate, tf.train.exponential_decay(self.max_learning_rate, self.global_step, 50000, 0.96, staircase=True), name="learning_rate")


        # Restore trained episode count for model
        episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
        if os.path.isfile(episode_count_path):
            with open(episode_count_path, 'r') as f:
                self.episodes_trained = int(f.read())
        else:
            self.episodes_trained = 0

        # input = x
        # Both placeholders hold exactly one row: a single encoded board and
        # the target value the prediction is moved towards.
        self.x = tf.placeholder('float', [1, self.input_size], name='input')
        self.value_next = tf.placeholder('float', [1, self.output_size], name="value_next")

        xavier_init = tf.contrib.layers.xavier_initializer()

        # Weights: input -> hidden and hidden -> output.
        W_1 = tf.get_variable("w_1", (self.input_size, self.hidden_size),
                              initializer=xavier_init)
        W_2 = tf.get_variable("w_2", (self.hidden_size, self.output_size),
                              initializer=xavier_init)

        # Biases start at zero.
        b_1 = tf.get_variable("b_1", (self.hidden_size,),
                              initializer=tf.zeros_initializer)
        b_2 = tf.get_variable("b_2", (self.output_size,),
                              initializer=tf.zeros_initializer)


        # Hidden layer activations.
        value_after_input = tf.sigmoid(tf.matmul(self.x, W_1) + b_1, name='hidden_layer')

        # Predicted value of the board; sigmoid keeps it within (0, 1).
        self.value = tf.sigmoid(tf.matmul(value_after_input, W_2) + b_2, name='output_layer')

        # TD error: target value minus current prediction. The reshape to []
        # collapses the [1, 1] result into a scalar so that it can scale
        # every gradient below.

        # TODO: Alexander thinks that self.value will be computed twice (instead of once)
        difference_in_values = tf.reshape(tf.subtract(self.value_next, self.value, name='difference_in_values'), [])
        # Log the absolute TD error as a scalar summary.
        tf.summary.scalar("difference_in_values", tf.abs(difference_in_values))

        # Gradient of the predicted value itself (not of a loss) with respect
        # to every trainable variable in the graph.
        trainable_vars = tf.trainable_variables()
        gradients = tf.gradients(self.value, trainable_vars)

        apply_gradients = []

        global_step_op = self.global_step.assign_add(1)


        with tf.variable_scope('apply_gradients'):
            for gradient, trainable_var in zip(gradients, trainable_vars):
                # Hopefully this is Δw_t = α(V_t+1 - V_t)▿_wV_t.
                # The update is added straight onto the variable instead of
                # going through a TensorFlow optimizer.
                backprop_calc = self.learning_rate * difference_in_values * gradient
                grad_apply = trainable_var.assign_add(backprop_calc)
                apply_gradients.append(grad_apply)


        # The control dependency makes running training_op also increment
        # global_step.
        with tf.control_dependencies([global_step_op]):

            self.training_op = tf.group(*apply_gradients, name='training_op')

        # Keep only the most recent checkpoint on disk.
        self.saver = tf.train.Saver(max_to_keep=1)

    def eval_state(self, sess, state):
        """Return the network's value estimate for one encoded board.

        Args:
            sess: TensorFlow session holding the network's variables.
            state: Encoded board fed to the input placeholder self.x.

        Returns:
            The result of running self.value in `sess` for `state`.
        """
        # Design notes kept from the original implementation (the update they
        # describe is built in __init__, not here):
        #   * Everything fed to the graph needs a placeholder.
        #   * An activation with range [-2, 2] was considered (naively
        #     2 * tanh), but it is unclear whether that is correct.
        #   * The weight update is a hand-rolled backprop: take
        #     tf.gradients(current_value, tf.trainable_variables()), zip the
        #     gradients with their variables, compute
        #     learning_rate * difference_in_values * trace for each pair as
        #     in Tesauro, and combine the assignments with
        #     tf.group(*updates, name="training_op").
        #   * Without an eligibility trace the update reduces to
        #     learning_rate * difference_in_values * gradient.
        feed = {self.x: state}
        return sess.run(self.value, feed_dict=feed)

    def save_model(self, sess, episode_count, global_step):
        """Write a checkpoint and the trained-episode counter to disk.

        Args:
            sess: Session whose variables are saved.
            episode_count: Total number of trained episodes; written to the
                "episodes_trained" file next to the checkpoint.
            global_step: Step number passed to the saver, which uses it in
                the checkpoint file name.
        """
        # A brand-new model has no directory yet; create it up front so that
        # neither the saver nor the counter file write fails on a missing
        # parent directory.
        os.makedirs(self.checkpoint_path, exist_ok=True)

        model_path = os.path.join(self.checkpoint_path, 'model.ckpt')
        self.saver.save(sess, model_path, global_step=global_step)
        print("[NETWK] ({name}) Saving model to:".format(name=self.name),
              model_path)

        counter_path = os.path.join(self.checkpoint_path, "episodes_trained")
        with open(counter_path, 'w') as f:
            f.write(str(episode_count) + "\n")

    def restore_model(self, sess):
        """Load the latest checkpoint into `sess` and restore the episode count.

        After restoring, every trainable variable is printed, and
        config['start_episode'] is set from the "episodes_trained" file when
        that file exists.

        Args:
            sess: Session to restore the variables into.

        Raises:
            AssertionError: No 'model.ckpt*.index' file exists in the
                checkpoint directory.
            ValueError: Index files exist but TensorFlow cannot determine the
                latest checkpoint.
        """
        index_pattern = os.path.join(self.checkpoint_path, 'model.ckpt*.index')
        if not glob.glob(index_pattern):
            # Raised explicitly rather than via `assert False`, which is
            # stripped under `python -O` and would silently skip the restore.
            raise AssertionError(
                "No checkpoint matching {} found".format(index_pattern))

        latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_path)
        if latest_checkpoint is None:
            # Index files are present but the checkpoint state could not be
            # resolved; fail with a message that names the directory.
            raise ValueError(
                "Could not determine the latest checkpoint in {}".format(
                    self.checkpoint_path))

        print("[NETWK] ({name}) Restoring model from:".format(name=self.name),
              str(latest_checkpoint))
        self.saver.restore(sess, latest_checkpoint)

        # Dump the restored weights for inspection.
        variables_names = [v.name for v in tf.trainable_variables()]
        values = sess.run(variables_names)
        for k, v in zip(variables_names, values):
            print("Variable: ", k)
            print("Shape: ", v.shape)
            print(v)

        # Restore trained episode count for model
        episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
        if os.path.isfile(episode_count_path):
            with open(episode_count_path, 'r') as f:
                self.config['start_episode'] = int(f.read())

    def make_move(self, sess, board, roll, player):
        """Pick the legal successor board the network rates best for `player`.

        Every state returned by Board.calculate_legal_states is scored with
        eval_state. For a player with positive sign the highest score wins;
        otherwise the scores are inverted (1 - score) before taking the
        maximum.

        Args:
            sess: Session used to evaluate the network.
            board: Current board.
            roll: Dice roll passed on to Board.calculate_legal_states.
            player: Player to move; only its sign matters for the ranking.

        Returns:
            A (board, score) pair for the chosen state, where score is the
            raw (non-inverted) network output.
        """
        candidates = Board.calculate_legal_states(board, player, roll)

        scored = []
        for candidate in candidates:
            features = self.board_trans_func(candidate, player)
            scored.append((candidate, self.eval_state(sess, features)))

        if np.sign(player) > 0:
            ranking = [value for _, value in scored]
        else:
            ranking = [1 - value for _, value in scored]

        best = np.array(ranking).argmax()
        return scored[best]

    def eval(self, episode_count, trained_eps = 0, tf_session = None):
        """Play evaluation games against every configured opponent.

        Args:
            episode_count: Number of games to play per evaluation method.
            trained_eps: Accepted for interface compatibility; the evaluation
                itself does not use it.
            tf_session: Session to evaluate in. When None, a new session is
                opened, its variables are initialised and the latest
                checkpoint is restored into it.

        Returns:
            A list of (method, outcomes) pairs, one per entry of
            config['eval_methods']. For 'pubeval' and 'dumbeval', outcomes
            holds element [1] of Board.outcome for each finished game. The
            'random' and 'dumbmodel' methods are not implemented and yield an
            empty list; an unknown method yields [0].
        """
        def roll_dice():
            # Two independent six-sided dice.
            return (random.randrange(1, 7), random.randrange(1, 7))

        def play_against(sess, opponent_move, episodes):
            # Play `episodes` games with the network as player 1 against
            # `opponent_move` as player -1 and collect the outcomes.
            start_time = time.time()

            def print_time_estimate(eps_completed):
                cur_time = time.time()
                time_diff = cur_time - start_time
                eps_per_sec = eps_completed / time_diff
                secs_per_ep = time_diff / eps_completed
                eps_remaining = (episodes - eps_completed)
                sys.stderr.write(
                    "[EVAL ] Averaging {per_sec} episodes per second\n".format(per_sec=round(eps_per_sec, 2)))
                sys.stderr.write(
                    "[EVAL ] {eps_remaining} episodes remaining; approx. {time_remaining} seconds remaining\n".format(
                        eps_remaining=eps_remaining, time_remaining=int(eps_remaining * secs_per_ep)))

            outcomes = []
            for i in range(1, episodes + 1):
                sys.stderr.write("[EVAL ] Episode {}".format(i))
                board = Board.initial_state
                while Board.outcome(board) is None:
                    board = self.make_move(sess, board, roll_dice(), 1)[0]
                    # Stop as soon as the network's move ends the game;
                    # otherwise the opponent would still move on a finished
                    # board.
                    if Board.outcome(board) is not None:
                        break
                    # The opponent's result is trimmed to its first 26
                    # entries to get back to a plain board.
                    board = opponent_move(board, -1, roll_dice())[0][0:26]

                result = Board.outcome(board)[1]
                sys.stderr.write("\t outcome {}".format(result))
                outcomes.append(result)
                sys.stderr.write("\n")

                if i % 10 == 0:
                    print_time_estimate(i)

            return outcomes

        def do_eval(sess, method, episodes = 1000):
            # Dispatch one evaluation method to its opponent.
            sys.stderr.write(
                "[EVAL ] Evaluating {eps} episode(s) with method '{method}'\n".format(eps=episodes, method=method))

            if method in ('random', 'dumbmodel'):
                # TODO: these opponents are not implemented yet; no games are
                # played and no outcomes are reported.
                return []
            if method == 'pubeval':
                return play_against(sess, Eval.make_pubeval_move, episodes)
            if method == 'dumbeval':
                return play_against(sess, Eval.make_dumbeval_move, episodes)

            sys.stderr.write("[EVAL ] Evaluation method '{}' is not defined\n".format(method))
            return [0]

        def evaluate_all(sess):
            return [(method, do_eval(sess, method, episode_count))
                    for method in self.config['eval_methods']]

        if tf_session is None:
            with tf.Session() as session:
                session.run(tf.global_variables_initializer())
                self.restore_model(session)
                return evaluate_all(session)

        return evaluate_all(tf_session)

    def train_model(self, episodes=1000, save_step_size=100, trained_eps=0):
        """Train the network through self-play with TD updates.

        Each episode plays one game in which the network picks the moves for
        both players. After every move the value of the previous board is
        moved towards the value of the chosen board; at the end of the game
        the final board is moved towards the rescaled game outcome.

        Args:
            episodes: Number of self-play games to run.
            save_step_size: A checkpoint is saved every
                min(save_step_size, episodes) episodes.
            trained_eps: Offset added to the episode number in log output,
                summaries and the saved episode count.

        Returns:
            List with one entry per episode: element [1] of Board.outcome for
            the final board of that game.
        """
        with tf.Session() as sess:
            writer = tf.summary.FileWriter("/tmp/log/tf", sess.graph)
            try:
                # Build the merged summary op once. Creating it inside the
                # episode loop would add a new op (and name scope) to the
                # graph for every episode.
                with tf.name_scope("final"):
                    merged = tf.summary.merge_all()

                sess.run(tf.global_variables_initializer())
                self.restore_model(sess)

                variables_names = [v.name for v in tf.trainable_variables()]
                values = sess.run(variables_names)
                for k, v in zip(variables_names, values):
                    print("Variable: ", k)
                    print("Shape: ", v.shape)
                    print(v)

                start_time = time.time()

                def print_time_estimate(eps_completed):
                    cur_time = time.time()
                    time_diff = cur_time - start_time
                    eps_per_sec = eps_completed / time_diff
                    secs_per_ep = time_diff / eps_completed
                    eps_remaining = (episodes - eps_completed)
                    sys.stderr.write(
                        "[TRAIN] Averaging {per_sec} episodes per second\n".format(per_sec=round(eps_per_sec, 2)))
                    sys.stderr.write(
                        "[TRAIN] {eps_remaining} episodes remaining; approx. {time_remaining} seconds remaining\n".format(
                            eps_remaining=eps_remaining, time_remaining=int(eps_remaining * secs_per_ep)))

                sys.stderr.write("[TRAIN] Training {} episodes and save_step_size {}\n".format(episodes, save_step_size))

                save_interval = min(save_step_size, episodes)
                outcomes = []
                # Stays None when no episode runs, so the final save can be
                # skipped instead of failing on unbound names.
                global_step = None

                for episode in range(1, episodes + 1):
                    sys.stderr.write("[TRAIN] Episode {}".format(episode + trained_eps))
                    # TODO decide which player should be here
                    player = 1
                    prev_board = Board.initial_state

                    i = 0
                    while Board.outcome(prev_board) is None:
                        i += 1

                        roll = (random.randrange(1, 7), random.randrange(1, 7))
                        cur_board, cur_board_value = self.make_move(sess, prev_board, roll, player)

                        # adjust weights: move the value of the previous
                        # board towards the value of the chosen board
                        sess.run(self.training_op,
                                 feed_dict={self.x: self.board_trans_func(prev_board, player),
                                            self.value_next: cur_board_value})

                        player *= -1
                        prev_board = cur_board

                    final_board = prev_board
                    outcome = Board.outcome(final_board)[1]
                    sys.stderr.write("\t outcome {}\t turns {}".format(outcome, i))
                    outcomes.append(outcome)

                    # Rescale the outcome with (x + 2) / 4 before using it as
                    # the target, which maps -2..2 onto 0..1.
                    final_score = np.array([outcome])
                    scaled_final_score = (final_score + 2) / 4

                    # Last update of the episode: move the final board's
                    # value towards the actual result and record summaries.
                    global_step, summary, _ = sess.run(
                        [self.global_step, merged, self.training_op],
                        feed_dict={self.x: self.board_trans_func(final_board, player),
                                   self.value_next: scaled_final_score.reshape((1, 1))})
                    writer.add_summary(summary, episode + trained_eps)

                    sys.stderr.write("\n")

                    if episode % save_interval == 0:
                        sys.stderr.write("[TRAIN] Saving model...\n")
                        self.save_model(sess, episode + trained_eps, global_step)

                    if episode % 50 == 0:
                        print_time_estimate(episode)

                if global_step is not None:
                    sys.stderr.write("[TRAIN] Saving model for final episode...\n")
                    self.save_model(sess, episode + trained_eps, global_step)

                return outcomes
            finally:
                # Close the summary writer even when training is interrupted.
                writer.close()


                # take turn, which finds the best state and picks it, based on the current network
                # save current state
                # run training operation (session.run(self.training_op, {x: x, value_next: value_next})),
                # i.e. something which does the backprop, based on the state after having taken a turn
                # (found before) and the state we saved in the beginning; from now on we'll save it at
                # the end of the turn
                # save the current state again, so we can continue running backprop based on the "previous" turn.

        # NOTE: We need to make a method so that we can take a single turn or at least just pick the next best move, so we know how to evaluate according to TD-learning. Right now, our game just continues in a while loop with nothing to stop it!