Merge branch 'network_dev' into 'master'

Network dev

See merge request Pownie/backgammon!1
This commit is contained in:
Alexander Munch-Hansen 2018-03-06 10:07:41 +00:00
commit f573eaaadd
7 changed files with 448 additions and 16 deletions

167
.gitignore vendored Normal file
View File

@ -0,0 +1,167 @@
### Emacs ###
# -*- mode: gitignore; -*-
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
auto-save-list
tramp
.\#*
# Org-mode
.org-id-locations
*_archive
# flymake-mode
*_flymake.*
# eshell files
/eshell/history
/eshell/lastdir
# elpa packages
/elpa/
# reftex files
*.rel
# AUCTeX auto folder
/auto/
# cask packages
.cask/
dist/
# Flycheck
flycheck_*.el
# server auth directory
/server/
# projectiles files
.projectile
projectile-bookmarks.eld
# directory configuration
.dir-locals.el
# saveplace
places
# url cache
url/cache/
# cedet
ede-projects.el
# smex
smex-items
# company-statistics
company-statistics-cache.el
# anaconda-mode
anaconda-mode/
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
.pytest_cache/
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule.*
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# End of https://www.gitignore.io/api/emacs,python

View File

@ -216,12 +216,12 @@ class Board:
temp = []
for x in board:
if x > 0:
temp.append(" {}".format(white(x)))
temp.append(" {}".format(x))
elif x < 0:
temp.append("{}".format(black(x)))
temp.append("{}".format(x))
else: temp.append(" ")
return """\033[0;47m
return """
13 14 15 16 17 18 19 20 21 22 23 24
+--------------------------------------------------------------------------+
| {11}| {10}| {9}| {8}| {7}| {6}| bar -1: {24} | {5}| {4}| {3}| {2}| {1}| {0}| end -1: TODO|
@ -229,7 +229,7 @@ class Board:
| {12}| {13}| {14}| {15}| {16}| {17}| bar 1: {25} | {18}| {19}| {20}| {21}| {22}| {23}| end 1: TODO|
+--------------------------------------------------------------------------+
12 11 10 9 8 7 6 5 4 3 2 1
\033[0m""".format(*temp)
""".format(*temp)
@staticmethod
def do_move(board, player, move):

22
bot.py
View File

@ -1,4 +1,7 @@
from cup import Cup
import tensorflow as tf
from network import Network
import numpy as np
from board import Board
import random
@ -7,6 +10,11 @@ class Bot:
def __init__(self, sym):
    """Create a bot for player ``sym`` with its own dice cup and network.

    The network lives in a graph private to this bot, and the session is
    bound to that same graph.
    """
    self.sym = sym
    self.cup = Cup()

    own_graph = tf.Graph()
    self.graph = own_graph
    # Everything the Network constructor adds must land in this bot's graph.
    with own_graph.as_default():
        self.session = tf.Session(graph=own_graph)
        self.network = Network(self.session)
def roll(self):
print("{} rolled: ".format(self.sym))
@ -18,11 +26,23 @@ class Bot:
def switch(self, cur):
    """Return -1 when ``cur`` is 1, and 1 for any other value."""
    if cur == 1:
        return -1
    return 1
def get_session(self):
    """Return the TensorFlow session owned by this bot."""
    session = self.session
    return session
def get_sym(self):
    """Return the symbol this bot was constructed with."""
    symbol = self.sym
    return symbol
def get_network(self):
    """Return the Network instance this bot evaluates states with."""
    network = self.network
    return network
def make_move(self, board, sym, roll):
    """Pick the legal successor state the network scores highest.

    Args:
        board: current board state.
        sym: symbol of the player to move.
        roll: dice roll to generate moves for.

    Returns:
        A two-element list ``[best_state, best_value]`` where ``best_value``
        is the network output for ``best_state``.

    Raises:
        ValueError: from ``argmax`` when there are no legal states.
    """
    legal_list = list(Board.calculate_legal_states(board, sym, roll))
    # Each candidate is fed to the network as a single row of 26 values.
    scores = [
        self.network.eval_state(np.array(state).reshape(1, 26))
        for state in legal_list
    ]
    # Compute the winning index once and reuse it for both the state and its
    # value, so the two can never disagree.
    best_index = np.array(scores).argmax()
    print("Found the best state, being:", best_index)
    return [legal_list[best_index], scores[best_index]]

75
game.py
View File

@ -1,34 +1,79 @@
import time
from human import Human
from board import Board
from bot import Bot
from network import Network
from restore_bot import RestoreBot
import numpy as np
from cup import Cup
class Game:
def __init__(self):
    """Set up a new game: initial board, both players and a dice cup.

    Player 1 is a freshly initialised ``Bot``; player -1 is a
    ``RestoreBot``. Only one object is built for player -1; the earlier
    version also constructed a ``Bot(-1)`` (with its own TensorFlow graph
    and session) that was overwritten immediately.
    """
    self.board = Board.initial_state
    self.p1 = Bot(1)
    self.p2 = RestoreBot(-1)
    self.cup = Cup()
def roll(self):
    """Roll the game's dice cup and return whatever it produces."""
    dice = self.cup.roll()
    return dice
def roll_and_find_best_for_bot(self):
    """Roll, let player 1 choose a move, and apply it to the board.

    Returns the object produced by ``p1.make_move``; its first element is
    stored as the new board.
    """
    dice = self.roll()
    player = self.p1
    choice = player.make_move(self.board, player.get_sym(), dice)
    self.board = choice[0]
    return choice
def next_round(self):
    """Roll for player 2, print the roll, apply its move and return the board."""
    dice = self.roll()
    print(dice)
    opponent = self.p2
    new_board = opponent.make_move(self.board, opponent.get_sym(), dice)
    self.board = new_board
    return self.board
def board_state(self):
    """Return the board as currently stored on the game."""
    current = self.board
    return current
def train_model(self, episodes=100, save_every=10):
    """Train player 1's network by playing whole games against player 2.

    After every move by player 1 the network is trained on the previous
    board with the value of the newly chosen board as its target. When a
    game ends, the last board is trained towards the final outcome.

    Args:
        episodes: number of games to play (defaults to the previous
            hard-coded 100).
        save_every: save the model whenever ``episode % save_every == 0``
            (defaults to the previous hard-coded 10); must be non-zero.
    """
    network = self.p1.get_network()
    outcomes = []
    for episode in range(episodes):
        self.board = Board.initial_state
        prev_board = self.board
        while Board.outcome(self.board) is None:
            cur_board, cur_board_value = self.roll_and_find_best_for_bot()
            network.train(prev_board, cur_board_value)
            prev_board = cur_board
            # NOTE(review): player 2 still moves even if player 1's move
            # just ended the game -- confirm this is intended.
            self.next_round()

        # Look the result up once instead of recomputing it for every use.
        result = Board.outcome(self.board)[1]
        print("Outcome:", result)
        outcomes.append(result)
        final_score = np.array([result]).reshape((1, 1))
        network.train(prev_board, final_score)
        print("trained episode {}".format(episode))
        if episode % save_every == 0:
            print("Saving...")
            network.save_model()
    print(outcomes)
def next_round_test(self):
    """Debug helper: print the board before and after one ``next_round``."""
    separator = "--------------------------------"
    print(self.board)
    print()
    self.next_round()
    for item in (separator, self.board, separator):
        print(item)
def play(self):
count = 0
while Board.outcome(self.board) == None:
while Board.outcome(self.board) is None:
count += 1
print("Turn:",count)
roll = self.roll()
#print("type of board: ", type(self.board))
print("type of board: ", type(self.board))
print("Board:",self.board)
print("{} rolled: {}".format(self.p1.get_sym(), roll))
self.board = self.p1.make_move(self.board, self.p1.get_sym(), roll)
self.board = (self.p1.make_move(self.board, self.p1.get_sym(), roll))[0]
print(self.board)
@ -47,6 +92,16 @@ class Game:
print_winner = "-1: Black " + str(Board.outcome(self.board))
print("The winner is {}!".format(print_winner))
print("Final board:",Board.pretty(self.board))
return count
# Module-level script: these statements run whenever the module is executed
# or imported. First a single game is played with a fresh Game.
g = Game()
g.play()
# Only used by the disabled benchmark loop below.
highest = 0
# Disabled: play many games and track the longest one, stopping on Ctrl-C.
#for i in range(100000):
# try:
# A second, fresh Game is created and used for training.
g = Game()
g.train_model()
#count = g.play()
# highest = max(highest,count)
# except KeyboardInterrupt:
# break
#print("\nHighest amount of turns is:",highest)

View File

@ -1,5 +1,122 @@
import tensorflow as tf
from cup import Cup
import numpy as np
from board import Board
#from game import Game
import os
class Config:
    """Hyper-parameters and paths read by ``Network`` when it builds its graph."""

    # Layer widths: input row length, hidden units, and the single value output.
    input_size = 26
    hidden_size = 40
    output_size = 1

    # Can't remember the best learning_rate, look this up
    learning_rate = 0.3

    # Location where checkpoints are written to and restored from.
    checkpoint_path = "/tmp/"
class Network:
    """Value network for board states with a hand-written TD-style update.

    The graph has one sigmoid hidden layer and a single output computed as
    ``2 * tanh(...)``. No TensorFlow optimizer is used: the training op adds
    ``learning_rate * sum(value_next - value) * d(value)/d(var)`` to every
    trainable variable, i.e. the update from Tesauro's TD formulation
    without an eligibility trace.
    """

    # TODO: Actually compile tensorflow properly
    # Executed once, when the class body is evaluated.
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

    def __init__(self, session):
        """Build the graph, the training op and the saver, then init variables.

        Args:
            session: TensorFlow session used for every ``run`` call. The ops
                are created in whatever graph is the default when this
                constructor runs, so callers must construct the network
                under the session's graph.
        """
        self.session = session
        self.config = Config
        input_size = self.config.input_size
        hidden_size = self.config.hidden_size
        output_size = self.config.output_size
        learning_rate = self.config.learning_rate
        self.checkpoint_path = self.config.checkpoint_path

        # Network input (one state per call) and the TD target value.
        self.x = tf.placeholder('float', [1, input_size], name='x')
        self.value_next = tf.placeholder('float', [1, output_size], name="value_next")

        xavier_init = tf.contrib.layers.xavier_initializer()
        W_1 = tf.Variable(xavier_init((input_size, hidden_size)))
        W_2 = tf.Variable(xavier_init((hidden_size, output_size)))

        # NOTE(review): tf.zeros yields plain tensors, not tf.Variable
        # objects, so these biases are neither trained nor checkpointed --
        # confirm that is intended.
        b_1 = tf.zeros(hidden_size,)
        b_2 = tf.zeros(output_size,)

        value_after_input = tf.sigmoid(tf.matmul(self.x, W_1) + b_1, name='hidden_layer')

        # TODO: Remember to make this tanh * 2, e.g. via a dense layer with a
        # custom activation:
        # self.value = tf.layers.dense(input=value_after_input, units=hidden_size, \
        #     activation=self.custom_tanh, kernel_initializer=xavier_init())
        self.value = 2*tf.nn.tanh(tf.matmul(value_after_input, W_2) + b_2, name='output_layer')

        # reduce_sum collapses the difference to a scalar; this keeps working
        # should the values ever become vectors.
        difference_in_values = tf.reduce_sum(self.value_next - self.value, name='difference')

        trainable_vars = tf.trainable_variables()
        gradients = tf.gradients(self.value, trainable_vars)

        apply_gradients = []
        with tf.variable_scope('apply_gradients'):
            for gradient, trainable_var in zip(gradients, trainable_vars):
                # Hopefully this is Δw_t = α(V_t+1 - V_t)▿_wV_t.
                backprop_calc = learning_rate * difference_in_values * gradient
                grad_apply = trainable_var.assign_add(backprop_calc)
                apply_gradients.append(grad_apply)

        # One op that applies every per-variable update together.
        self.training_op = tf.group(*apply_gradients, name='training_op')

        self.saver = tf.train.Saver(max_to_keep=1)
        self.session.run(tf.global_variables_initializer())

    def eval_state(self, state):
        """Run ``state`` through the network and return its value.

        Args:
            state: array matching the ``x`` placeholder, i.e. shape
                ``(1, input_size)``.

        Returns:
            The result of evaluating the output layer for ``state``.
        """
        # Design notes kept from the original implementation:
        # - Everything fed at run time needs a placeholder.
        # - The output needs an activation covering [-2, 2]; 2 * tanh is the
        #   naive choice and may not be the right one.
        # - The custom backprop is: collect tf.trainable_variables(), take
        #   tf.gradients(value, variables), zip the two, compute
        #   learning_rate * difference_in_values * trace for each pair and
        #   tf.group the resulting assign ops into a single "training_op".
        # - Without eligibility traces this reduces to
        #   learning_rate * difference_in_values * gradient.
        print("Network is evaluating")
        val = self.session.run(self.value, feed_dict={self.x: state})
        return val

    def save_model(self):
        """Write a checkpoint named ``model.ckpt`` inside ``checkpoint_path``."""
        # Join as a path so a directory without a trailing slash still yields
        # a file inside it, matching the directory lookup in restore_model.
        self.saver.save(self.session, os.path.join(self.checkpoint_path, 'model.ckpt'))

    def restore_model(self):
        """Load the most recent checkpoint found in ``checkpoint_path``."""
        latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_path)
        self.saver.restore(self.session, latest_checkpoint)

    # There is a circular dependency with the game module; needs a rewrite.
    def train(self, x, v_next):
        """Apply one training step for state ``x`` towards target ``v_next``.

        Args:
            x: board state; reshaped to a single ``(1, input_size)`` row.
            v_next: target value, fed to the ``value_next`` placeholder
                (shape ``(1, output_size)``).
        """
        x = np.array(x).reshape((1, self.config.input_size))
        self.session.run(self.training_op, feed_dict = {self.x:x, self.value_next: v_next})

        # Intended overall training procedure (driven by the caller):
        #   while the game isn't done:
        #     take a turn, picking the best next state with the current network
        #     value_next = eval_state(x_next)
        #     run training_op with {x: x, value_next: value_next}
        #     x = x_next
        # NOTE: this needs a way to take a single turn (or at least pick the
        # next best move) so states can be evaluated turn by turn as
        # TD-learning requires.

32
network_test.py Normal file
View File

@ -0,0 +1,32 @@
from network import Network
import tensorflow as tf
import random
import numpy as np
# Compare a freshly initialised network (default graph) with one restored
# from the latest checkpoint (private graph) on the same starting position.
graph_lol = tf.Graph()
session = tf.Session()
network = Network(session)

board_values = (
    0,
    2, 0, 0, 0, 0, -5,
    0, -3, 0, 0, 0, 5,
    -5, 0, 0, 0, 3, 0,
    5, 0, 0, 0, 0, -2,
    0,
)
initial_state = np.array(board_values).reshape((1,26))

#print(x.shape)

# The restored network must be built inside its own graph.
with graph_lol.as_default():
    session_2 = tf.Session(graph = graph_lol)
    network_2 = Network(session_2)
    network_2.restore_model()

print(network_2.eval_state(initial_state))
print(network.eval_state(initial_state))

41
restore_bot.py Normal file
View File

@ -0,0 +1,41 @@
from cup import Cup
import numpy as np
import tensorflow as tf
from network import Network
from board import Board
import random
class RestoreBot:
    """Bot whose network is loaded from the latest saved checkpoint."""

    def __init__(self, sym):
        """Create the bot for player ``sym`` and restore its network.

        The network is built in a graph private to this bot, with a session
        bound to that graph.
        """
        self.sym = sym
        self.cup = Cup()

        own_graph = tf.Graph()
        self.graph = own_graph
        with own_graph.as_default():
            self.session = tf.Session(graph = own_graph)
            self.network = Network(self.session)
            self.network.restore_model()

    def roll(self):
        """Announce the roll, roll this bot's cup, print and return the result."""
        print("{} rolled: ".format(self.sym))
        dice = self.cup.roll()
        print(dice)
        return dice

    def switch(self,cur):
        """Return -1 when ``cur`` is 1, and 1 for any other value."""
        if cur == 1:
            return -1
        return 1

    def get_sym(self):
        """Return the symbol this bot was constructed with."""
        symbol = self.sym
        return symbol

    def make_move(self, board, sym, roll):
        """Return the legal successor state the network scores highest."""
        candidates = list(Board.calculate_legal_states(board, sym, roll))
        # Score every candidate as a single row of 26 values.
        move_scores = []
        for candidate in candidates:
            row = np.array(candidate).reshape(1,26)
            move_scores.append(self.network.eval_state(row))
        best_index = np.array(move_scores).argmax()
        print("Found the best state, being:", best_index)
        return candidates[best_index]