Training using a slightly revamped version of our own board rep. Not sure if it works yet.

Alexander Munch-Hansen 2018-03-27 04:06:08 +02:00
parent ab5d2aabb2
commit f43108c239
3 changed files with 21 additions and 23 deletions


@@ -35,6 +35,17 @@ class Board:
         board.append(-15 - sum(negatives))
         return tuple(board)
+    @staticmethod
+    def board_features_to_own(board, player):
+        board = list(board)
+        positives = [x if x > 0 else 0 for x in board]
+        negatives = [x if x < 0 else 0 for x in board]
+        board.append(15 - sum(positives))
+        board.append(-15 - sum(negatives))
+        # one-hot encoding of the player to move
+        board += ([1, 0] if np.sign(player) > 0 else [0, 1])
+        return np.array(board).reshape(1, -1)
+
     @staticmethod
     def board_features_to_tesauro(board, cur_player):
         features = []
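For reference, the new encoding should yield a 30-element feature vector, matching the input_size = 30 change in the next file: the raw board entries, the two borne-off counts, and a two-element one-hot for the player to move. A quick standalone sanity check, under the assumption (not confirmed by this diff) that the raw board tuple has 26 entries:

import numpy as np

board = (0,) * 26                      # hypothetical empty 26-slot board
positives = [x for x in board if x > 0]
negatives = [x for x in board if x < 0]
features = list(board)
features.append(15 - sum(positives))   # player 1's checkers borne off
features.append(-15 - sum(negatives))  # player -1's checkers borne off (negative count)
features += [1, 0]                     # one-hot: player 1 to move
print(np.array(features).reshape(1, -1).shape)   # (1, 30), i.e. (1, input_size)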


@@ -11,13 +11,11 @@ from eval import Eval
 class Network:
     hidden_size = 40
-    input_size = 198
+    input_size = 30
     output_size = 1
     # Can't remember the best learning_rate, look this up
     learning_rate = 0.01
+    board_rep = Board.board_features_to_own
-    # TODO: Actually compile tensorflow properly
-    # os.environ["TF_CPP_MIN_LOG_LEVEL"]="2"
 
     def custom_tanh(self, x, name=None):
         return tf.scalar_mul(tf.constant(2.00), tf.tanh(x, name))
@@ -147,7 +145,7 @@ class Network:
     def make_move(self, sess, board, roll, player):
         # print(Board.pretty(board))
         legal_moves = Board.calculate_legal_states(board, player, roll)
-        moves_and_scores = [(move, self.eval_state(sess, Board.board_features_to_tesauro(move, player))) for move in legal_moves]
+        moves_and_scores = [(move, self.eval_state(sess, Network.board_rep(move, player))) for move in legal_moves]
         scores = [x[1] if np.sign(player) > 0 else 1-x[1] for x in moves_and_scores]
         best_score_index = np.array(scores).argmax()
         best_move_pair = moves_and_scores[best_score_index]
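With this change the feature extractor is looked up through the class attribute board_rep instead of being hard-coded, so switching encodings is a single assignment. A minimal, self-contained sketch of that dispatch pattern (names below are illustrative, not the project's):

def features_own(board, player):
    return ("own", len(board), player)

def features_tesauro(board, player):
    return ("tesauro", len(board), player)

class Net:
    # A plain function stored on the class: calling it as Net.board_rep(...) works,
    # but self.board_rep(...) would bind the instance as the first argument,
    # which is why the diff calls it via Network.board_rep.
    board_rep = features_own

print(Net.board_rep([0] * 26, 1))    # uses the "own" encoding
Net.board_rep = features_tesauro     # swap the representation in one place
print(Net.board_rep([0] * 26, -1))

Wrapping the function in staticmethod() would additionally make instance access (self.board_rep) safe.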
@@ -338,15 +336,6 @@ class Network:
             sys.stderr.write("[TRAIN] Episode {}".format(episode + trained_eps))
             # TODO decide which player should be here
-            # TEST
-            #if episode % 1000 == 0:
-            #    self.config['eval_methods'] = 'dumbeval'
-            #    self.config['episodes'] = 300
-            #    outcomes = self.eval(trained_eps)
-            #    self.log_eval_outcomes(outcomes, trained_eps=self.episodes_trained)
-            #player = random.choice([-1, 1])
             player = 1
             prev_board = Board.initial_state
@@ -355,7 +344,6 @@ class Network:
             # first thing inside of the while loop and then call
             # best_move_and_score to get V_t+1
-            # i = 0
             while Board.outcome(prev_board) is None:
                 #print("PREEEV_BOOOOAAARD:",prev_board)
@@ -367,7 +355,7 @@ class Network:
                 # adjust weights
                 sess.run(self.training_op,
-                         feed_dict={self.x: Board.board_features_to_tesauro(prev_board, player),
+                         feed_dict={self.x: Network.board_rep(prev_board, player),
                                     self.value_next: cur_board_value})
                 player *= -1
@ -386,7 +374,7 @@ class Network:
with tf.name_scope("final"): with tf.name_scope("final"):
merged = tf.summary.merge_all() merged = tf.summary.merge_all()
summary, _ = sess.run([merged, self.training_op], summary, _ = sess.run([merged, self.training_op],
feed_dict={self.x: Board.board_features_to_tesauro(prev_board, player), feed_dict={self.x: Network.board_rep(prev_board, player),
self.value_next: scaled_final_score.reshape((1, 1))}) self.value_next: scaled_final_score.reshape((1, 1))})
writer.add_summary(summary, episode + trained_eps) writer.add_summary(summary, episode + trained_eps)
@@ -415,8 +403,3 @@ class Network:
         # save the current state again, so we can continue running backprop based on the "previous" turn.
-
-        # NOTE: We need to make a method so that we can take a single turn or at least
-        # just pick the next best move, so we know how to evaluate according to TD-learning.
-        # Right now, our game just continues in a while loop without nothing to stop it!
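The training loop above is TD(0) with no intermediate reward: after each move the prediction for the previous position is pushed toward the prediction for the current position (value_next), and once the game ends the last prediction is pushed toward the scaled final score. A framework-free sketch of that update rule for a linear value function; the shapes, learning rate, and linear model are illustrative, not the project's actual TensorFlow graph:

import numpy as np

def td0_episode(feature_vectors, final_score, w, alpha=0.01):
    # feature_vectors: one feature vector per visited position, in move order.
    for s, s_next in zip(feature_vectors, feature_vectors[1:]):
        target = w @ s_next                  # V_{t+1}, the value_next above
        w += alpha * (target - w @ s) * s    # move V_t toward V_{t+1}
    s_last = feature_vectors[-1]
    w += alpha * (final_score - w @ s_last) * s_last   # final outcome update
    return w

w = np.zeros(30)
episode = [np.random.randn(30) for _ in range(10)]
w = td0_episode(episode, final_score=1.0, w=w)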


@@ -5,6 +5,10 @@ static PyObject* DumbevalError;
 static float x[122];
 static const float wc[122] = {
+5.6477, 6.316649999999999, 7.05515, 6.65315, 9.3171, 17.9777, 2.0235499999999993, 5.1129500000000005, 7.599200000000001, 9.68525, 3.1762, 8.05335, 16.153499999999998, 8.02445, 10.55345, 15.489600000000001, 10.525199999999998, 16.438850000000002, 12.27405, 9.6362, 12.7152, 13.2859, 1.6932499999999995, 26.79045, 10.521899999999999, 6.79635, 5.28135, 6.2059, 10.2306, 10.5485, 3.6000500000000004, 4.07825, 6.951700000000001, 4.413749999999999, 11.271450000000002, 12.9361, 11.087299999999999, 13.10085, 10.411999999999999, 8.084050000000001, 12.4893, 5.96055, 4.69195, 18.9482, 9.0946, 9.1954, 6.2592, 16.180300000000003, 8.3376, 23.24915, 14.32525, -2.6699000000000006, 19.156, 5.81445, 4.7214, 7.63055, 7.039, 5.88075, 2.00765, 14.596800000000002, 11.5208, -3.79, -3.8541000000000003, 5.358499999999999, 14.4516, 2.49015, 11.284799999999999, 14.1066, 16.2306, 5.82875, 9.34505, 16.13685, 8.1893, 2.93145, 7.83185, 12.86765, 6.90115, 20.07255, 8.93355, -0.12434999999999974, 12.0587, 11.83985, 6.34155, 7.1963, 10.571200000000001, 22.38365, 6.50745, 8.94595, 12.0434, 10.79885, 14.055800000000001, 0.022100000000000453, 10.39255, 4.088850000000001, 3.6421499999999996, 38.1298, 6.8957, 0.9804999999999997, 5.9599, 13.16055, 11.55305, 10.65015, 4.6673, 15.770999999999999, 27.700050000000005, 4.4329, 12.6349, 7.037800000000001, 3.4897, 18.91945, 10.239899999999999, 5.4625, 10.29705, 10.492799999999999, 8.850900000000001, -10.575999999999999, 10.6893, 15.30845, 17.8083, 31.88275, 11.225000000000001, 4.4806};
+/*
 1.5790816238841092, 1.6374860177130541, -1.7131823639980923, -0.9286186784962336, -1.0732080528763888,
 -0.33851674519289876, 1.5798155080270462, 2.3161915581553414, 1.5625330782392322, 0.9397141260075461,
 0.8386342522957442, 1.2380864901133144, -2.803703105809909, -1.6033863837759044, -1.9297462408169208,
@@ -30,7 +34,7 @@ static const float wc[122] = {
 -0.3393405083020449, 2.787144781914554, -2.401723402781605, -1.1675562811241997, -1.1542961327714207,
 0.18253192955355502, -2.418436664206371, 0.7423935287565309, 2.9903418274144666, -1.3503112004693552,
 -2.649146174480099, -0.5447080156947952
-};
+};*/
 static const float wr[122] = {
 -0.7856, -0.50352, 0.12392, -1.00316, -2.46556, -0.1627, 0.18966, 0.0043, 0.0,