Merge branch 'experimentation' into 'master'

tesauro fat and diffs in values

See merge request Pownie/backgammon!7

This commit is contained in: commit 7e51b44e33

board.py (56 lines changed)
@@ -51,7 +51,6 @@ class Board:
         # board += ([1, 0] if np.sign(player) > 0 else [0, 1])
         # return np.array(board).reshape(1,30)
 
-
     # quack-fatter
     @staticmethod
     def board_features_quack_norm(board, player):
@@ -95,11 +94,64 @@ class Board:
         board_rep += bar_trans(board, player)
         board_rep += (15 - Board.num_of_checkers_for_player(board, player),)
 
-        board_rep += ([1,0] if cur_player == 1 else [1,0])
+        board_rep += ([1, 0] if cur_player == 1 else [0, 1])
 
         return np.array(board_rep).reshape(1, 198)
 
+    @staticmethod
+    def board_features_tesauro_fat(board, cur_player):
+        def ordinary_trans(val, player):
+            abs_val = val*player
+            if abs_val <= 0:
+                return (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 1:
+                return (1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 2:
+                return (1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 3:
+                return (1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 4:
+                return (1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 5:
+                return (1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 6:
+                return (1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 7:
+                return (1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 8:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 9:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0)
+            elif abs_val == 10:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0)
+            elif abs_val == 11:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0)
+            elif abs_val == 12:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0)
+            elif abs_val == 13:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0)
+            elif abs_val == 14:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)
+            elif abs_val == 15:
+                return (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
+
+        def bar_trans(board, player):
+            if player == 1: return (abs(board[0]/2),)
+            elif player == -1: return (abs(board[25]/2),)
+
+        board_rep = []
+        for player in [1, -1]:
+            for x in board[1:25]:
+                board_rep += ordinary_trans(x, player)
+            board_rep += bar_trans(board, player)
+            board_rep += (15 - Board.num_of_checkers_for_player(board, player),)
+
+        board_rep += ([1, 0] if cur_player == 1 else [0, 1])
+
+        return np.array(board_rep).reshape(1, len(board_rep))
+
     @staticmethod
     def board_features_tesauro_wrong(board, cur_player):
         features = []
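Note: the long if/elif chain in the new ordinary_trans is a unary ("thermometer") encoding of a point's checker count. A minimal equivalent sketch, not part of the commit:

# Branch-free form of ordinary_trans (sketch, assumes integer val):
def ordinary_trans(val, player):
    abs_val = val * player  # checkers the given player has on the point
    # first abs_val of the 15 slots are 1, the rest 0; non-positive counts give all zeros
    return tuple(1 if i < abs_val else 0 for i in range(15))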
main.py (2 lines changed)

@@ -86,7 +86,7 @@ def log_train_outcome(outcome, diff_in_values, trained_eps = 0, log_path = os.pa
              'sum': sum(outcome),
              'mean': sum(outcome) / len(outcome),
              'time': int(time.time()),
-             'average_diff_in_vals': diff_in_values/len(outcome),
+             'average_diff_in_vals': diff_in_values,
              'commit': commit
              }
network.py (50 lines changed)

@@ -21,10 +21,10 @@ class Network:
             'quack' : (28, Board.board_features_quack),
             'tesauro' : (198, Board.board_features_tesauro),
             'quack-norm' : (30, Board.board_features_quack_norm),
+            'tesauro-fat' : (726, Board.board_features_tesauro_fat),
             'tesauro-poop': (198, Board.board_features_tesauro_wrong)
         }
 
 
     def custom_tanh(self, x, name=None):
         return tf.scalar_mul(tf.constant(2.00), tf.tanh(x, name))
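Note: the 726 input width registered for 'tesauro-fat' follows from the encoding added in board.py. A quick sanity check, not part of the commit:

# 24 points x 15 unary slots, plus bar and borne-off features, per player,
# plus a 2-slot one-hot for the player to move:
per_player = 24 * 15 + 1 + 1      # = 362
assert 2 * per_player + 2 == 726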
@@ -39,6 +39,11 @@ class Network:
             '0': self.make_move_0_ply
         }
 
+        self.max_or_min = {
+            1: np.argmax,
+            -1: np.argmin
+        }
+
         tf.enable_eager_execution()
 
         xavier_init = tf.contrib.layers.xavier_initializer()
@@ -144,8 +149,9 @@ class Network:
         :param episode_count:
         :return:
         """
 
         tfe.Saver(self.model.variables).save(os.path.join(self.checkpoint_path, 'model.ckpt'))
-        #self.saver.save(sess, os.path.join(self.checkpoint_path, 'model.ckpt'), global_step=global_step)
         with open(os.path.join(self.checkpoint_path, "episodes_trained"), 'w+') as f:
             print("[NETWK] ({name}) Saving model to:".format(name=self.name),
                   os.path.join(self.checkpoint_path, 'model.ckpt'))
@@ -184,9 +190,6 @@ class Network:
                              str(latest_checkpoint))
         tfe.Saver(self.model.variables).restore(latest_checkpoint)
 
-        # variables_names = [v.name for v in self.model.variables]
-
-
         # Restore trained episode count for model
         episode_count_path = os.path.join(self.checkpoint_path, "episodes_trained")
         if os.path.isfile(episode_count_path):
@@ -218,9 +221,9 @@ class Network:
         legal_states = np.array([self.board_trans_func(move, player)[0] for move in legal_moves])
 
         scores = self.model.predict_on_batch(legal_states)
-        transformed_scores = [x if np.sign(player) > 0 else 1 - x for x in scores]
 
-        best_score_idx = np.argmax(np.array(transformed_scores))
+        best_score_idx = self.max_or_min[player](scores)
 
         best_move, best_score = legal_moves[best_score_idx], scores[best_score_idx]
 
         return (best_move, best_score)
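Note: the new self.max_or_min dict makes the removed transformed_scores step redundant: mapping scores through 1 - x for player -1 and taking argmax is the same as taking argmin of the raw scores. A minimal sketch of the equivalence, not part of the commit:

import numpy as np

max_or_min = {1: np.argmax, -1: np.argmin}
scores = np.array([0.2, 0.7, 0.4])
# player +1 picks the highest score directly
assert max_or_min[1](scores) == np.argmax(scores)
# player -1 picking the lowest score matches the old argmax over 1 - x
assert max_or_min[-1](scores) == np.argmax(1 - scores)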
@@ -263,9 +266,10 @@ class Network:
         sorted_moves_and_scores = sorted(moves_and_scores, key=itemgetter(1), reverse=(player == 1))
         best_boards = [ x[0] for x in sorted_moves_and_scores[:10] ]
 
-        scores, trans_scores = self.do_ply(best_boards, player)
+        scores = self.do_ply(best_boards, player)
 
-        best_score_idx = np.array(trans_scores).argmax()
+        best_score_idx = self.max_or_min[player](scores)
+        # best_score_idx = np.array(trans_scores).argmax()
 
         return (best_boards[best_score_idx], scores[best_score_idx])
@@ -308,7 +312,7 @@ class Network:
 
         # print(time.time() - start)
 
-        # start = time.time()
+        start = time.time()
 
         all_scores = self.model.predict_on_batch(np.array(test_list))
@@ -319,10 +323,10 @@ class Network:
             from_idx += length
 
         means_splits = [tf.reduce_mean(scores) for scores in split_scores]
-        transformed_means_splits = [x if player == 1 else (1-x) for x in means_splits]
-        # print(time.time() - start)
-
-        return (means_splits, transformed_means_splits)
+        # print(time.time() - start)
+        # print("/"*50)
+        return means_splits
 
 
     def eval(self, episode_count, trained_eps = 0):
@@ -363,7 +367,6 @@ class Network:
         sys.stderr.write(
             "[EVAL ] Evaluating {eps} episode(s) with method '{method}'\n".format(eps=episodes, method=method))
 
-
         if method == 'pubeval':
             outcomes = []
             for i in range(1, episodes + 1):
@@ -454,10 +457,8 @@ class Network:
         :return:
         """
 
-        difference_in_vals = 0
-
         self.restore_model()
+        average_diffs = 0
         start_time = time.time()
 
         def print_time_estimate(eps_completed):
@@ -479,26 +480,26 @@ class Network:
             sys.stderr.write("[TRAIN] Episode {}".format(episode + trained_eps))
             # TODO decide which player should be here
 
-            player = 1
+            # player = 1
+            player = random.choice([-1,1])
             prev_board = Board.initial_state
             i = 0
+            difference_in_values = 0
             while Board.outcome(prev_board) is None:
                 i += 1
                 self.global_step += 1
 
                 cur_board, cur_board_value = self.make_move(prev_board,
                                                             (random.randrange(1, 7), random.randrange(1, 7)),
                                                             player)
 
-                difference_in_vals += abs((cur_board_value - self.eval_state(self.board_trans_func(prev_board, player))))
+                difference_in_values += abs((cur_board_value - self.eval_state(self.board_trans_func(prev_board, player))))
 
                 if self.config['verbose']:
                     print("Difference in values:", difference_in_vals)
                     print("Current board value :", cur_board_value)
                     print("Current board is :\n",cur_board)
 
                 # adjust weights
                 if Board.outcome(cur_board) is None:
                     self.do_backprop(self.board_trans_func(prev_board, player), cur_board_value)
@@ -512,6 +513,10 @@ class Network:
             final_score = np.array([Board.outcome(final_board)[1]])
             scaled_final_score = ((final_score + 2) / 4)
 
+            difference_in_values += abs(scaled_final_score-cur_board_value)
+
+            average_diffs += (difference_in_values[0][0] / (i+1))
+
             self.do_backprop(self.board_trans_func(prev_board, player), scaled_final_score.reshape(1,1))
 
             sys.stderr.write("\n")
@@ -524,8 +529,9 @@ class Network:
             print_time_estimate(episode)
 
         sys.stderr.write("[TRAIN] Saving model for final episode...\n")
 
         self.save_model(episode+trained_eps)
 
-        return outcomes, difference_in_vals[0][0]
+        return outcomes, average_diffs/len(outcomes)
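Note: the "diffs in values" half of this merge replaces the old global difference_in_vals sum, which was never reset between episodes, with a per-episode difference_in_values that is reset each episode, includes the final scaled-score difference, and is averaged over the episode's moves; train_model then returns the mean of those per-episode averages, which is why main.py no longer divides by len(outcome). A minimal sketch with hypothetical numbers, not part of the commit:

# one episode: per-move |V(s_t) - V(s_{t-1})| plus the final-outcome difference
per_move_diffs = [0.10, 0.30, 0.20]   # i = 3 moves
final_diff = 0.05                     # |scaled_final_score - cur_board_value|
episode_avg = (sum(per_move_diffs) + final_diff) / (len(per_move_diffs) + 1)
# train_model sums episode_avg into average_diffs and finally returns
# average_diffs / len(outcomes), which is already a mean.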
(fourth file; name not preserved in the extract)

@@ -57,4 +57,11 @@ boards = {initial_state,
 
 # print(network.calculate_1_ply(Board.initial_state, [3,2], 1))
 
-network.play_against_network()
+diff = [0, 0]
+val = network.eval_state(Board.board_features_quack_fat(initial_state, 1))
+print(val)
+diff[0] += abs(-1-val)
+diff[1] += 1
+
+print(diff[1])
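Note: this replacement driver code appears to accumulate, in diff[0], the absolute error of the initial-state evaluation against a target of -1, with diff[1] counting samples; printing diff[1] alone looks like scaffolding. Presumably the mean error would be read off as below (a hedged sketch, not in the commit):

mean_error = diff[0] / diff[1]   # average |target - value| over the counted samples
print(mean_error)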