diff --git a/.gitignore b/.gitignore
index 08bc86a..03ee050 100644
--- a/.gitignore
+++ b/.gitignore
@@ -169,3 +169,6 @@ venv.bak/
 README.*
 !README.org
 models/
+.DS_Store
+bench/
+
diff --git a/main.py b/main.py
index bc8de09..b5a8ad0 100644
--- a/main.py
+++ b/main.py
@@ -3,38 +3,6 @@ import sys
 import os
 import time
 
-model_storage_path = 'models'
-
-# Create models folder
-if not os.path.exists(model_storage_path):
-    os.makedirs(model_storage_path)
-
-# Define helper functions
-def log_train_outcome(outcome, trained_eps = 0):
-    format_vars = { 'trained_eps': trained_eps,
-                    'count': len(train_outcome),
-                    'sum': sum(train_outcome),
-                    'mean': sum(train_outcome) / len(train_outcome),
-                    'time': int(time.time())
-    }
-    with open(os.path.join(config['model_path'], 'logs', "train.log"), 'a+') as f:
-        f.write("{time};{trained_eps};{count};{sum};{mean}".format(**format_vars) + "\n")
-
-
-def log_eval_outcomes(outcomes, trained_eps = 0):
-    for outcome in outcomes:
-        scores = outcome[1]
-        format_vars = { 'trained_eps': trained_eps,
-                        'method': outcome[0],
-                        'count': len(scores),
-                        'sum': sum(scores),
-                        'mean': sum(scores) / len(scores),
-                        'time': int(time.time())
-        }
-        with open(os.path.join(config['model_path'], 'logs', "eval.log"), 'a+') as f:
-            f.write("{time};{method};{trained_eps};{count};{sum};{mean}".format(**format_vars) + "\n")
-
-
 # Parse command line arguments
 parser = argparse.ArgumentParser(description="Backgammon games")
 parser.add_argument('--episodes', action='store', dest='episode_count',
@@ -47,13 +15,15 @@ parser.add_argument('--eval-methods', action='store',
                     default=['random'], nargs='*',
                     help='specifies evaluation methods')
 parser.add_argument('--eval', action='store_true',
-                    help='whether to evaluate the neural network with a random choice bot')
+                    help='evaluate the neural network with a random choice bot')
+parser.add_argument('--bench-eval-scores', action='store_true',
+                    help='benchmark scores of evaluation measures. episode counts and model specified as options are ignored.')
 parser.add_argument('--train', action='store_true',
-                    help='whether to train the neural network')
+                    help='train the neural network')
 parser.add_argument('--eval-after-train', action='store_true', dest='eval_after_train',
-                    help='whether to evaluate after each training session')
+                    help='evaluate after each training session')
 parser.add_argument('--play', action='store_true',
-                    help='whether to play with the neural network')
+                    help='play with the neural network')
 parser.add_argument('--start-episode', action='store', dest='start_episode',
                     type=int, default=0,
                     help='episode count to start at; purely for display purposes')
@@ -66,27 +36,73 @@ args = parser.parse_args()
 
 config = {
     'model': args.model,
-    'model_path': os.path.join(model_storage_path, args.model),
     'episode_count': args.episode_count,
     'eval_methods': args.eval_methods,
     'train': args.train,
     'play': args.play,
     'eval': args.eval,
+    'bench_eval_scores': args.bench_eval_scores,
    'eval_after_train': args.eval_after_train,
     'start_episode': args.start_episode,
     'train_perpetually': args.train_perpetually,
-    'model_storage_path': model_storage_path
+    'model_storage_path': 'models',
+    'bench_storage_path': 'bench'
 }
 
+# Create models folder
+if not os.path.exists(config['model_storage_path']):
+    os.makedirs(config['model_storage_path'])
+
+model_path = lambda: os.path.join(config['model_storage_path'], config['model'])
+
 # Make sure directories exist
-model_path = os.path.join(config['model_path'])
-log_path = os.path.join(model_path, 'logs')
-if not os.path.isdir(model_path):
-    os.mkdir(model_path)
+log_path = os.path.join(model_path(), 'logs')
+if not os.path.isdir(model_path()):
+    os.mkdir(model_path())
 if not os.path.isdir(log_path):
     os.mkdir(log_path)
+
+
+
+# Define helper functions
+def log_train_outcome(outcome, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "train.log")):
+    format_vars = { 'trained_eps': trained_eps,
+                    'count': len(train_outcome),
+                    'sum': sum(train_outcome),
+                    'mean': sum(train_outcome) / len(train_outcome),
+                    'time': int(time.time())
+    }
+    with open(log_path, 'a+') as f:
+        f.write("{time};{trained_eps};{count};{sum};{mean}".format(**format_vars) + "\n")
+def log_eval_outcomes(outcomes, trained_eps = 0, log_path = os.path.join(model_path(), 'logs', "eval.log")):
+    for outcome in outcomes:
+        scores = outcome[1]
+        format_vars = { 'trained_eps': trained_eps,
+                        'method': outcome[0],
+                        'count': len(scores),
+                        'sum': sum(scores),
+                        'mean': sum(scores) / len(scores),
+                        'time': int(time.time())
+        }
+        with open(log_path, 'a+') as f:
+            f.write("{time};{method};{trained_eps};{count};{sum};{mean}".format(**format_vars) + "\n")
+
+def log_bench_eval_outcomes(outcomes, log_path, index, time, trained_eps = 0):
+    for outcome in outcomes:
+        scores = outcome[1]
+        format_vars = { 'trained_eps': trained_eps,
+                        'method': outcome[0],
+                        'count': len(scores),
+                        'sum': sum(scores),
+                        'mean': sum(scores) / len(scores),
+                        'time': time,
+                        'index': index,
+        }
+        with open(log_path, 'a+') as f:
+            f.write("{method};{count};{index};{time};{sum};{mean}".format(**format_vars) + "\n")
+
 
 # Do actions specified by command-line
 if args.list_models:
     def get_eps_trained(folder):
@@ -94,7 +110,7 @@ if args.list_models:
             return int(f.read())
     model_folders = [ f.path
                       for f
-                      in os.scandir(model_storage_path)
+                      in os.scandir(config['model_storage_path'])
                       if f.is_dir() ]
     models = [ (folder, get_eps_trained(folder)) for folder in model_folders ]
     sys.stderr.write("Found {} model(s)\n".format(len(models)))
model(s)\n".format(len(models))) @@ -106,13 +122,13 @@ if args.list_models: if __name__ == "__main__": # Set up network from network import Network - network = Network(config, config['model']) - start_episode = network.episodes_trained # Set up variables episode_count = config['episode_count'] if args.train: + network = Network(config, config['model']) + start_episode = network.episodes_trained while True: train_outcome = network.train_model(episodes = episode_count, trained_eps = start_episode) start_episode += episode_count @@ -122,9 +138,58 @@ if __name__ == "__main__": log_eval_outcomes(eval_outcomes, trained_eps = start_episode) if not config['train_perpetually']: break + + elif args.eval: - outcomes = network.eval() + network = Network(config, config['model']) + start_episode = network.episodes_trained + # Evaluation measures are described in `config` + outcomes = network.eval(config['episode_count']) log_eval_outcomes(outcomes, trained_eps = start_episode) # elif args.play: # g.play(episodes = episode_count) - + + + elif args.bench_eval_scores: + # Make sure benchmark directory exists + if not os.path.isdir(config['bench_storage_path']): + os.mkdir(config['bench_storage_path']) + + config = config.copy() + config['model'] = 'bench' + + network = Network(config, config['model']) + start_episode = network.episodes_trained + + if start_episode == 0: + print("Model not trained! Beware of using non-existing models!") + exit() + + sample_count = 20 + episode_counts = [25, 50, 100, 250, 500, 1000, 2500, 5000, + 10000, 20000] + + def do_eval(sess): + for eval_method in config['eval_methods']: + result_path = os.path.join(config['bench_storage_path'], + eval_method) + "-{}.log".format(int(time.time())) + for n in episode_counts: + for i in range(sample_count): + start_time = time.time() + # Evaluation measure to be benchmarked are described in `config` + outcomes = network.eval(episode_count = n, + tf_session = sess) + time_diff = time.time() - start_time + log_bench_eval_outcomes(outcomes, + time = time_diff, + index = i, + trained_eps = start_episode, + log_path = result_path) + + # CMM: oh no + import tensorflow as tf + with tf.Session() as session: + network.restore_model(session) + do_eval(session) + + diff --git a/network.py b/network.py index 1dc4b62..6358761 100644 --- a/network.py +++ b/network.py @@ -22,7 +22,7 @@ class Network: def __init__(self, config, name): self.config = config - self.checkpoint_path = config['model_path'] + self.checkpoint_path = os.path.join(config['model_storage_path'], config['model']) self.name = name @@ -388,7 +388,25 @@ class Network: print_time_estimate(episode) sys.stderr.write("[TRAIN] Saving model for final episode...\n") - self.save_model(sess, episode + trained_eps) + self.save_model(sess, episode+trained_eps) + + writer.close() + + return outcomes + + + # take turn, which finds the best state and picks it, based on the current network + # save current state + # run training operation (session.run(self.training_op, {x:x, value_next, value_next})), (something which does the backprop, based on the state after having taken a turn, found before, and the state we saved in the beginning and from now we'll save it at the end of the turn + # save the current state again, so we can continue running backprop based on the "previous" turn. + + # NOTE: We need to make a method so that we can take a single turn or at least just pick the next best move, so we know how to evaluate according to TD-learning. 
Right now, our game just continues in a while loop without nothing to stop it! + + + + def eval(self, episode_count, trained_eps = 0, tf_session = None): + def do_eval(sess, method, episodes = 1000, trained_eps = 0): + start_time = time.time() writer.close() @@ -403,3 +421,23 @@ class Network: # save the current state again, so we can continue running backprop based on the "previous" turn. + + if tf_session == None: + with tf.Session(): + session.run(tf.global_variables_initializer()) + self.restore_model(session) + outcomes = [ (method, do_eval(session, + method, + episode_count, + trained_eps = trained_eps)) + for method + in self.config['eval_methods'] ] + return outcomes + else: + outcomes = [ (method, do_eval(tf_session, + method, + episode_count, + trained_eps = trained_eps)) + for method + in self.config['eval_methods'] ] + return outcomes diff --git a/plot.py b/plot.py index 5a94f51..5957854 100644 --- a/plot.py +++ b/plot.py @@ -9,9 +9,26 @@ import matplotlib.dates as mdates train_headers = ['timestamp', 'eps_train', 'eps_trained_session', 'sum', 'mean'] eval_headers = ['timestamp', 'method', 'eps_train', 'eval_eps_used', 'sum', 'mean'] +bench_headers = ['method', 'sample_count', 'i', 'time', 'sum', 'mean'] model_path = 'models' +def plot_bench(data_path): + df = pd.read_csv(data_path, sep=";", + names=bench_headers, index_col=[0,1,2]) + for method_label in df.index.levels[0]: + df_prime = df[['mean']].loc[method_label].unstack().T + plot = df_prime.plot.box() + plot.set_title("Evaluation variance, {}".format(method_label)) + plot.set_xlabel("Sample count") + plot.set_ylabel("Mean score") + plt.show(plot.figure) + + # for later use: + variances = df_prime.var() + print(variances) + + del df_prime, plot, variances def dataframes(model_name): def df_timestamp_to_datetime(df):