Fix reanalyse and format

werner-duvaud · werner-duvaud · commit 6a273e0c0cd3 · 2020-09-16T23:01:11.000+02:00
diff --git a/games/atari.py b/games/atari.py
@@ -18,7 +18,7 @@ def __init__(self):
         # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization
 
         self.seed = 0  # Seed for numpy, torch and the game
-        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. By default muzero uses every GPUs available
+        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every available GPU
 
 
 
diff --git a/games/breakout.py b/games/breakout.py
@@ -18,7 +18,7 @@ def __init__(self):
         # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization
 
         self.seed = 0  # Seed for numpy, torch and the game
-        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. By default muzero uses every GPUs available
+        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every available GPU
 
 
 
diff --git a/games/cartpole.py b/games/cartpole.py
@@ -13,7 +13,7 @@ def __init__(self):
         # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization
 
         self.seed = 0  # Seed for numpy, torch and the game
-        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. By default muzero uses every GPUs available
+        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every available GPU
 
 
 
diff --git a/games/connect4.py b/games/connect4.py
@@ -12,7 +12,7 @@ def __init__(self):
         # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization
 
         self.seed = 0  # Seed for numpy, torch and the game
-        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. By default muzero uses every GPUs available
+        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every available GPU
 
 
 
diff --git a/games/gomoku.py b/games/gomoku.py
@@ -13,7 +13,7 @@ def __init__(self):
         # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization
 
         self.seed = 0  # Seed for numpy, torch and the game
-        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. By default muzero uses every GPUs available
+        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every available GPU
 
 
 
diff --git a/games/gridworld.py b/games/gridworld.py
@@ -18,7 +18,7 @@ def __init__(self):
         # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization
 
         self.seed = 0  # Seed for numpy, torch and the game
-        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. By default muzero uses every GPUs available
+        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every available GPU
 
 
 
diff --git a/games/lunarlander.py b/games/lunarlander.py
@@ -13,7 +13,7 @@ def __init__(self):
         # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization
 
         self.seed = 0  # Seed for numpy, torch and the game
-        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. By default muzero uses every GPUs available
+        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every available GPU
 
 
 
diff --git a/games/simple_grid.py b/games/simple_grid.py
@@ -12,7 +12,7 @@ def __init__(self):
         # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization
 
         self.seed = 0  # Seed for numpy, torch and the game
-        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. By default muzero uses every GPUs available
+        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every available GPU
 
 
 
diff --git a/games/tictactoe.py b/games/tictactoe.py
@@ -12,7 +12,7 @@ def __init__(self):
         # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization
 
         self.seed = 0  # Seed for numpy, torch and the game
-        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. By default muzero uses every GPUs available
+        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every available GPU
 
 
 
diff --git a/games/twentyone.py b/games/twentyone.py
@@ -19,7 +19,7 @@ def __init__(self):
         # More information is available here: https://github.com/werner-duvaud/muzero-general/wiki/Hyperparameter-Optimization
 
         self.seed = 0  # Seed for numpy, torch and the game
-        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. By default muzero uses every GPUs available
+        self.max_num_gpus = None  # Fix the maximum number of GPUs to use. It's usually faster to use a single GPU (set it to 1) if it has enough memory. None will use every available GPU
 
 
 
diff --git a/muzero.py b/muzero.py
@@ -633,9 +633,7 @@ def hyperparameter_search(
                 parallel_experiments = 2
                 lr_init = nevergrad.p.Log(a_min=0.0001, a_max=0.1)
                 discount = nevergrad.p.Log(lower=0.95, upper=0.9999)
-                parametrization = nevergrad.p.Dict(
-                    lr_init=lr_init, discount=discount
-                )
+                parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount)
                 best_hyperparameters = hyperparameter_search(
                     game_name, parametrization, budget, parallel_experiments, 20
                 )
diff --git a/replay_buffer.py b/replay_buffer.py
@@ -17,11 +17,15 @@ class ReplayBuffer:
     def __init__(self, initial_checkpoint, initial_buffer, config):
         self.config = config
         self.buffer = copy.deepcopy(initial_buffer)
-        self.num_played_games = initial_checkpoint['num_played_games']
-        self.num_played_steps = initial_checkpoint['num_played_steps']
-        self.total_samples = sum([len(game_history.root_values) for game_history in self.buffer.values()])
+        self.num_played_games = initial_checkpoint["num_played_games"]
+        self.num_played_steps = initial_checkpoint["num_played_steps"]
+        self.total_samples = sum(
+            [len(game_history.root_values) for game_history in self.buffer.values()]
+        )
         if self.total_samples != 0:
-            print(f"Replay buffer initialized with {self.total_samples} samples ({self.num_played_games} games).\n")
+            print(
+                f"Replay buffer initialized with {self.total_samples} samples ({self.num_played_games} games).\n"
+            )
 
         # Fix random generator seed
         numpy.random.seed(self.config.seed)
@@ -203,11 +207,17 @@ def compute_target_value(self, game_history, index):
         # future, plus the discounted sum of all rewards until then.
         bootstrap_index = index + self.config.td_steps
         if bootstrap_index < len(game_history.root_values):
+            root_values = (
+                game_history.root_values
+                if game_history.reanalysed_predicted_root_values is None
+                else game_history.reanalysed_predicted_root_values
+            )
+            print(game_history.reanalysed_predicted_root_values is None)
             last_step_value = (
-                game_history.root_values[bootstrap_index]
+                root_values[bootstrap_index]
                 if game_history.to_play_history[bootstrap_index]
                 == game_history.to_play_history[index]
-                else -game_history.root_values[bootstrap_index]
+                else -root_values[bootstrap_index]
             )
 
             value = last_step_value * self.config.discount ** self.config.td_steps
@@ -323,8 +333,9 @@ def reanalyse(self, replay_buffer, shared_storage):
                     self.model.initial_inference(observations)[0],
                     self.config.support_size,
                 )
-                for i in range(len(game_history.root_values)):
-                    game_history.root_values[i] = values[i].item()
+                game_history.reanalysed_predicted_root_values = (
+                    torch.squeeze(values).detach().numpy()
+                )
 
             replay_buffer.update_game_history.remote(game_id, game_history)
             self.num_reanalysed_games += 1
diff --git a/results/cartpole/model.checkpoint b/results/cartpole/model.checkpoint
diff --git a/self_play.py b/self_play.py
@@ -136,7 +136,8 @@ def play_game(
                     numpy.array(observation).shape == self.config.observation_shape
                 ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}."
                 stacked_observations = game_history.get_stacked_observations(
-                    -1, self.config.stacked_observations,
+                    -1,
+                    self.config.stacked_observations,
                 )
 
                 # Choose the action
@@ -223,7 +224,7 @@ def select_opponent_action(self, opponent, stacked_observations):
     def select_action(node, temperature):
         """
         Select action according to the visit count distribution and the temperature.
-        The temperature is changed dynamically with the visit_softmax_temperature function 
+        The temperature is changed dynamically with the visit_softmax_temperature function
         in the config.
         """
         visit_counts = numpy.array(
@@ -300,7 +301,11 @@ def run(
                 set(self.config.action_space)
             ), "Legal actions should be a subset of the action space."
             root.expand(
-                legal_actions, to_play, reward, policy_logits, hidden_state,
+                legal_actions,
+                to_play,
+                reward,
+                policy_logits,
+                hidden_state,
             )
 
         if add_exploration_noise:
@@ -484,6 +489,7 @@ def __init__(self):
         self.to_play_history = []
         self.child_visits = []
         self.root_values = []
+        self.reanalysed_predicted_root_values = None
         # For PER
         self.priorities = None
         self.game_priority = None
diff --git a/trainer.py b/trainer.py
@@ -280,7 +280,12 @@ def update_lr(self):
 
     @staticmethod
     def loss_function(
-        value, reward, policy_logits, target_value, target_reward, target_policy,
+        value,
+        reward,
+        policy_logits,
+        target_value,
+        target_reward,
+        target_policy,
     ):
         # Cross-entropy seems to have a better convergence than MSE
         value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1)