diff --git a/.gitignore b/.gitignore
index f106bb6b..844f676b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -90,4 +90,6 @@ venv.bak/
 # mypy
 .mypy_cache/
 .dmypy.json
-dmypy.json
\ No newline at end of file
+dmypy.json
+
+results/
\ No newline at end of file
diff --git a/game_tournament.py b/game_tournament.py
new file mode 100644
index 00000000..81b1e363
--- /dev/null
+++ b/game_tournament.py
@@ -0,0 +1,392 @@
+import pickle
+
+import torch
+import copy
+import numpy
+
+from games.tictactoe import MuZeroConfig, Game
+import models
+import simplifiedMuZero.net2.models2 as models2
+from self_play import MCTS, GameHistory, SelfPlay
+
+
+class GameTournament:
+    def __init__(self, config: MuZeroConfig):
+        self.models = []
+        self.game = Game(config.seed)
+        self.config = config
+        self.board = numpy.zeros((3, 3), dtype="int32")
+        self.player = 0
+
+    def have_winner(self):
+        # Horizontal and vertical checks
+        for i in range(3):
+            if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all():
+                return True
+            if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all():
+                return True
+
+        # Diagonal checks
+        if (
+            self.board[0, 0] == self.player
+            and self.board[1, 1] == self.player
+            and self.board[2, 2] == self.player
+        ):
+            return True
+        if (
+            self.board[2, 0] == self.player
+            and self.board[1, 1] == self.player
+            and self.board[0, 2] == self.player
+        ):
+            return True
+
+        return False
+
+    def play_competition(self, model1, search_policy1, model2, search_policy2):
+        game_history = GameHistory()
+
+        observation = self.game.reset()
+
+        game_history.action_history.append(0)
+        game_history.observation_history.append(observation)  # store the observation returned by reset()
+        game_history.reward_history.append(0)
+        game_history.to_play_history.append(self.game.to_play())
+
+        done = False
+
+        model1.eval()
+        model2.eval()
+
+        is_model1 = True
+        while not done:
+            assert (
+                len(numpy.array(observation).shape) == 3
+            ), f"Observation should be 3 dimensional instead of {len(numpy.array(observation).shape)} dimensional. Got observation of shape: {numpy.array(observation).shape}"
+            assert (
+                numpy.array(observation).shape == self.config.observation_shape
+            ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}."
+            stacked_observations = game_history.get_stacked_observations(
+                -1, self.config.stacked_observations, len(self.config.action_space)
+            )
+
+            model = model1 if is_model1 else model2
+            search_policy = search_policy1 if is_model1 else search_policy2
+
+            root, mcts_info = search_policy(self.config).run(
+                model,
+                stacked_observations,
+                self.game.legal_actions(),
+                self.game.to_play(),  # to_play returns the id of the player whose turn it is (0 by default)
+                True,
+            )
+
+            action = SelfPlay.select_action(root, 0)  # temperature 0 means no exploration noise: always pick the most visited action
+            observation, reward, done = self.game.step(action)
+
+            game_history.store_search_statistics(root, self.config.action_space)
+
+            # Next batch
+            game_history.action_history.append(action)
+            game_history.observation_history.append(observation)  # append to the observation history; get_stacked_observations later reads it from the end backwards
+            game_history.reward_history.append(reward)
+            game_history.to_play_history.append(self.game.to_play())
+
+            # if the game is not over, the other model moves next
+            if not done:
+                is_model1 = not is_model1
+
+        # print("is model", is_model1, "reward is ", reward)
+
+        # Restore the player id to its previous value, otherwise the winner check (have_winner) would be wrong
+        self.game.env.player *= -1
+
+        # Return value:
+        # | is_model1 | reward > 0 | returned flag |
+        # |-----------|------------|---------------|
+        # |   True    |   True     |     True      |  model1 made the last move and won  -> model1 is the winner
+        # |   True    |   False    |     False     |  model1 made the last move and lost -> model2 is the winner
+        # |   False   |   True     |     False     |  model2 made the last move and won  -> model2 is the winner
+        # |   False   |   False    |     True      |  model2 made the last move and lost -> model1 is the winner
+        return self.game.env.have_winner(), is_model1 == (reward > 0)
+
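A reading aid, not part of the patch: the tuple returned by play_competition above is consumed as in the hypothetical snippet below (tournament, model_a and model_b are placeholder names), mirroring what play_tournament does further down in this file.

    have_winner, model1_won = tournament.play_competition(model_a, MCTS, model_b, MCTS)
    if not have_winner:
        result = "draw"
    else:
        result = "model_a wins" if model1_won else "model_b wins"
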
+    def play_with_expert(self, model, search_policy, expert_first=True):
+        game_history = GameHistory()
+
+        observation = self.game.reset()
+
+        game_history.action_history.append(0)
+        game_history.observation_history.append(observation)  # store the observation returned by reset()
+        game_history.reward_history.append(0)
+        game_history.to_play_history.append(self.game.to_play())
+
+        done = False
+
+        model.eval()
+
+        is_model = not expert_first
+        while not done:
+            assert (
+                len(numpy.array(observation).shape) == 3
+            ), f"Observation should be 3 dimensional instead of {len(numpy.array(observation).shape)} dimensional. Got observation of shape: {numpy.array(observation).shape}"
+            assert (
+                numpy.array(observation).shape == self.config.observation_shape
+            ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}."
+            stacked_observations = game_history.get_stacked_observations(
+                -1, self.config.stacked_observations, len(self.config.action_space)
+            )
+
+            if is_model:
+                root, mcts_info = search_policy(self.config).run(
+                    model,
+                    stacked_observations,
+                    self.game.legal_actions(),
+                    self.game.to_play(),  # to_play returns the id of the player whose turn it is (0 by default)
+                    True,
+                )
+                action = SelfPlay.select_action(root, 0)  # temperature 0 means no exploration noise: always pick the most visited action
+            else:
+                action = self.game.expert_agent()
+                root = None
+
+            observation, reward, done = self.game.step(action)
+
+            game_history.store_search_statistics(root, self.config.action_space)
+
+            # Next batch
+            game_history.action_history.append(action)
+            game_history.observation_history.append(observation)  # append to the observation history; get_stacked_observations later reads it from the end backwards
+            game_history.reward_history.append(reward)
+            game_history.to_play_history.append(self.game.to_play())
+
+            # if the game is not over, alternate between the model and the expert
+            if not done:
+                is_model = not is_model
+
+        # print("is model", is_model1, "reward is ", reward)
+
+        # Restore the player id to its previous value, otherwise the winner check (have_winner) would be wrong
+        self.game.env.player *= -1
+
+        # Return value (same convention as play_competition):
+        # | is_model | reward > 0 | returned flag |
+        # |----------|------------|---------------|
+        # |   True   |   True     |     True      |  the model made the last move and won  -> the model is the winner
+        # |   True   |   False    |     False     |  the model made the last move and lost -> the expert is the winner
+        # |   False  |   True     |     False     |  the expert made the last move and won -> the expert is the winner
+        # |   False  |   False    |     True      |  the expert made the last move and lost -> the model is the winner
+        return self.game.env.have_winner(), is_model == (reward > 0)
+
+    def close_game(self):
+        self.game.close()
+
+    def play_tournament(self, models, rollnum=1000):
+        model_num = len(models)
+
+        for i in range(model_num):
+            for j in range(i + 1, model_num):
+                model1 = models[i]["model"]
+                model2 = models[j]["model"]
+
+                # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)])
+                model1_win_num = 0
+                model2_win_num = 0
+                no_winner_num = 0
+
+                for _ in range(rollnum):
+                    have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS)
+
+                    if have_winner:
+                        if is_model1:
+                            model1_win_num += 1
+                        else:
+                            model2_win_num += 1
+                    else:
+                        no_winner_num += 1
+
+                # # swap the order and play again
+                # for _ in range(rollnum):
+                #     have_winner, is_model1 = self.play_competition(model2, MCTS, model1, MCTS)
+                #
+                #     if have_winner:
+                #         if is_model1:
+                #             model2_win_num += 1
+                #         else:
+                #             model1_win_num += 1
+                #     else:
+                #         no_winner_num += 1
+
+                # print(is_model1)
+
+                print(models[i]["name"], " ,", models[j]["name"], " : ")
+
+                print(models[i]["name"], " win : ", model1_win_num)
+                print(models[j]["name"], " win : ", model2_win_num)
+                print("No Winner", no_winner_num)
+                print("===================================")
+
+        model1_win_num = 0
+        model2_win_num = 0
+        no_winner_num = 0
+        for i in range(model_num):
+            for j in range(i + 1, model_num):
+                model1 = models[i]["model"]
+                model2 = models[j]["model"]
+
+                # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)])
+                model1_win_num = 0
+                model2_win_num = 0
+                no_winner_num = 0
+
+                # second pass with the play order swapped, so the labels printed below (models[j] first) match the counters
+                for _ in range(rollnum):
+                    have_winner, is_model1 = self.play_competition(model2, MCTS, model1, MCTS)
+
+                    if have_winner:
+                        if is_model1:
+                            model1_win_num += 1
+                        else:
+                            model2_win_num += 1
+                    else:
+                        no_winner_num += 1
+
+                print(models[j]["name"], " ,", models[i]["name"], " : ")
+
+                print(models[j]["name"], " win : ", model1_win_num)
+                print(models[i]["name"], " win : ", model2_win_num)
+                print("No Winner", no_winner_num)
+                print("===================================")
+
+    def play_tournament_with_expert(self, models, rollnum=1000):
+        model_num = len(models)
+
+        for i in range(model_num):
+            model = models[i]["model"]
+
+            # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)])
+            model_win_num = 0
+            expert_win_num = 0
+            no_winner_num = 0
+
+            for _ in range(rollnum):
+                have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False)
+
+                if have_winner:
+                    if is_model:
+                        model_win_num += 1
+                    else:
+                        expert_win_num += 1
+                else:
+                    no_winner_num += 1
+
+                # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True)
+                #
+                # if have_winner:
+                #     if is_model:
+                #         model_win_num += 1
+                #     else:
+                #         expert_win_num += 1
+                # else:
+                #     no_winner_num += 1
+
+            print(models[i]["name"], " ,", "expert : ")
+
+            print(models[i]["name"], " win : ", model_win_num)
+            print("expert win : ", expert_win_num)
+            print("No Winner", no_winner_num)
+            print("===================================")
+
+            model_win_num = 0
+            expert_win_num = 0
+            no_winner_num = 0
+            for _ in range(rollnum):
+                # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False)
+                #
+                # if have_winner:
+                #     if is_model:
+                #         model_win_num += 1
+                #     else:
+                #         expert_win_num += 1
+                # else:
+                #     no_winner_num += 1
+
+                have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True)
+
+                if have_winner:
+                    if is_model:
+                        model_win_num += 1
+                    else:
+                        expert_win_num += 1
+                else:
+                    no_winner_num += 1
+
+            print("expert : ", " ,", models[i]["name"])
+
+            print("expert win : ", expert_win_num)
+            print(models[i]["name"], " win : ", model_win_num)
+            print("No Winner", no_winner_num)
+            print("===================================")
+
+
+def load_model(model_cls, model_path, config):
+    checkpoint = torch.load(model_path)
+    model = model_cls(config)
+    model.set_weights(checkpoint["weights"])
+
+    return model
+
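A hedged variant of the load_model helper above, not part of the patch: if a checkpoint was saved during a GPU run and is later loaded on a CPU-only machine, torch.load may need an explicit map_location. The "weights" key and set_weights call match the checkpoint format used elsewhere in this repository; load_model_cpu is a hypothetical name.

    def load_model_cpu(model_cls, model_path, config):
        checkpoint = torch.load(model_path, map_location="cpu")  # keep all tensors on the CPU
        model = model_cls(config)
        model.set_weights(checkpoint["weights"])
        return model
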
+
+if __name__ == "__main__":
+    config = MuZeroConfig()
+
+    # config.network = "fullyconnected"
+    # checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint"
+    checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe2\2023-08-23--16-24-04\model.checkpoint"
+    checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe2\2023-08-23--17-12-53\model.checkpoint"
+    muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1, config)
+
+    # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint"
+    # muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path, config)
+
+    config2 = MuZeroConfig()
+    # config2.network = "resnet"
+    # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint"
+    # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-22--20-25-51\muzero_2net\model.checkpoint"
+    muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe2\2023-08-24--02-55-21\muzero_2net\model.checkpoint"
+    muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config2)
+
+    # uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint"
+    # uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path, config)
+    #
+    # without_rb_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint"
+    # without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path, config)
+    #
+    # muzero_no_policy_value_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint"
+    # muzero_no_policy_model = load_model(models.MuZeroNetwork, muzero_no_policy_value_checkpoint_path, config)
+    #
+    #
+    # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint"
+    # simplified_muzero = load_model(models.MuZeroNetwork, simplified_muzero_checkpoint_path, config)
+    #
+    # # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-18--03-02-10\MuZeroNetwork_2net\model.checkpoint"
+    # # simplified_muzero = load_model(models_2net.SimplifiedMuZeroNetwork, simplified_muzero_checkpoint_path, config)
+
+    game_tournament = GameTournament(config)
+
+    models = [
+        {"name": "muzero_2net", "model": muzero_2net_model},
+        # {"name": "uniform", "model": uniform_model},
+        {"name": "muzero", "model": muzero_model},
+        # {"name": "muzero2", "model": muzero_model},
+        # {"name": "without_rb", "model": without_rb_model},
+        # {"name": "no policy value", "model": muzero_no_policy_model},
+        # {"name": "simplified_muzero", "model": without_rb_model},
+    ]
+
+    # game_tournament.play_tournament(models, rollnum=1000)
+    # game_tournament.play_tournament(models, rollnum=1000)
+    game_tournament.play_tournament_with_expert(models, rollnum=500)
+
+    game_tournament.close_game()
+
diff --git a/games/simple_grid.py b/games/simple_grid.py
index f26ae429..d163d7de 100644
--- a/games/simple_grid.py
+++ b/games/simple_grid.py
@@ -23,6 +23,8 @@ def __init__(self):
         self.players = list(range(1))  # List of players. You should only edit the length
         self.stacked_observations = 0  # Number of previous observations and previous actions to add to the current observation
 
+        self.action_replace = True
+
         # Evaluate
         self.muzero_player = 0  # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second)
         self.opponent = None  # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class
diff --git a/games/tictactoe.py b/games/tictactoe.py
index f331a9ae..ff9a90bf 100644
--- a/games/tictactoe.py
+++ b/games/tictactoe.py
@@ -27,7 +27,8 @@ def __init__(self):
         self.muzero_player = 0  # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second)
         self.opponent = "expert"  # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class
-
+        # Whether an action may be selected more than once
+        self.action_replace = False
 
         ### Self-Play
         self.num_workers = 1  # Number of simultaneous threads/workers self-playing to feed the replay buffer
@@ -48,7 +49,8 @@ def __init__(self):
 
         ### Network
-        self.network = "resnet"  # "resnet" / "fullyconnected"
+        # self.network = "resnet"  # "resnet" / "fullyconnected"
+        self.network = "fullyconnected"
         self.support_size = 10  # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. Choose it so that support_size <= sqrt(max(abs(discounted reward)))
 
         # Residual Network
@@ -63,19 +65,27 @@ def __init__(self):
         self.resnet_fc_policy_layers = [8]  # Define the hidden layers in the policy head of the prediction network
 
         # Fully Connected Network
+        # self.encoding_size = 32
+        # self.fc_representation_layers = []  # Define the hidden layers in the representation network
+        # self.fc_dynamics_layers = [16]  # Define the hidden layers in the dynamics network
+        # self.fc_reward_layers = [16]  # Define the hidden layers in the reward network
+        # self.fc_value_layers = []  # Define the hidden layers in the value network
+        # self.fc_policy_layers = []  # Define the hidden layers in the policy network
+
         self.encoding_size = 32
-        self.fc_representation_layers = []  # Define the hidden layers in the representation network
+        self.fc_representation_layers = [16]  # Define the hidden layers in the representation network
         self.fc_dynamics_layers = [16]  # Define the hidden layers in the dynamics network
         self.fc_reward_layers = [16]  # Define the hidden layers in the reward network
-        self.fc_value_layers = []  # Define the hidden layers in the value network
-        self.fc_policy_layers = []  # Define the hidden layers in the policy network
-
+        self.fc_value_layers = [16]  # Define the hidden layers in the value network
+        self.fc_policy_layers = [16]  # Define the hidden layers in the policy network
 
         ### Training
         self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")  # Path to store the model weights and TensorBoard logs
         self.save_model = True  # Save the checkpoint in results_path as model.checkpoint
-        self.training_steps = 1000000  # Total number of training steps (ie weights update according to a batch)
+        # self.training_steps = 1000000  # Total number of training steps (ie weights update according to a batch)
+        # self.training_steps = 50000
+        self.training_steps = 500000
         self.batch_size = 64  # Number of parts of games to train on at each training step
         self.checkpoint_interval = 10  # Number of training steps before using the model for self-playing
         self.value_loss_weight = 0.25  # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
diff --git a/models.py b/models.py
index be847fef..d4b8bc2f 100644
--- a/models.py
+++ b/models.py
@@ -94,6 +94,7 @@ def __init__(
         super().__init__()
         self.action_space_size = action_space_size
         self.full_support_size = 2 * support_size + 1
+        # the support covers every integer in [-support_size, support_size], hence 2 * support_size + 1 entries (the +1 accounts for zero)
 
         self.representation_network = torch.nn.DataParallel(
             mlp(
@@ -107,6 +108,7 @@
             )
         )
 
+        # the dynamics network takes the encoded state concatenated with the one-hot action, hence encoding_size + action_space_size inputs
         self.dynamics_encoded_state_network = torch.nn.DataParallel(
             mlp(
                 encoding_size + self.action_space_size,
@@ -115,14 +117,14 @@
             )
         )
         self.dynamics_reward_network = torch.nn.DataParallel(
-            mlp(encoding_size, fc_reward_layers, self.full_support_size)
+            mlp(encoding_size, fc_reward_layers, self.full_support_size)  # output size is full_support_size because rewards are encoded over [-support_size, support_size]
         )
 
         self.prediction_policy_network = torch.nn.DataParallel(
-            mlp(encoding_size, fc_policy_layers, self.action_space_size)
+            mlp(encoding_size, fc_policy_layers, self.action_space_size)  # one logit per action
         )
         self.prediction_value_network = torch.nn.DataParallel(
-            mlp(encoding_size, fc_value_layers, self.full_support_size)
+            mlp(encoding_size, fc_value_layers, self.full_support_size)  # output size is full_support_size because values are encoded over [-support_size, support_size]
         )
 
     def prediction(self, encoded_state):
@@ -134,16 +136,19 @@ def representation(self, observation):
         encoded_state = self.representation_network(
             observation.view(observation.shape[0], -1)
         )
+
+        # Min-max normalization
         # Scale encoded state between [0, 1] (See appendix paper Training)
         min_encoded_state = encoded_state.min(1, keepdim=True)[0]
         max_encoded_state = encoded_state.max(1, keepdim=True)[0]
         scale_encoded_state = max_encoded_state - min_encoded_state
-        scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5
+        scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5  # avoid division by zero (NaN)
         encoded_state_normalized = (
             encoded_state - min_encoded_state
         ) / scale_encoded_state
         return encoded_state_normalized
 
+    # dynamics differs from representation only in that it concatenates the encoded state with the action before the forward pass; representation is not conditioned on an action
     def dynamics(self, encoded_state, action):
         # Stack encoded_state with a game specific one hot encoded action (See paper appendix Network Architecture)
         action_one_hot = (
@@ -151,18 +156,19 @@ def dynamics(self, encoded_state, action):
             torch.zeros((action.shape[0], self.action_space_size))
             .to(action.device)
             .float()
         )
-        action_one_hot.scatter_(1, action.long(), 1.0)
+        action_one_hot.scatter_(1, action.long(), 1.0)  # set the chosen action's index to 1
         x = torch.cat((encoded_state, action_one_hot), dim=1)
 
         next_encoded_state = self.dynamics_encoded_state_network(x)
 
         reward = self.dynamics_reward_network(next_encoded_state)
 
+        # Min-max normalization
         # Scale encoded state between [0, 1] (See paper appendix Training)
         min_next_encoded_state = next_encoded_state.min(1, keepdim=True)[0]
         max_next_encoded_state = next_encoded_state.max(1, keepdim=True)[0]
         scale_next_encoded_state = max_next_encoded_state - min_next_encoded_state
-        scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5
+        scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5  # avoid division by zero (NaN)
         next_encoded_state_normalized = (
             next_encoded_state - min_next_encoded_state
         ) / scale_next_encoded_state
@@ -172,7 +178,7 @@ def dynamics(self, encoded_state, action):
     def initial_inference(self, observation):
         encoded_state = self.representation(observation)
         policy_logits, value = self.prediction(encoded_state)
-        # reward equal to 0 for consistency
+        # reward equal to 0 for consistency (the initial step has no reward)
         reward = torch.log(
             (
                 torch.zeros(1, self.full_support_size)
                 .scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0)
                 .repeat(len(observation), 1)
                 .to(observation.device)
            )
        )
+        # reward looks like [[0, ..., 0, 1, 0, ..., 0], ...]: a 1 in the middle of the support (reward 0), repeated once per observation in the batch
 
         return (
             value,
@@ -605,8 +612,8 @@ def initial_inference(self, observation):
         reward = torch.log(
             (
                 torch.zeros(1, self.full_support_size)
-                .scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0)
-                .repeat(len(observation), 1)
+                .scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0)  # set index support_size (the middle of the support, i.e. reward 0) to 1
+                .repeat(len(observation), 1)  # repeat once per observation so the reward tensor matches the batch size
                 .to(observation.device)
             )
         )
@@ -637,29 +644,29 @@ def mlp(
     sizes = [input_size] + layer_sizes + [output_size]
     layers = []
    for i in range(len(sizes) - 1):
-        act = activation if i < len(sizes) - 2 else output_activation
+        act = activation if i < len(sizes) - 2 else output_activation  # output_activation for the last layer, activation everywhere else
         layers += [torch.nn.Linear(sizes[i], sizes[i + 1]), act()]
     return torch.nn.Sequential(*layers)
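The support_to_scalar / scalar_to_support changes in the rest of this hunk only add comments. As a reading aid, here is a small hypothetical round trip through the two helpers (not part of the patch): a scalar is first squashed by the invertible scaling h(x) = sign(x) * (sqrt(|x| + 1) - 1) + 0.001 * x, then split between the two neighbouring integers of the [-support_size, support_size] support.

    import torch

    # h(8) = sqrt(9) - 1 + 0.008 = 2.008, so the mass lands on support indices
    # support_size + 2 and support_size + 3 with weights 0.992 and 0.008.
    target = scalar_to_support(torch.tensor([[8.0]]), support_size=10)      # shape [1, 1, 21]
    value = support_to_scalar(torch.log(target[:, 0, :]), support_size=10)  # value is approximately 8.0 after inverting h
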
 
-def support_to_scalar(logits, support_size):
+def support_to_scalar(logits, support_size):  # logits is the categorical (log) representation of the value; support_size defines the target range
     """
     Transform a categorical representation to a scalar
     See paper appendix Network Architecture
     """
     # Decode to a scalar
-    probabilities = torch.softmax(logits, dim=1)
+    probabilities = torch.softmax(logits, dim=1)  # softmax over dim=1 so each row sums to 1; shape [batch, full_support_size]
     support = (
-        torch.tensor([x for x in range(-support_size, support_size + 1)])
+        torch.tensor([x for x in range(-support_size, support_size + 1)])  # the support is the integer range [-support_size, support_size]
         .expand(probabilities.shape)
         .float()
         .to(device=probabilities.device)
-    )
-    x = torch.sum(support * probabilities, dim=1, keepdim=True)
+    )  # shape [batch, full_support_size]
+    x = torch.sum(support * probabilities, dim=1, keepdim=True)  # expectation over the support; keepdim=True keeps shape [batch, 1]
 
     # Invert the scaling (defined in https://arxiv.org/abs/1805.11593)
-    x = torch.sign(x) * (
-        ((torch.sqrt(1 + 4 * 0.001 * (torch.abs(x) + 1 + 0.001)) - 1) / (2 * 0.001))
+    x = torch.sign(x) * (  # torch.sign extracts the sign of x (-1, 0 or +1)
+        ((torch.sqrt(1 + 4 * 0.001 * (torch.abs(x) + 1 + 0.001)) - 1) / (2 * 0.001))  # (sqrt(1 + 0.004 * (|x| + 1.001)) - 1) / 0.002, squared minus 1 on the next lines
         ** 2
         - 1
     )
@@ -675,9 +682,9 @@ def scalar_to_support(x, support_size):
     x = torch.sign(x) * (torch.sqrt(torch.abs(x) + 1) - 1) + 0.001 * x
 
     # Encode on a vector
-    x = torch.clamp(x, -support_size, support_size)
-    floor = x.floor()
-    prob = x - floor
+    x = torch.clamp(x, -support_size, support_size)  # clamp x into [-support_size, support_size]
+    floor = x.floor()  # round down (ceil would round up)
+    prob = x - floor  # keep the fractional part; it is the weight split between the two neighbouring support indices (support_to_scalar multiplies each index by its probability)
     logits = torch.zeros(x.shape[0], x.shape[1], 2 * support_size + 1).to(x.device)
     logits.scatter_(
         2, (floor + support_size).long().unsqueeze(-1), (1 - prob).unsqueeze(-1)
diff --git a/muzero.py b/muzero.py
index f7601c9b..3e075e96 100644
--- a/muzero.py
+++ b/muzero.py
@@ -43,6 +43,7 @@ def __init__(self, game_name, config=None, split_resources_in=1):
         # Load the game and the config from the module with the game name
         try:
             game_module = importlib.import_module("games." + game_name)
+            print("games." + game_name)
             self.Game = game_module.Game
             self.config = game_module.MuZeroConfig()
         except ModuleNotFoundError as err:
@@ -671,7 +672,10 @@ def load_model_menu(muzero, game_name):
             choice = input("Invalid input, enter a number listed above: ")
         choice = int(choice)
         if choice == 0:
+            start_time = time.time()
             muzero.train()
+            end_time = time.time()
+            print("Elapsed time: {:.2f} s".format(end_time - start_time))
         elif choice == 1:
             load_model_menu(muzero, game_name)
         elif choice == 2:
diff --git a/muzero_2net.py b/muzero_2net.py
new file mode 100644
index 00000000..fe9f6478
--- /dev/null
+++ b/muzero_2net.py
@@ -0,0 +1,723 @@
+import copy
+import importlib
+import json
+import math
+import pathlib
+import pickle
+import sys
+import time
+
+import nevergrad
+import numpy
+import ray
+import torch
+from torch.utils.tensorboard import SummaryWriter
+
+sys.path.append("")
+
+import diagnose_model
+# import simplifiedMuZero.net2.models_2net as models
+import models
+from simplifiedMuZero.net2.models2 import MuZeroNetwork_2net
+import simplifiedMuZero.net2.replay_buffer_2net as replay_buffer
+import simplifiedMuZero.net2.self_play_2net as self_play
+import shared_storage
+import simplifiedMuZero.net2.trainer_2net as trainer
+
+
+class MuZero_2Net:
+    """
+    Main class to manage MuZero.
+
+    Args:
+        game_name (str): Name of the game module, it should match the name of a .py file
+        in the "./games" directory.
+
+        config (dict, MuZeroConfig, optional): Override the default config of the game.
+ + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + + Example: + >>> muzero = MuZero_2Net("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # 重命名路径,以便区分不同的模型 + self.config.results_path /= "muzero_2net" + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
+ """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # Launch workers + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. 
+ """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + # model = models.SimplifiedMuZeroNetwork(config) + model = MuZeroNetwork_2net(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_2Net(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_2Net(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero_2Net): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero_2Net(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero_2Net(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero_2Net(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero_2Net(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/muzero_general.py b/muzero_general.py new file mode 100644 index 00000000..b3fb9411 --- /dev/null +++ b/muzero_general.py @@ -0,0 +1,416 @@ +import importlib +import ray +import pathlib + +import numpy +import torch +from torch.utils.tensorboard import SummaryWriter + +import math +import copy + +from simplifiedMuZero.without_rb.game_play import GamePlay +from simplifiedMuZero.without_rb.play_buffer import PlayBuffer +from 
simplifiedMuZero.without_rb.trainer_no_PV import Trainer +from muzero import load_model_menu, hyperparameter_search + +import models + + +class CPUActorWithClass: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config, model_cls): + model = model_cls(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + +class MuZeroGeneral: + def __init__(self, game_name, model_cls, config=None, split_resources_in=1, save_path_ex=None): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + if save_path_ex: + self.config.results_path /= save_path_ex + else: + self.config.results_path /= model_cls.__name__ + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + self.model_cls = model_cls + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # using random search instand of MCTS + self.config.temperature_threshold = 0 + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." 
+ ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActorWithClass() + cpu_weights = cpu_actor.get_initial_weights(self.config, self.model_cls) + self.checkpoint["weights"], self.summary = copy.deepcopy((cpu_weights)) + + + def logging_loop(self, writer, training_steps): + + # print( + # "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + # ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # # Save model representation + # writer.add_text( + # "Model summary", + # str(model).replace("\n", " \n\n") # self.summary, 换成其它的 + # ) + # Loop for updating the training performance + counter = training_steps + + try: + if True: + # while checkpoint["training_step"] < config.training_steps: + writer.add_scalar( + "1.Total_reward/1.Total_reward", + self.checkpoint["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + self.checkpoint["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + self.checkpoint["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + self.checkpoint["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + self.checkpoint["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + self.checkpoint["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", self.checkpoint["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", self.checkpoint["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + self.checkpoint["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + self.checkpoint["training_step"] / max(1, self.checkpoint["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", self.checkpoint["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", self.checkpoint["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", self.checkpoint["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", self.checkpoint["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", self.checkpoint["policy_loss"], counter) + print( + f'Last test reward: {self.checkpoint["total_reward"]:.2f}. Training step: {self.checkpoint["training_step"]}/{self.config.training_steps}. 
Played games: {self.checkpoint["num_played_games"]}. Loss: {self.checkpoint["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + # time.sleep(0.5) + except KeyboardInterrupt: + pass + + # if config.save_model: + # # Persist replay buffer to disk + # path = config.results_path / "replay_buffer.pkl" + # print(f"\n\nPersisting replay buffer games to disk at {path}") + # pickle.dump( + # { + # "buffer": buffer, + # "num_played_games": checkpoint["num_played_games"], + # "num_played_steps": checkpoint["num_played_steps"], + # "num_reanalysed_games": checkpoint["num_reanalysed_games"], + # }, + # open(path, "wb"), + # ) + + def update_gameplay_checkpoint(self, game_history): + self.checkpoint["episode_length"] = len(game_history.action_history) - 1 + self.checkpoint["total_reward"] = sum(game_history.reward_history) + self.checkpoint["mean_value"] = numpy.mean( [value for value in game_history.root_values if value]) + + if 1 < len(self.config.players): + self.checkpoint["muzero_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == self.config.muzero_player + ) + self.checkpoint["opponent_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != self.config.muzero_player + ) + + def save_checkpoint(self, path=None): #将模型存储在文件中 + if not path: + path = self.config.results_path / "model.checkpoint" + + torch.save(self.checkpoint, path) + + def train(self, log_in_tensorboard=True): + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + + trainer = Trainer(self.model_cls, self.checkpoint, self.config) + game_play = GamePlay(trainer.model, self.checkpoint, self.Game, self.config, self.config.seed) + buffer = {} + play_buffer = PlayBuffer(self.checkpoint, buffer, self.config) + + step = 1 # 间隔,即每次模拟后训练多少次 + max_steps = int(self.config.training_steps/step) + # max_steps = 2000 + + writer = SummaryWriter(self.config.results_path) + + for episode in range(max_steps): + game_id, game_history = game_play.play_game(game_play.config.visit_softmax_temperature_fn(0), game_play.config.temperature_threshold, False, "self",0) + + # print(game_id) + # print(game_history.action_history) + # print(game_history.reward_history) + # print(game_history.to_play_history) + # # print(game_history.observation_history) + # print("child visits", game_history.child_visits) + # print(game_history.root_values) # root value指的是root节点的UCB值 + + play_buffer.update_game_history(game_id, game_history) + self.update_gameplay_checkpoint( game_history) + + for i in range(step): + index_batch, batch = play_buffer.get_batch() + # print(batch[1]) + trainer.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = trainer.update_weights(batch) + + + training_step = episode * step + i + if training_step % self.config.checkpoint_interval == 0: + self.checkpoint["weights"] = copy.deepcopy(trainer.model.get_weights()) + self.checkpoint["optimizer_state"] =copy.deepcopy(models.dict_to_cpu(trainer.optimizer.state_dict()) ) + + if self.config.save_model: + self.save_checkpoint() + self.checkpoint["training_step"] = training_step + self.checkpoint["lr"] = trainer.optimizer.param_groups[0]["lr"] + self.checkpoint["total_loss"] = total_loss + self.checkpoint["value_loss"] = value_loss + self.checkpoint["reward_loss"] = reward_loss + self.checkpoint["policy_loss"] = policy_loss + + # print(training_step) + # if 
training_step % 500 == 0: + # if training_step % config.checkpoint_interval == 0: + # # print(training_step) + # logging_loop(config, checkpoint, writer) + + self.logging_loop(writer, training_step) + + + writer.close() + + game_play.close_game() + +# if __name__ == "__main__": +# # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") +# # start_time = time.time() +# # muzero.train() +# # end_time = time.time() +# # print("耗时: {:.2f}秒".format(end_time - start_time)) +# model_cls = models.MuZeroNetwork +# if len(sys.argv) == 2: +# # Train directly with: python muzero.py cartpole +# muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) +# muzero.train() +# elif len(sys.argv) == 3: +# # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' +# config = json.loads(sys.argv[2]) +# muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) +# muzero.train() +# else: +# print("\nWelcome to MuZero! Here's a list of games:") +# # Let user pick a game +# games = [ +# filename.stem +# for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) +# if filename.name != "abstract_game.py" +# ] +# for i in range(len(games)): +# print(f"{i}. {games[i]}") +# choice = input("Enter a number to choose the game: ") +# valid_inputs = [str(i) for i in range(len(games))] +# while choice not in valid_inputs: +# choice = input("Invalid input, enter a number listed above: ") +# +# # Initialize MuZero +# choice = int(choice) +# game_name = games[choice] +# muzero = MuZeroGeneral(game_name, model_cls=model_cls) +# +# while True: +# # Configure running options +# options = [ +# "Train", +# "Load pretrained model", +# "Diagnose model", +# "Render some self play games", +# "Play against MuZero", +# "Test the game manually", +# "Hyperparameter search", +# "Exit", +# ] +# print() +# for i in range(len(options)): +# print(f"{i}. 
{options[i]}") +# +# choice = input("Enter a number to choose an action: ") +# valid_inputs = [str(i) for i in range(len(options))] +# while choice not in valid_inputs: +# choice = input("Invalid input, enter a number listed above: ") +# choice = int(choice) +# if choice == 0: +# start_time = time.time() +# muzero.train() +# end_time = time.time() +# print("耗时: {:.2f}秒".format(end_time - start_time)) +# elif choice == 1: +# load_model_menu(muzero, game_name) +# elif choice == 2: +# muzero.diagnose_model(30) +# elif choice == 3: +# muzero.test(render=True, opponent="self", muzero_player=None) +# elif choice == 4: +# muzero.test(render=True, opponent="human", muzero_player=0) +# elif choice == 5: +# env = muzero.Game() +# env.reset() +# env.render() +# +# done = False +# while not done: +# action = env.human_to_action() +# observation, reward, done = env.step(action) +# print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") +# env.render() +# elif choice == 6: +# # Define here the parameters to tune +# # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html +# muzero.terminate_workers() +# del muzero +# budget = 20 +# parallel_experiments = 2 +# lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) +# discount = nevergrad.p.Log(lower=0.95, upper=0.9999) +# parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) +# best_hyperparameters = hyperparameter_search( +# game_name, parametrization, budget, parallel_experiments, 20 +# ) +# muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) +# else: +# break +# print("\nDone") diff --git a/muzero_no_pv.py b/muzero_no_pv.py new file mode 100644 index 00000000..e94789ed --- /dev/null +++ b/muzero_no_pv.py @@ -0,0 +1,716 @@ +import copy +import importlib +import json +import math +import pathlib +import pickle +import sys +import time + +import nevergrad +import numpy +import ray +import torch +from torch.utils.tensorboard import SummaryWriter + +import diagnose_model +import models +import replay_buffer +import self_play +import shared_storage +import simplifiedMuZero.no_pv.trainer_no_pv as trainer + + +class MuZero: + """ + Main class to manage MuZero. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + config (dict, MuZeroConfig, optional): Override the default config of the game. + + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + + Example: + >>> muzero = MuZero("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." 
+ ) + else: + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
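+
+ Note (illustrative, with hypothetical config values): the GPU budget is split
+ below across the training worker, the self-play workers, the optional test
+ worker and the optional reanalyse worker. For example, with self.num_gpus = 1,
+ train_on_gpu and selfplay_on_gpu both True, num_workers = 2,
+ log_in_tensorboard = True and use_last_model_value = False, each GPU-bound
+ worker is allocated 1 / (1 + 2*1 + 1*1 + 0) = 0.25 of a GPU.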
+ """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # Launch workers + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. 
+ """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + model = models.MuZeroNetwork(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
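+
+ Example (a sketch mirroring the interactive menu at the bottom of this file;
+ "cartpole" is the same placeholder game used in the class docstring):
+
+ >>> parametrization = nevergrad.p.Dict(
+ ... lr_init=nevergrad.p.Log(lower=0.0001, upper=0.1),
+ ... discount=nevergrad.p.Log(lower=0.95, upper=0.9999),
+ ... )
+ >>> best = hyperparameter_search("cartpole", parametrization, 20, 2, 20)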
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/muzero_rhea.py b/muzero_rhea.py new file mode 100644 index 00000000..07ceee18 --- /dev/null +++ b/muzero_rhea.py @@ -0,0 +1,719 @@ +import copy +import importlib +import json +import math +import pathlib +import pickle +import sys +import time + +import nevergrad +import numpy +import ray +import torch +from torch.utils.tensorboard import SummaryWriter + +import diagnose_model +import models +import replay_buffer +import 
simplifiedMuZero.search_policy.rhea_self_play as self_play +import shared_storage +import trainer + + +class MuZero_Rhea: + """ + Main class to manage MuZero. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + config (dict, MuZeroConfig, optional): Override the default config of the game. + + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + + Example: + >>> muzero = MuZero_Rhea("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # 重命名路径,以便区分不同的模型 + self.config.results_path /= self.__class__.__name__ + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
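+
+ Note: self-play in this variant is driven by SelfPlayRhea actors from
+ simplifiedMuZero.search_policy.rhea_self_play (imported above as self_play);
+ config.num_workers of them are spawned below, each seeded with
+ config.seed + its worker index.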
+ """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlayRhea.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # Launch workers + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. 
+ """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlayRhea.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlayRhea.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + model = models.MuZeroNetwork(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
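+
+ Note: Nevergrad's OnePlusOne optimizer minimizes its objective, which is why
+ each finished experiment reports the negated test reward:
+
+ >>> optimizer.tell(experiment.param, -result)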
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_Rhea(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_Rhea(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero_Rhea): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero_Rhea(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero_Rhea(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero_Rhea(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero_Rhea(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/muzero_uniform.py b/muzero_uniform.py new file mode 100644 index 00000000..53d4a0b9 --- /dev/null +++ b/muzero_uniform.py @@ -0,0 +1,721 @@ +import copy +import importlib +import json +import math +import pathlib +import pickle +import sys +import time + +import nevergrad +import numpy +import ray +import torch +from torch.utils.tensorboard import SummaryWriter + +import diagnose_model +import models +import replay_buffer +import 
self_play +# import simplifiedMuZero.search_policy.self_play_uniform_search as self_play +import shared_storage +import trainer + + +class MuZero_uniform: + """ + Main class to manage MuZero. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + config (dict, MuZeroConfig, optional): Override the default config of the game. + + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + + Example: + >>> muzero = MuZero_uniform("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # 重命名路径,以便区分不同的模型 + self.config.results_path /= "muzero_uniform" + self.config.temperature_threshold = 0 + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
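+
+ Note: __init__ above redirects config.results_path into a "muzero_uniform"
+ subdirectory (keeping this variant's checkpoints and TensorBoard logs separate
+ from standard MuZero runs) and forces config.temperature_threshold to 0.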
+ """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # Launch workers + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. 
+ """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + model = models.MuZeroNetwork(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
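+
+ Note: the checkpoint written to results_path/model.checkpoint at the end of
+ the search belongs to the single best-scoring experiment, while
+ best_parameters.txt stores the optimizer's recommendation; as the in-code
+ comment points out, the two do not necessarily coincide.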
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_uniform(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_uniform(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero_uniform): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero_uniform(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero_uniform(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero_uniform(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero_uniform(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/muzero_without_replay_buffer.py b/muzero_without_replay_buffer.py new file mode 100644 index 00000000..4b87fc7b --- /dev/null +++ b/muzero_without_replay_buffer.py @@ -0,0 +1,108 @@ +import models +from muzero_general import MuZeroGeneral +from muzero import load_model_menu, hyperparameter_search + +import json +import sys +import pathlib +import time +import nevergrad + +if __name__ == "__main__": + # muzero = 
MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") + # start_time = time.time() + # muzero.train() + # end_time = time.time() + # print("耗时: {:.2f}秒".format(end_time - start_time)) + model_cls = models.MuZeroNetwork + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZeroGeneral(game_name, model_cls=model_cls) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) + else: + break + print("\nDone") \ No newline at end of file diff --git a/muzero_without_replay_buffer_tictactoe.py b/muzero_without_replay_buffer_tictactoe.py new file mode 100644 index 00000000..f64413ab --- /dev/null +++ b/muzero_without_replay_buffer_tictactoe.py @@ -0,0 +1,242 @@ +from self_play import MCTS, GameHistory +from games.tictactoe import MuZeroConfig, Game +# from games.tictactoe import MuZeroConfig, Game +import models + +import 
numpy +import torch +from torch.utils.tensorboard import SummaryWriter +import pickle + +import math +import time +import copy + +from simplifiedMuZero.without_rb.game_play import GamePlay +from simplifiedMuZero.without_rb.play_buffer import PlayBuffer +from simplifiedMuZero.without_rb.trainer import Trainer + +def logging_loop(config, checkpoint, writer, training_steps): + # writer = SummaryWriter(config.results_path) + + # print( + # "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + # ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # # Save model representation + # writer.add_text( + # "Model summary", + # str(model).replace("\n", " \n\n") # self.summary, 换成其它的 + # ) + # Loop for updating the training performance + counter = training_steps + + try: + if True: + # while checkpoint["training_step"] < config.training_steps: + writer.add_scalar( + "1.Total_reward/1.Total_reward", + checkpoint["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + checkpoint["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + checkpoint["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + checkpoint["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + checkpoint["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + checkpoint["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", checkpoint["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", checkpoint["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + checkpoint["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + checkpoint["training_step"] / max(1, checkpoint["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", checkpoint["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", checkpoint["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", checkpoint["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", checkpoint["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", checkpoint["policy_loss"], counter) + print( + f'Last test reward: {checkpoint["total_reward"]:.2f}. Training step: {checkpoint["training_step"]}/{config.training_steps}. Played games: {checkpoint["num_played_games"]}. 
Loss: {checkpoint["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + # time.sleep(0.5) + except KeyboardInterrupt: + pass + + # if config.save_model: + # # Persist replay buffer to disk + # path = config.results_path / "replay_buffer.pkl" + # print(f"\n\nPersisting replay buffer games to disk at {path}") + # pickle.dump( + # { + # "buffer": buffer, + # "num_played_games": checkpoint["num_played_games"], + # "num_played_steps": checkpoint["num_played_steps"], + # "num_reanalysed_games": checkpoint["num_reanalysed_games"], + # }, + # open(path, "wb"), + # ) + +def update_gameplay_checkpoint(config, checkpoint, game_history): + checkpoint["episode_length"] = len(game_history.action_history) - 1 + checkpoint["total_reward"] = sum(game_history.reward_history) + checkpoint["mean_value"] = numpy.mean( [value for value in game_history.root_values if value]) + + if 1 < len(config.players): + checkpoint["muzero_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == config.muzero_player + ) + checkpoint["opponent_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != config.muzero_player + ) + +def save_checkpoint(config, checkpoint, path=None): #将模型存储在文件中 + if not path: + path = config.results_path / "model.checkpoint" + + torch.save(checkpoint, path) + +def train(log_in_tensorboard=True): + config = MuZeroConfig() + config.results_path /= "muzero_without_rb" + + if log_in_tensorboard or config.save_model: + config.results_path.mkdir(parents=True, exist_ok=True) + + checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + + trainer = Trainer(models.MuZeroNetwork, checkpoint, config) + selfplay = GamePlay(trainer.model, checkpoint, Game, config, config.seed) + buffer = {} + play_buffer = PlayBuffer(checkpoint, buffer, config) + + step = 1 # 间隔,即每次模拟后训练多少次 + max_steps = int(config.training_steps/step) + # max_steps = 2000 + + writer = SummaryWriter(config.results_path) + + for episode in range(max_steps): + game_id, game_history = selfplay.play_game(selfplay.config.visit_softmax_temperature_fn(0), selfplay.config.temperature_threshold, False, "self",0) + + # print(game_id) + # print(game_history.action_history) + # print(game_history.reward_history) + # print(game_history.to_play_history) + # # print(game_history.observation_history) + # print("child visits", game_history.child_visits) + # print(game_history.root_values) # root value指的是root节点的UCB值 + + play_buffer.update_game_history(game_id, game_history) + update_gameplay_checkpoint(config, checkpoint, game_history) + + for i in range(step): + index_batch, batch = play_buffer.get_batch() + # print(batch[1]) + trainer.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = trainer.update_weights(batch) + + + training_step = episode * step + i + if training_step % config.checkpoint_interval == 0: + checkpoint["weights"] = copy.deepcopy(trainer.model.get_weights()) + checkpoint["optimizer_state"] =copy.deepcopy(models.dict_to_cpu(trainer.optimizer.state_dict()) ) + + if config.save_model: + save_checkpoint(config, checkpoint) + 
checkpoint["training_step"] = training_step + checkpoint["lr"] = trainer.optimizer.param_groups[0]["lr"] + checkpoint["total_loss"] = total_loss + checkpoint["value_loss"] = value_loss + checkpoint["reward_loss"] = reward_loss + checkpoint["policy_loss"] = policy_loss + + # print(training_step) + # if training_step % 500 == 0: + # if training_step % config.checkpoint_interval == 0: + # # print(training_step) + # logging_loop(config, checkpoint, writer) + + logging_loop(config, checkpoint, writer, training_step) + + + writer.close() + + selfplay.close_game() + +if __name__ == "__main__": + start_time = time.time() + train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) \ No newline at end of file diff --git a/replay_buffer.py b/replay_buffer.py index 81bc813e..cc1115db 100644 --- a/replay_buffer.py +++ b/replay_buffer.py @@ -16,7 +16,7 @@ class ReplayBuffer: def __init__(self, initial_checkpoint, initial_buffer, config): self.config = config - self.buffer = copy.deepcopy(initial_buffer) + self.buffer = copy.deepcopy(initial_buffer) # buffer是一个字典,key是game id,value是game_history self.num_played_games = initial_checkpoint["num_played_games"] self.num_played_steps = initial_checkpoint["num_played_steps"] self.total_samples = sum( @@ -79,11 +79,14 @@ def get_batch(self): ) = ([], [], [], [], [], [], []) weight_batch = [] if self.config.PER else None + # 从buffer里抽取n鸽样本,有probs的话安装probs的概率抽取,没有的话按照uniform抽取 for game_id, game_history, game_prob in self.sample_n_games( self.config.batch_size ): + # 每个game_history都是一个游戏运行的序列,使用sample_position从这些序列里随机抽取一个位置 game_pos, pos_prob = self.sample_position(game_history) + # 计算从该位置开始的值,rewards等数据 values, rewards, policies, actions = self.make_target( game_history, game_pos ) @@ -165,11 +168,11 @@ def sample_n_games(self, n_games, force_uniform=False): game_id_list.append(game_id) game_probs.append(game_history.game_priority) game_probs = numpy.array(game_probs, dtype="float32") - game_probs /= numpy.sum(game_probs) + game_probs /= numpy.sum(game_probs) # 每一个都除以game_probs的总和,可以看成是归一化 game_prob_dict = dict( [(game_id, prob) for game_id, prob in zip(game_id_list, game_probs)] ) - selected_games = numpy.random.choice(game_id_list, n_games, p=game_probs) + selected_games = numpy.random.choice(game_id_list, n_games, p=game_probs) # 抽取n个样本, 抽取的概率是根据game_probs确定的 else: selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) game_prob_dict = {} @@ -177,10 +180,11 @@ def sample_n_games(self, n_games, force_uniform=False): (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) for game_id in selected_games ] - return ret + return ret # ret格式为[game_id, game_history, game_prob] def sample_position(self, game_history, force_uniform=False): """ + 统一或根据某些优先级从游戏中采样位置。 Sample position from game either uniformly or according to some priority. See paper appendix Training. """ @@ -230,6 +234,8 @@ def update_priorities(self, priorities, index_info): def compute_target_value(self, game_history, index): # The value target is the discounted root value of the search tree td_steps into the # future, plus the discounted sum of all rewards until then. 
+ # 价值目标是未来搜索树 td_steps 的折扣根值,加上到那时为止的所有奖励的折扣总和。 + # 计算公式 ∑r*γ^n bootstrap_index = index + self.config.td_steps if bootstrap_index < len(game_history.root_values): root_values = ( @@ -237,6 +243,8 @@ def compute_target_value(self, game_history, index): if game_history.reanalysed_predicted_root_values is None else game_history.reanalysed_predicted_root_values ) + + # 检查当前的id和目标id是否一致,如果不一致则取负 last_step_value = ( root_values[bootstrap_index] if game_history.to_play_history[bootstrap_index] @@ -244,13 +252,15 @@ def compute_target_value(self, game_history, index): else -root_values[bootstrap_index] ) + # 计算公式 r*γ^n value = last_step_value * self.config.discount**self.config.td_steps - else: + else: # 因为终点的长度超过了数据,因此设为0 value = 0 for i, reward in enumerate( - game_history.reward_history[index + 1 : bootstrap_index + 1] + game_history.reward_history[index + 1 : bootstrap_index + 1] # 获取reward,从index+1到最大(如果长度不够则只会取到最后) ): + # 根据对手决定正负号,只会累计到value上 # The value is oriented from the perspective of the current player value += ( reward @@ -259,12 +269,13 @@ def compute_target_value(self, game_history, index): else -reward ) * self.config.discount**i - return value + return value # 返回value def make_target(self, game_history, state_index): """ Generate targets for every unroll steps. """ + # target policies 是 策略选择的概率序列,如[[0.4,0.6], [0.5,0.5],...] target_values, target_rewards, target_policies, actions = [], [], [], [] for current_index in range( state_index, state_index + self.config.num_unroll_steps + 1 @@ -280,6 +291,7 @@ def make_target(self, game_history, state_index): target_values.append(0) target_rewards.append(game_history.reward_history[current_index]) # Uniform policy + # 因为是游戏结束的状态,因此选择各个策略的概率是平均分布的 target_policies.append( [ 1 / len(game_history.child_visits[0]) @@ -287,8 +299,9 @@ def make_target(self, game_history, state_index): ] ) actions.append(game_history.action_history[current_index]) - else: + else: # 如果current index 大于 game_history的长度 # States past the end of games are treated as absorbing states + # 游戏结束后的状态被视为吸收状态,因此都为0 target_values.append(0) target_rewards.append(0) # Uniform policy diff --git a/requirements.lock b/requirements.lock index 742f745f..4d7ba441 100644 --- a/requirements.lock +++ b/requirements.lock @@ -6,7 +6,7 @@ # absl-py==1.0.0 # via tensorboard -aiohttp==3.8.1 +aiohttp==3.7.4 # via # aiohttp-cors # ray @@ -16,7 +16,7 @@ aioredis==1.3.1 # via ray aiosignal==1.2.0 # via aiohttp -async-timeout==4.0.1 +async-timeout==3.0.1 # via # aiohttp # aioredis @@ -171,7 +171,7 @@ pytz==2021.3 # via pandas pyyaml==6.0 # via ray -ray==1.5.2 +ray==1.2 # via -r requirements.in redis==4.0.1 # via ray diff --git a/self_play.py b/self_play.py index d90fe5db..c62802f7 100644 --- a/self_play.py +++ b/self_play.py @@ -33,8 +33,8 @@ def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): shared_storage.get_info.remote("training_step") ) < self.config.training_steps and not ray.get( shared_storage.get_info.remote("terminate") - ): - self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) + ): # 如果当前的训练步数低于训练总步数,并且没有终止的话,继续进行训练 + self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) # 从shared_storage中获取当前的参数 if not test_mode: game_history = self.play_game( @@ -107,6 +107,16 @@ def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): self.close_game() + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 
检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory def play_game( self, temperature, temperature_threshold, render, opponent, muzero_player ): @@ -116,9 +126,9 @@ def play_game( game_history = GameHistory() observation = self.game.reset() game_history.action_history.append(0) - game_history.observation_history.append(observation) + game_history.observation_history.append(observation) # 添加reset之后的observation game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) + game_history.to_play_history.append(self.game.to_play()) # to_play_history是用来存放玩家id的 done = False @@ -128,7 +138,7 @@ def play_game( with torch.no_grad(): while ( not done and len(game_history.action_history) <= self.config.max_moves - ): + ): # 游戏没有结束且运行步数小于最大移动步长 assert ( len(numpy.array(observation).shape) == 3 ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" @@ -138,14 +148,17 @@ def play_game( stacked_observations = game_history.get_stacked_observations( -1, self.config.stacked_observations, len(self.config.action_space) ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + # 一下的if-else部分主要是为了选择一个动作 # Choose the action if opponent == "self" or muzero_player == self.game.to_play(): root, mcts_info = MCTS(self.config).run( self.model, stacked_observations, self.game.legal_actions(), - self.game.to_play(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 True, ) action = self.select_action( @@ -154,7 +167,7 @@ def play_game( if not temperature_threshold or len(game_history.action_history) < temperature_threshold else 0, - ) + ) # 根据temperature选择动作 if render: print(f'Tree depth: {mcts_info["max_tree_depth"]}') @@ -162,11 +175,11 @@ def play_game( f"Root value for player {self.game.to_play()}: {root.value():.2f}" ) else: - action, root = self.select_opponent_action( + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 opponent, stacked_observations ) - observation, reward, done = self.game.step(action) + observation, reward, done = self.game.step(action) # 运行游戏 if render: print(f"Played action: {self.game.action_to_string(action)}") @@ -176,7 +189,7 @@ def play_game( # Next batch game_history.action_history.append(action) - game_history.observation_history.append(observation) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 game_history.reward_history.append(reward) game_history.to_play_history.append(self.game.to_play()) @@ -219,7 +232,12 @@ def select_opponent_action(self, opponent, stacked_observations): 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' ) - @staticmethod + # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 + # 公式为 c^(1/t)。可以看到: + # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 + # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 def select_action(node, temperature): """ Select action according to the visit count distribution and the temperature. @@ -257,6 +275,25 @@ class MCTS: def __init__(self, config): self.config = config + # run函数运行流程: + # 1. 获取root节点 + # (1)如果由指定节点这将root赋值为该节点; + # (2)如果没有,则 + # i. 
创建新的节点Node(0) + # ii. 使用initial_inference函数通过observation获取相应的reward,hidden state,legal actions等数据 + # iii. 将ii中获取的数据赋值到创建的root节点中取 + # PS. 可以看到,在(1)的情况下不需要调用initial_inference函数 + # 2. 检查是否需要添加探索噪音 + # 3. 开始循环模拟游戏,模拟的次数由num simulation决定 + # (1) 将初始节点node设置为root,并将节点node加入search tree中 + # (2) 检查该节点是否已经扩展,如果已经扩展,则通过ucb值来选择子节点expand. 并将node 设置为选中的节点。并将节点node加入search tree中 + # (3) 重复2,直到找到expanded为false的node为止 + # (4) 选择search_tree[-2]为parent(因为最后一个是node) + # (5) 运行recurrent_inference函数,获得reward,hidden state,legal actions等数据 + # (6) 扩展node,即为node创建子节点,使node展开。 + # (7) 反向传播算法,对路径上的所有访问次数+1,value值加reward + # PS: 可以看到,通过不停的模拟,节点被一层层的扩展(每次模拟扩展一个节点)。 + # 4. 返回扩展过后的节点树root,以便之后的程序根据它选择动作action def run( self, model, @@ -272,7 +309,7 @@ def run( We then run a Monte Carlo Tree Search using only action sequences and the model learned by the network. """ - if override_root_with: + if override_root_with: #检查有没有提供Node,如果有,则指定;如果没有,则自己创建一个 root = override_root_with root_predicted_value = None else: @@ -282,7 +319,7 @@ def run( .float() .unsqueeze(0) .to(next(model.parameters()).device) - ) + ) # observation转tensor,外面包一层形成一个batch。 Observation的长度由参数stacked_observation配置,主要存储之前的previous。不要之前privious的配置为0 ( root_predicted_value, reward, @@ -316,16 +353,17 @@ def run( min_max_stats = MinMaxStats() max_tree_depth = 0 - for _ in range(self.config.num_simulations): + for _ in range(self.config.num_simulations): # 开始模拟游戏 virtual_to_play = to_play node = root search_path = [node] current_tree_depth = 0 - while node.expanded(): + # expanded根据node的子节点个数判断是否已经扩展了,如果没有子节点,说明没被扩展 + while node.expanded(): #这个循环一直在搜索没有expand的子节点。如果子节点已经expand了,则通过select_child选择下一个 current_tree_depth += 1 - action, node = self.select_child(node, min_max_stats) - search_path.append(node) + action, node = self.select_child(node, min_max_stats) #选取ucb最大的一个action,如果有多个action得分相同,随机选取一个 + search_path.append(node) #把节点添加到搜索队列 # Players play turn by turn if virtual_to_play + 1 < len(self.config.players): @@ -333,15 +371,18 @@ def run( else: virtual_to_play = self.config.players[0] + # 在搜索树内部,我们使用动态函数来获取给定动作的下一个hidden_state和previous hidden state # Inside the search tree we use the dynamics function to obtain the next hidden # state given an action and the previous hidden state - parent = search_path[-2] + parent = search_path[-2] # 选择倒数第二个节点,因为当前的node是-1,则-2是它的parent value, reward, policy_logits, hidden_state = model.recurrent_inference( parent.hidden_state, torch.tensor([[action]]).to(parent.hidden_state.device), ) value = models.support_to_scalar(value, self.config.support_size).item() reward = models.support_to_scalar(reward, self.config.support_size).item() + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 node.expand( self.config.action_space, virtual_to_play, @@ -360,6 +401,9 @@ def run( } return root, extra_info + # MCTS 的select child和之前SelfPlay的select action逻辑是不一样的 + # 1. select child是根据UCB选取的,select action是根据各个动作的visit count和temperature选取的 + # 2. select child 选择的对象是Node,Node是由当前的state执行action后生成的新Node形成的。select action单纯的是选action def select_child(self, node, min_max_stats): """ Select the child with the highest UCB score. 
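For readers following the new annotations, here is a minimal, self-contained sketch of the pUCT score that `ucb_score` (annotated in the hunks below) computes. The helper name `puct_score` is illustrative only; the default constants mirror the values commonly used for `pb_c_base` and `pb_c_init` in MuZero configurations, and the normalised Q value is assumed to be produced by a MinMaxStats-style normaliser as in the surrounding code:

import math

def puct_score(parent_visits, child_visits, child_prior, normalized_q,
               pb_c_base=19652, pb_c_init=1.25):
    # Exploration weight grows slowly with the parent's visits and shrinks with the child's.
    pb_c = math.log((parent_visits + pb_c_base + 1) / pb_c_base) + pb_c_init
    pb_c *= math.sqrt(parent_visits) / (child_visits + 1)
    prior_score = pb_c * child_prior  # prior-weighted exploration bonus
    value_score = normalized_q if child_visits > 0 else 0  # normalised Q, 0 for unvisited children
    return prior_score + value_score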
@@ -368,7 +412,7 @@ def select_child(self, node, min_max_stats): self.ucb_score(node, child, min_max_stats) for action, child in node.children.items() ) - action = numpy.random.choice( + action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) [ action for action, child in node.children.items() @@ -377,33 +421,37 @@ def select_child(self, node, min_max_stats): ) return action, node.children[action] - def ucb_score(self, parent, child, min_max_stats): + def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 """ The score for a node is based on its value, plus an exploration bonus based on the prior. """ pb_c = ( math.log( - (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base + (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 ) + self.config.pb_c_init ) pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) - prior_score = pb_c * child.prior + prior_score = pb_c * child.prior # prior 之前的p_value + # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) + # prior_score = pbc * prior if child.visit_count > 0: # Mean value Q - value_score = min_max_stats.normalize( + value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 child.reward - + self.config.discount - * (child.value() if len(self.config.players) == 1 else -child.value()) + + self.config.discount # 衰减系数, 之后乘以子节点的值 + * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 ) else: value_score = 0 - return prior_score + value_score + return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 - def backpropagate(self, search_path, value, to_play, min_max_stats): + # 反向传播算法 + # 对路径上的所有访问次数+1,value值加reward + def backpropagate(self, search_path, value, to_play, min_max_stats): # MCTS反向传播,visit count加1 """ At the end of a simulation, we propagate the evaluation all the way up the tree to the root. @@ -432,7 +480,7 @@ def backpropagate(self, search_path, value, to_play, min_max_stats): class Node: def __init__(self, prior): - self.visit_count = 0 + self.visit_count = 0 #visit count默认是0,只有经过反向传播之后才能变成增加 self.to_play = -1 self.prior = prior self.value_sum = 0 @@ -449,6 +497,8 @@ def value(self): return self.value_sum / self.visit_count def expand(self, actions, to_play, reward, policy_logits, hidden_state): + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 """ We expand a node using the value, reward and policy prediction obtained from the neural network. @@ -460,7 +510,7 @@ def expand(self, actions, to_play, reward, policy_logits, hidden_state): policy_values = torch.softmax( torch.tensor([policy_logits[0][a] for a in actions]), dim=0 ).tolist() - policy = {a: policy_values[i] for i, a in enumerate(actions)} + policy = {a: policy_values[i] for i, a in enumerate(actions)} # 列出所有的合法动作及对于的value值 for action, p in policy.items(): self.children[action] = Node(p) @@ -512,7 +562,7 @@ def store_search_statistics(self, root, action_space): def get_stacked_observations( self, index, num_stacked_observations, action_space_size - ): + ): #根据索引index获取observation序列 """ Generate a new observation with the observation at the index position and num_stacked_observations past observations and actions stacked. 
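To complement the comments added to `get_stacked_observations` in the next hunk: the stacked tensor concatenates the current frame with, for each of the `stacked_observations` previous steps, that past frame plus one broadcast action plane. A small hypothetical helper (not part of the repository) showing only the resulting channel count:

def stacked_obs_channels(obs_channels, num_stacked):
    # current frame + (past frame + 1 action plane) for every stacked step
    return obs_channels * (num_stacked + 1) + num_stacked

# e.g. tic-tac-toe observations of shape (3, 3, 3) with stacked_observations = 0 remain (3, 3, 3)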
@@ -520,12 +570,12 @@ def get_stacked_observations( # Convert to positive index index = index % len(self.observation_history) - stacked_observations = self.observation_history[index].copy() + stacked_observations = self.observation_history[index].copy() #分为两部分,一部分是当前(current)观察值,一部分是之前的(previous)观察值 for past_observation_index in reversed( range(index - num_stacked_observations, index) ): if 0 <= past_observation_index: - previous_observation = numpy.concatenate( + previous_observation = numpy.concatenate( # np.concatenate将第一个参数的list组合起来,方法是依次拆开每个元素,拼接 ( self.observation_history[past_observation_index], [ @@ -543,7 +593,7 @@ def get_stacked_observations( ) ) - stacked_observations = numpy.concatenate( + stacked_observations = numpy.concatenate( # 向stoacked_observtions添加内容 (stacked_observations, previous_observation) ) @@ -556,15 +606,16 @@ class MinMaxStats: """ def __init__(self): - self.maximum = -float("inf") - self.minimum = float("inf") + self.maximum = -float("inf") # 最大是-∞ + self.minimum = float("inf") # 最小是+∞ + # 跟类一定要update至少两次才能产生正确的范围。第一次更新掉max self.minimum: + def normalize(self, value): #对value规范化,公式为(x-a)/(a-b) 当x∈[a,b]时 + if self.maximum > self.minimum: # 如果最大大于最小,说明至少更新了两次(第一次更新掉max0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config. + """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action + + +# Game independent +class MCTS: + """ + Core Monte Carlo Tree Search algorithm. + To decide on an action, we run N simulations, always starting at the root of + the search tree and traversing the tree according to the UCB formula until we + reach a leaf node. + """ + + def __init__(self, config): + self.config = config + + # run函数运行流程: + # 1. 获取root节点 + # (1)如果由指定节点这将root赋值为该节点; + # (2)如果没有,则 + # i. 创建新的节点Node(0) + # ii. 使用initial_inference函数通过observation获取相应的reward,hidden state,legal actions等数据 + # iii. 将ii中获取的数据赋值到创建的root节点中取 + # PS. 可以看到,在(1)的情况下不需要调用initial_inference函数 + # 2. 检查是否需要添加探索噪音 + # 3. 开始循环模拟游戏,模拟的次数由num simulation决定 + # (1) 将初始节点node设置为root,并将节点node加入search tree中 + # (2) 检查该节点是否已经扩展,如果已经扩展,则通过ucb值来选择子节点expand. 并将node 设置为选中的节点。并将节点node加入search tree中 + # (3) 重复2,直到找到expanded为false的node为止 + # (4) 选择search_tree[-2]为parent(因为最后一个是node) + # (5) 运行recurrent_inference函数,获得reward,hidden state,legal actions等数据 + # (6) 扩展node,即为node创建子节点,使node展开。 + # (7) 反向传播算法,对路径上的所有访问次数+1,value值加reward + # PS: 可以看到,通过不停的模拟,节点被一层层的扩展(每次模拟扩展一个节点)。 + # 4. 返回扩展过后的节点树root,以便之后的程序根据它选择动作action + def run( + self, + model, + observation, + legal_actions, + to_play, + add_exploration_noise, + override_root_with=None, + ): + """ + At the root of the search tree we use the representation function to obtain a + hidden state given the current observation. 
+ We then run a Monte Carlo Tree Search using only action sequences and the model + learned by the network. + """ + if override_root_with: #检查有没有提供Node,如果有,则指定;如果没有,则自己创建一个 + root = override_root_with + root_predicted_value = None + else: + root = Node(0) + observation = ( + torch.tensor(observation) + .float() + .unsqueeze(0) + .to(next(model.parameters()).device) + ) # observation转tensor,外面包一层形成一个batch。 Observation的长度由参数stacked_observation配置,主要存储之前的previous。不要之前privious的配置为0 + ( + root_predicted_value, + reward, + policy_logits, + hidden_state, + ) = model.initial_inference(observation) + root_predicted_value = models.support_to_scalar( + root_predicted_value, self.config.support_size + ).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + assert ( + legal_actions + ), f"Legal actions should not be an empty array. Got {legal_actions}." + assert set(legal_actions).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + root.expand( + legal_actions, + to_play, + reward, + policy_logits, + hidden_state, + ) + + if add_exploration_noise: + root.add_exploration_noise( + dirichlet_alpha=self.config.root_dirichlet_alpha, + exploration_fraction=self.config.root_exploration_fraction, + ) + + min_max_stats = MinMaxStats() + + max_tree_depth = 0 + for _ in range(self.config.num_simulations): # 开始模拟游戏 + virtual_to_play = to_play + node = root + search_path = [node] + current_tree_depth = 0 + + # expanded根据node的子节点个数判断是否已经扩展了,如果没有子节点,说明没被扩展 + while node.expanded(): #这个循环一直在搜索没有expand的子节点。如果子节点已经expand了,则通过select_child选择下一个 + current_tree_depth += 1 + action, node = self.select_child(node, min_max_stats) #选取ucb最大的一个action,如果有多个action得分相同,随机选取一个 + search_path.append(node) #把节点添加到搜索队列 + + # Players play turn by turn + if virtual_to_play + 1 < len(self.config.players): + virtual_to_play = self.config.players[virtual_to_play + 1] + else: + virtual_to_play = self.config.players[0] + + # 在搜索树内部,我们使用动态函数来获取给定动作的下一个hidden_state和previous hidden state + # Inside the search tree we use the dynamics function to obtain the next hidden + # state given an action and the previous hidden state + parent = search_path[-2] # 选择倒数第二个节点,因为当前的node是-1,则-2是它的parent + value, reward, policy_logits, hidden_state = model.recurrent_inference( + parent.hidden_state, + torch.tensor([[action]]).to(parent.hidden_state.device), + ) + value = models.support_to_scalar(value, self.config.support_size).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + node.expand( + self.config.action_space, + virtual_to_play, + reward, + policy_logits, + hidden_state, + ) + + self.backpropagate(search_path, value, virtual_to_play, min_max_stats) + + max_tree_depth = max(max_tree_depth, current_tree_depth) + + extra_info = { + "max_tree_depth": max_tree_depth, + "root_predicted_value": root_predicted_value, + } + return root, extra_info + + # MCTS 的select child和之前SelfPlay的select action逻辑是不一样的 + # 1. select child是根据UCB选取的,select action是根据各个动作的visit count和temperature选取的 + # 2. select child 选择的对象是Node,Node是由当前的state执行action后生成的新Node形成的。select action单纯的是选action + def select_child(self, node, min_max_stats): + """ + Select the child with the highest UCB score. 
+ """ + max_ucb = max( + self.ucb_score(node, child, min_max_stats) + for action, child in node.children.items() + ) + action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) + [ + action + for action, child in node.children.items() + if self.ucb_score(node, child, min_max_stats) == max_ucb + ] + ) + return action, node.children[action] + + def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 + """ + The score for a node is based on its value, plus an exploration bonus based on the prior. + """ + pb_c = ( + math.log( + (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 + ) + + self.config.pb_c_init + ) + pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) + + prior_score = pb_c * child.prior # prior 之前的p_value + # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) + # prior_score = pbc * prior + + if child.visit_count > 0: + # Mean value Q + value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 + child.reward + + self.config.discount # 衰减系数, 之后乘以子节点的值 + * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 + ) + else: + value_score = 0 + + return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 + + # 反向传播算法 + # 对路径上的所有访问次数+1,value值加reward + def backpropagate(self, search_path, value, to_play, min_max_stats): # MCTS反向传播,visit count加1 + """ + At the end of a simulation, we propagate the evaluation all the way up the tree + to the root. + """ + if len(self.config.players) == 1: + for node in reversed(search_path): + node.value_sum += value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * node.value()) + + value = node.reward + self.config.discount * value + + elif len(self.config.players) == 2: + for node in reversed(search_path): + node.value_sum += value if node.to_play == to_play else -value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * -node.value()) + + value = ( + -node.reward if node.to_play == to_play else node.reward + ) + self.config.discount * value + + else: + raise NotImplementedError("More than two player mode not implemented.") + + +class Node: + def __init__(self, prior): + self.visit_count = 0 #visit count默认是0,只有经过反向传播之后才能变成增加 + self.to_play = -1 + self.prior = prior + self.value_sum = 0 + self.children = {} + self.hidden_state = None + self.reward = 0 + + def expanded(self): + return len(self.children) > 0 + + def value(self): + if self.visit_count == 0: + return 0 + return self.value_sum / self.visit_count + + def expand(self, actions, to_play, reward, policy_logits, hidden_state): + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + """ + We expand a node using the value, reward and policy prediction obtained from the + neural network. + """ + self.to_play = to_play + self.reward = reward + self.hidden_state = hidden_state + + policy_values = torch.softmax( + torch.tensor([policy_logits[0][a] for a in actions]), dim=0 + ).tolist() + policy = {a: policy_values[i] for i, a in enumerate(actions)} # 列出所有的合法动作及对于的value值 + for action, p in policy.items(): + self.children[action] = Node(p) + + def add_exploration_noise(self, dirichlet_alpha, exploration_fraction): + """ + At the start of each search, we add dirichlet noise to the prior of the root to + encourage the search to explore new actions. 
+ """ + actions = list(self.children.keys()) + noise = numpy.random.dirichlet([dirichlet_alpha] * len(actions)) + frac = exploration_fraction + for a, n in zip(actions, noise): + self.children[a].prior = self.children[a].prior * (1 - frac) + n * frac + + +class GameHistory: + """ + Store only usefull information of a self-play game. + """ + + def __init__(self): + self.observation_history = [] + self.action_history = [] + self.reward_history = [] + self.to_play_history = [] + self.child_visits = [] + self.root_values = [] + self.reanalysed_predicted_root_values = None + # For PER + self.priorities = None + self.game_priority = None + + def store_search_statistics(self, root, action_space): + # Turn visit count from root into a policy + if root is not None: + sum_visits = sum(child.visit_count for child in root.children.values()) + self.child_visits.append( + [ + root.children[a].visit_count / sum_visits + if a in root.children + else 0 + for a in action_space + ] + ) + + self.root_values.append(root.value()) + else: + self.root_values.append(None) + + def get_stacked_observations( + self, index, num_stacked_observations, action_space_size + ): #根据索引index获取observation序列 + """ + Generate a new observation with the observation at the index position + and num_stacked_observations past observations and actions stacked. + """ + # Convert to positive index + index = index % len(self.observation_history) + + stacked_observations = self.observation_history[index].copy() #分为两部分,一部分是当前(current)观察值,一部分是之前的(previous)观察值 + for past_observation_index in reversed( + range(index - num_stacked_observations, index) + ): + if 0 <= past_observation_index: + previous_observation = numpy.concatenate( # np.concatenate将第一个参数的list组合起来,方法是依次拆开每个元素,拼接 + ( + self.observation_history[past_observation_index], + [ + numpy.ones_like(stacked_observations[0]) + * self.action_history[past_observation_index + 1] + / action_space_size + ], + ) + ) + else: + previous_observation = numpy.concatenate( + ( + numpy.zeros_like(self.observation_history[index]), + [numpy.zeros_like(stacked_observations[0])], + ) + ) + + stacked_observations = numpy.concatenate( # 向stoacked_observtions添加内容 + (stacked_observations, previous_observation) + ) + + return stacked_observations + + +class MinMaxStats: + """ + A class that holds the min-max values of the tree. + """ + + def __init__(self): + self.maximum = -float("inf") # 最大是-∞ + self.minimum = float("inf") # 最小是+∞ + # 跟类一定要update至少两次才能产生正确的范围。第一次更新掉max self.minimum: # 如果最大大于最小,说明至少更新了两次(第一次更新掉max self.config.ratio + and self.training_step < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + if self.config.PER: + weight_batch = torch.tensor(weight_batch.copy()).float().to(device) + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + 
) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + if self.config.PER: + # Correct PER bias by using importance-sampling (IS) weights + loss *= weight_batch + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( + 1 + ) + return value_loss, reward_loss, policy_loss diff --git a/simplifiedMuZero/no_pv/trainer_no_pv.py b/simplifiedMuZero/no_pv/trainer_no_pv.py new file mode 100644 index 00000000..f51e3ef8 --- /dev/null +++ b/simplifiedMuZero/no_pv/trainer_no_pv.py @@ -0,0 +1,301 @@ +import copy +import time + +import numpy +import ray +import torch + +import models + + +@ray.remote +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. + """ + + def __init__(self, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." 
+ ) + + if initial_checkpoint["optimizer_state"] is not None: + print("Loading optimizer...\n") + self.optimizer.load_state_dict( + copy.deepcopy(initial_checkpoint["optimizer_state"]) + ) + + def continuous_update_weights(self, replay_buffer, shared_storage): + # Wait for the replay buffer to be filled + while ray.get(shared_storage.get_info.remote("num_played_games")) < 1: + time.sleep(0.1) + + next_batch = replay_buffer.get_batch.remote() + # Training loop + while self.training_step < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + ): + index_batch, batch = ray.get(next_batch) + next_batch = replay_buffer.get_batch.remote() + self.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = self.update_weights(batch) + + if self.config.PER: + # Save new priorities in the replay buffer (See https://arxiv.org/abs/1803.00933) + replay_buffer.update_priorities.remote(priorities, index_batch) + + # Save to the shared storage + if self.training_step % self.config.checkpoint_interval == 0: + shared_storage.set_info.remote( + { + "weights": copy.deepcopy(self.model.get_weights()), + "optimizer_state": copy.deepcopy( + models.dict_to_cpu(self.optimizer.state_dict()) + ), + } + ) + if self.config.save_model: + shared_storage.save_checkpoint.remote() + shared_storage.set_info.remote( + { + "training_step": self.training_step, + "lr": self.optimizer.param_groups[0]["lr"], + "total_loss": total_loss, + "value_loss": value_loss, + "reward_loss": reward_loss, + "policy_loss": policy_loss, + } + ) + + # Managing the self-play / training ratio + if self.config.training_delay: + time.sleep(self.config.training_delay) + if self.config.ratio: + while ( + self.training_step + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + > self.config.ratio + and self.training_step < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + ): + time.sleep(0.5) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + if self.config.PER: + weight_batch = torch.tensor(weight_batch.copy()).float().to(device) + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + 
) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + # loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + loss = reward_loss + policy_loss + if self.config.PER: + # Correct PER bias by using importance-sampling (IS) weights + loss *= weight_batch + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: # 更新optimizer的lr + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( + 1 + ) + return value_loss, reward_loss, policy_loss diff --git a/simplifiedMuZero/search_policy/RHEA.py b/simplifiedMuZero/search_policy/RHEA.py new file mode 100644 index 00000000..fe070c8b --- /dev/null +++ b/simplifiedMuZero/search_policy/RHEA.py @@ -0,0 +1,75 @@ +import copy +import numpy as np +from functools import partial + +from deap import base, creator, tools, algorithms + +from games.abstract_game import AbstractGame + +creator.create('FitnessMax', base.Fitness, weights=(1.0,)) +creator.create('Individual', list, fitness = creator.FitnessMax) + +class RHEA: + def __init__(self): + self.game = None + self.play_id = 0 + self.toolbox = base.Toolbox() + self.register("mate", tools.cxTwoPoint) + self.register("mutate", tools.mutFlipBit, indpb=0.05) + self.register("select", tools.selStochasticUniversalSampling) + + def game_evaluate(self, actions, game_stat=None, play_id=None): + game_stat = copy.deepcopy(game_stat) + game_stat.reset() + + for i in range(len(actions)): + player = game_stat.to_play() + observation, reward, done = game_stat.step(actions[i]) + if done: + break + + game_stat.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + + def evaluate(self, actions): + game_stat = copy.deepcopy(self.game) + play_id = self.play_id + + game_stat.reset() + + for i in range(len(actions)): + player = game_stat.to_play() + observation, reward, done = game_stat.step(actions[i]) + if done: + break + + game_stat.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + 
return reward, + + def individual(self, actions, max_moves, replace=False): + max_moves = max_moves if replace else len(actions) + return tools.initIterate(creator.Individual, partial(np.random.choice, actions, max_moves, replace=replace)) + def population(self, actions, max_moves, N, replace=False): + return tools.initRepeat(list, partial(self.individual, actions, max_moves, replace), N) + + def rhea(self, game_state:AbstractGame, config, play_id): + actions = game_state.legal_actions() + pop = self.population(actions. config.max_moves) + self.toolbox.register("evaluate", self.game_evaluate, game=game_state, play_id=play_id) + pop, logbook = algorithms.eaSimple(pop, self.toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False) + + results = tools.selBest(pop, k=1) + + # 返回第一个动作和评分 + return [(r[0],self.game_evaluate(actions, game_state, play_id)[0]) for r in results] # r[0]表示第一个动作 + + + + + diff --git a/simplifiedMuZero/search_policy/RHEA2.py b/simplifiedMuZero/search_policy/RHEA2.py new file mode 100644 index 00000000..73d30799 --- /dev/null +++ b/simplifiedMuZero/search_policy/RHEA2.py @@ -0,0 +1,192 @@ +import copy +import numpy as np +from functools import partial +import torch + +from deap import base, creator, tools, algorithms + +from games.abstract_game import AbstractGame +from self_play import Node +import models + +from games.tictactoe import MuZeroConfig, Game + +creator.create('FitnessMax', base.Fitness, weights=(1.0,)) +creator.create('Individual', list, fitness = creator.FitnessMax) + + +def evaluate(actions, model, observation, config): + ( + root_predicted_value, + reward, + policy_logits, + hidden_state, + ) = model.initial_inference(observation) + + for action in actions: + value, reward, policy_logits, hidden_state = model.recurrent_inference( + hidden_state, + torch.tensor([[action]]).to(observation.device), + ) + + reward = models.support_to_scalar(reward, config.support_size).item() + return reward, + +class RHEA: + def __init__(self, config, game): + self.game = game + self.config = config + self.play_id = -1 + self.toolbox = base.Toolbox() + self.toolbox.register("mate", tools.cxTwoPoint) + self.toolbox.register("mutate", tools.mutFlipBit, indpb=0.05) + self.toolbox.register("select", tools.selStochasticUniversalSampling) + + # def game_evaluate(self, actions, game_stat=None, play_id=None): + # game_stat = copy.deepcopy(game_stat) + # game_stat.reset() + # + # for i in range(len(actions)): + # player = game_stat.to_play() + # observation, reward, done = game_stat.step(actions[i]) + # if done: + # break + # + # game_stat.close() + # reward = reward if play_id == player else -reward + # # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + # reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + # return reward, + # + # def action_evaluate(self, actions): + # game_stat = copy.deepcopy(self.game) + # game_stat.reset() + # + # for i in range(len(actions)): + # player = game_stat.to_play() + # observation, reward, done = game_stat.step(actions[i]) + # if done: + # break + # + # game_stat.close() + # reward = reward if self.play_id == player else -reward + # + # return reward, actions[:(i+1)] + # + def evaluate(self, actions): + game_stat = copy.deepcopy(self.game) + play_id = self.play_id + + game_stat.reset() + + for i in range(len(actions)): + player = game_stat.to_play() + observation, reward, done = game_stat.step(actions[i]) + if done: + break + + game_stat.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + 
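As written, `RHEA.rhea()` above cannot run: `self.population(actions. config.max_moves)` has a dot where a comma belongs and omits the population size `N`, the operators in `__init__` are registered through `self.register` instead of `self.toolbox.register`, and `game_evaluate` is registered with a `game=` keyword although its parameter is named `game_stat`. For reference, a minimal self-contained sketch of the intended rollout-based planning loop with DEAP; the function names, the tournament selection and the shuffle mutation are illustrative substitutions rather than what the repository registers, and at least two legal actions are assumed so the crossover operator is applicable.

import copy
from functools import partial

import numpy as np
from deap import algorithms, base, creator, tools

# Mirrors the FitnessMax / Individual classes created at the top of these modules.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)


def rollout_fitness(actions, game, play_id):
    """Score an action sequence by replaying it in a copy of the game."""
    sim = copy.deepcopy(game)
    sim.reset()
    player, reward = play_id, 0
    for i, action in enumerate(actions):
        player = sim.to_play()
        _, reward, done = sim.step(action)
        if done:
            break
    sim.close()
    reward = reward if play_id == player else -reward
    return (reward / (i + 1),)  # shorter winning lines score higher


def plan_first_action(game, play_id, pop_size=20, ngen=10):
    legal = game.legal_actions()  # assumed to contain at least two actions
    toolbox = base.Toolbox()
    toolbox.register("individual", tools.initIterate, creator.Individual,
                     partial(np.random.choice, legal, len(legal), replace=False))
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)
    toolbox.register("evaluate", rollout_fitness, game=game, play_id=play_id)

    pop = toolbox.population(n=pop_size)
    pop, _ = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                 ngen=ngen, verbose=False)
    return tools.selBest(pop, k=1)[0][0]  # first action of the best evolved plan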
return reward, + + def individual(self, actions, max_moves, replace=False): + max_moves = max_moves if replace else min(len(actions), max_moves) + return tools.initIterate(creator.Individual, partial(np.random.choice, actions, max_moves, replace=replace)) + def population(self, actions, max_moves, N, replace=False): + return tools.initRepeat(list, partial(self.individual, actions, max_moves, replace), N) + + # def rhea(self, game_state:AbstractGame): + # self.game = game_state + # self.play_id = game_state.to_play() + # actions = game_state.legal_actions() + # self.toolbox.register("evaluate", evaluate, ) + # pop = self.population(actions. self.config.max_moves) + # + # pop, logbook = algorithms.eaSimple(pop, self.toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False) + # + # results = tools.selBest(pop, k=1) + # + # return self.action_evaluate(results[0]) + + + + # # 返回第一个动作和评分 + # return [(r[0],self.game_evaluate(actions, game_state, play_id)[0]) for r in results] # r[0]表示第一个动作 + + def run(self, + model, + observation, + legal_actions, + to_play, + action_replace, + override_root_with=None, + ): + observation = ( + torch.tensor(observation) + .float() + .unsqueeze(0) + .to(next(model.parameters()).device) + ) + + # 检查可用的动作空间,如果小于等于1,则直接返回。因为进化算法无法杂交,会报错 + if len(legal_actions) <=1: + return legal_actions + else: + # self.toolbox.register("evaluate", evaluate, model=model, observation=observation, config=self.config) + self.toolbox.register("evaluate", self.evaluate) + pop = self.population(legal_actions, self.config.max_moves, self.config.num_simulations, replace=action_replace) + + pop, logbook = algorithms.eaSimple(pop, self.toolbox, cxpb=0.5, mutpb=0.2, ngen=len(legal_actions), verbose=False) + + results = tools.selBest(pop, k=1) + + return results[0] + +if __name__=="__main__": + game = Game() + config = MuZeroConfig() + game.reset() + done = False + + # rhea = RHEA(config, game) + # pop = rhea.population(game.legal_actions(), 9, config.num_simulations, config.action_replace) + # + # print(pop) + # rhea.toolbox.register("evaluate", rhea.evaluate) + # pop, logbook = algorithms.eaSimple(pop, rhea.toolbox, cxpb=0.5, mutpb=0, ngen=9, verbose=False) + # + # results = tools.selBest(pop, k=1) + # print(results) + + legal_actions = game.legal_actions() + while not done and len(legal_actions) >1: + legal_actions = game.legal_actions() + rhea = RHEA(config, game) + rhea.play_id = game.to_play() + + pop = rhea.population(legal_actions, config.max_moves, config.num_simulations, config.action_replace) + + rhea.toolbox.register("evaluate", rhea.evaluate) + + pop, logbook = algorithms.eaSimple(pop, rhea.toolbox, cxpb=0.5, mutpb=0.2, ngen=len(legal_actions), verbose=False) + + print(pop) + results = tools.selBest(pop, k=1) + print(results) + action = results[0][0] + observation, reward, done = game.step(action) + # print(observation) + + + + + + + + + + + + diff --git a/simplifiedMuZero/search_policy/__init__.py b/simplifiedMuZero/search_policy/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/simplifiedMuZero/search_policy/rhea_self_play.py b/simplifiedMuZero/search_policy/rhea_self_play.py new file mode 100644 index 00000000..ca49d875 --- /dev/null +++ b/simplifiedMuZero/search_policy/rhea_self_play.py @@ -0,0 +1,227 @@ +import math +import time + +import numpy +import ray +import torch + +import models +from simplifiedMuZero.search_policy.RHEA2 import RHEA +from self_play import GameHistory + + +@ray.remote +class SelfPlayRhea: + """ + Class which run in a dedicated thread to 
play games and save them to the replay-buffer. + """ + + def __init__(self, initial_checkpoint, Game, config, seed): + self.config = config + self.game = Game(seed) + + # Fix random generator seed + numpy.random.seed(seed) + torch.manual_seed(seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + # self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(initial_checkpoint["weights"]) + self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) + self.model.eval() + + def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): + while ray.get( + shared_storage.get_info.remote("training_step") + ) < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") + ): # 如果当前的训练步数低于训练总步数,并且没有终止的话,继续进行训练 + self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) # 从shared_storage中获取当前的参数 + + if not test_mode: + game_history = self.play_game( + self.config.visit_softmax_temperature_fn( + trained_steps=ray.get( + shared_storage.get_info.remote("training_step") + ) + ), + self.config.temperature_threshold, + False, + "self", + 0, + ) + + replay_buffer.save_game.remote(game_history, shared_storage) + + else: + # Take the best action (no exploration) in test mode + game_history = self.play_game( + 0, + self.config.temperature_threshold, + False, + "self" if len(self.config.players) == 1 else self.config.opponent, + self.config.muzero_player, + ) + + # Save to the shared storage + shared_storage.set_info.remote( + { + "episode_length": len(game_history.action_history) - 1, + "total_reward": sum(game_history.reward_history), + "mean_value": numpy.mean( + [value for value in game_history.root_values if value] + ), + } + ) + if 1 < len(self.config.players): + shared_storage.set_info.remote( + { + "muzero_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == self.config.muzero_player + ), + "opponent_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != self.config.muzero_player + ), + } + ) + + # Managing the self-play / training ratio + if not test_mode and self.config.self_play_delay: + time.sleep(self.config.self_play_delay) + if not test_mode and self.config.ratio: + while ( + ray.get(shared_storage.get_info.remote("training_step")) + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + < self.config.ratio + and ray.get(shared_storage.get_info.remote("training_step")) + < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + self.close_game() + + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory + def play_game( + self, temperature, temperature_threshold, render, opponent, muzero_player + ): + """ + Play one game with actions based on the Monte Carlo tree search at each moves. 
+ """ + game_history = GameHistory() + observation = self.game.reset() + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + if render: + self.game.render() + + with torch.no_grad(): + while ( + not done and len(game_history.action_history) <= self.config.max_moves + ): # 游戏没有结束且运行步数小于最大移动步长 + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." + stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + + # 一下的if-else部分主要是为了选择一个动作 + # Choose the action + if opponent == "self" or muzero_player == self.game.to_play(): + # root, mcts_info = MCTS(self.config).run( + # self.model, + # stacked_observations, + # self.game.legal_actions(), + # self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + # True, + # ) + # action = self.select_action( + # root, + # temperature + # if not temperature_threshold + # or len(game_history.action_history) < temperature_threshold + # else 0, + # ) # 根据temperature选择动作 + actions = RHEA(self.config, self.game).run(self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), + self.config.action_replace, + ) + action = actions[0] + + else: + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 + opponent, stacked_observations + ) + + observation, reward, done = self.game.step(action) # 运行游戏 + + if render: + print(f"Played action: {self.game.action_to_string(action)}") + self.game.render() + + # game_history.store_search_statistics(root, self.config.action_space) + game_history.root_values.append(reward) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + return game_history + + def close_game(self): + self.game.close() + + def select_opponent_action(self, opponent, stacked_observations): + """ + Select opponent action for evaluating MuZero level. + """ + if opponent == "human": + return self.game.human_to_action(), None + elif opponent == "expert": + return self.game.expert_agent(), None + elif opponent == "random": + assert ( + self.game.legal_actions() + ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}." + assert set(self.game.legal_actions()).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." 
+ + return numpy.random.choice(self.game.legal_actions()), None + else: + raise NotImplementedError( + 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' + ) diff --git a/simplifiedMuZero/search_policy/self_play_uniform_search.py b/simplifiedMuZero/search_policy/self_play_uniform_search.py new file mode 100644 index 00000000..314249f0 --- /dev/null +++ b/simplifiedMuZero/search_policy/self_play_uniform_search.py @@ -0,0 +1,622 @@ +import math +import time + +import numpy +import ray +import torch + +import models + + +@ray.remote +class SelfPlay: + """ + Class which run in a dedicated thread to play games and save them to the replay-buffer. + """ + + def __init__(self, initial_checkpoint, Game, config, seed): + self.config = config + self.game = Game(seed) + + # Fix random generator seed + numpy.random.seed(seed) + torch.manual_seed(seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(initial_checkpoint["weights"]) + self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) + self.model.eval() + + def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): + while ray.get( + shared_storage.get_info.remote("training_step") + ) < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") + ): # 如果当前的训练步数低于训练总步数,并且没有终止的话,继续进行训练 + self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) # 从shared_storage中获取当前的参数 + + if not test_mode: + game_history = self.play_game( + self.config.visit_softmax_temperature_fn( + trained_steps=ray.get( + shared_storage.get_info.remote("training_step") + ) + ), + self.config.temperature_threshold, + False, + "self", + 0, + ) + + replay_buffer.save_game.remote(game_history, shared_storage) + + else: + # Take the best action (no exploration) in test mode + game_history = self.play_game( + 0, + self.config.temperature_threshold, + False, + "self" if len(self.config.players) == 1 else self.config.opponent, + self.config.muzero_player, + ) + + # Save to the shared storage + shared_storage.set_info.remote( + { + "episode_length": len(game_history.action_history) - 1, + "total_reward": sum(game_history.reward_history), + "mean_value": numpy.mean( + [value for value in game_history.root_values if value] + ), + } + ) + if 1 < len(self.config.players): + shared_storage.set_info.remote( + { + "muzero_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == self.config.muzero_player + ), + "opponent_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != self.config.muzero_player + ), + } + ) + + # Managing the self-play / training ratio + if not test_mode and self.config.self_play_delay: + time.sleep(self.config.self_play_delay) + if not test_mode and self.config.ratio: + while ( + ray.get(shared_storage.get_info.remote("training_step")) + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + < self.config.ratio + and ray.get(shared_storage.get_info.remote("training_step")) + < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + self.close_game() + + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 
运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory + def play_game( + self, temperature, temperature_threshold, render, opponent, muzero_player + ): + """ + Play one game with actions based on the Monte Carlo tree search at each moves. + """ + game_history = GameHistory() + observation = self.game.reset() + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) # to_play_history是用来存放玩家id的 + + done = False + + if render: + self.game.render() + + with torch.no_grad(): + while ( + not done and len(game_history.action_history) <= self.config.max_moves + ): # 游戏没有结束且运行步数小于最大移动步长 + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." + stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + + # 一下的if-else部分主要是为了选择一个动作 + # Choose the action + if opponent == "self" or muzero_player == self.game.to_play(): + root, mcts_info = UniformSearch(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + action = self.select_action( + root, + temperature + if not temperature_threshold + or len(game_history.action_history) < temperature_threshold + else 0, + ) # 根据temperature选择动作 + + if render: + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print( + f"Root value for player {self.game.to_play()}: {root.value():.2f}" + ) + else: + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 + opponent, stacked_observations + ) + + observation, reward, done = self.game.step(action) # 运行游戏 + + if render: + print(f"Played action: {self.game.action_to_string(action)}") + self.game.render() + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + return game_history + + def close_game(self): + self.game.close() + + def select_opponent_action(self, opponent, stacked_observations): + """ + Select opponent action for evaluating MuZero level. + """ + if opponent == "human": + root, mcts_info = UniformSearch(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), + True, + ) + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print(f"Root value for player {self.game.to_play()}: {root.value():.2f}") + print( + f"Player {self.game.to_play()} turn. 
MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}" + ) + return self.game.human_to_action(), root + elif opponent == "expert": + return self.game.expert_agent(), None + elif opponent == "random": + assert ( + self.game.legal_actions() + ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}." + assert set(self.game.legal_actions()).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + + return numpy.random.choice(self.game.legal_actions()), None + else: + raise NotImplementedError( + 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' + ) + + # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 + # 公式为 c^(1/t)。可以看到: + # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 + # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config. + """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action + + +# Game independent +class UniformSearch: + """ + Core Monte Carlo Tree Search algorithm. + To decide on an action, we run N simulations, always starting at the root of + the search tree and traversing the tree according to the UCB formula until we + reach a leaf node. + """ + + def __init__(self, config): + self.config = config + + # run函数运行流程: + # 1. 获取root节点 + # (1)如果由指定节点这将root赋值为该节点; + # (2)如果没有,则 + # i. 创建新的节点Node(0) + # ii. 使用initial_inference函数通过observation获取相应的reward,hidden state,legal actions等数据 + # iii. 将ii中获取的数据赋值到创建的root节点中取 + # PS. 可以看到,在(1)的情况下不需要调用initial_inference函数 + # 2. 检查是否需要添加探索噪音 + # 3. 开始循环模拟游戏,模拟的次数由num simulation决定 + # (1) 将初始节点node设置为root,并将节点node加入search tree中 + # (2) 检查该节点是否已经扩展,如果已经扩展,则通过ucb值来选择子节点expand. 并将node 设置为选中的节点。并将节点node加入search tree中 + # (3) 重复2,直到找到expanded为false的node为止 + # (4) 选择search_tree[-2]为parent(因为最后一个是node) + # (5) 运行recurrent_inference函数,获得reward,hidden state,legal actions等数据 + # (6) 扩展node,即为node创建子节点,使node展开。 + # (7) 反向传播算法,对路径上的所有访问次数+1,value值加reward + # PS: 可以看到,通过不停的模拟,节点被一层层的扩展(每次模拟扩展一个节点)。 + # 4. 返回扩展过后的节点树root,以便之后的程序根据它选择动作action + def run( + self, + model, + observation, + legal_actions, + to_play, + add_exploration_noise, + override_root_with=None, + ): + """ + At the root of the search tree we use the representation function to obtain a + hidden state given the current observation. + We then run a Monte Carlo Tree Search using only action sequences and the model + learned by the network. 
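The visit-count and temperature rule described in the comments above (counts raised to 1/t and renormalised) can be checked numerically in isolation; a small sketch with made-up visit counts:

import numpy

visit_counts = numpy.array([30, 10, 5, 5], dtype="float64")

for temperature in (0.25, 1.0, 4.0):
    distribution = visit_counts ** (1 / temperature)
    distribution /= distribution.sum()
    print(temperature, distribution.round(3))

# Low temperatures concentrate the mass on the most-visited action
# (temperature 0 falls back to argmax); high temperatures flatten the
# distribution towards the uniform random choice used at temperature = inf.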
+ """ + if override_root_with: #检查有没有提供Node,如果有,则指定;如果没有,则自己创建一个 + root = override_root_with + root_predicted_value = None + else: + root = Node(0) + observation = ( + torch.tensor(observation) + .float() + .unsqueeze(0) + .to(next(model.parameters()).device) + ) # observation转tensor,外面包一层形成一个batch。 Observation的长度由参数stacked_observation配置,主要存储之前的previous。不要之前privious的配置为0 + ( + root_predicted_value, + reward, + policy_logits, + hidden_state, + ) = model.initial_inference(observation) + root_predicted_value = models.support_to_scalar( + root_predicted_value, self.config.support_size + ).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + assert ( + legal_actions + ), f"Legal actions should not be an empty array. Got {legal_actions}." + assert set(legal_actions).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + root.expand( + legal_actions, + to_play, + reward, + policy_logits, + hidden_state, + ) + + if add_exploration_noise: + root.add_exploration_noise( + dirichlet_alpha=self.config.root_dirichlet_alpha, + exploration_fraction=self.config.root_exploration_fraction, + ) + + min_max_stats = MinMaxStats() + + max_tree_depth = 0 + for _ in range(self.config.num_simulations): # 开始模拟游戏 + virtual_to_play = to_play + node = root + search_path = [node] + current_tree_depth = 0 + + # expanded根据node的子节点个数判断是否已经扩展了,如果没有子节点,说明没被扩展 + while node.expanded(): #这个循环一直在搜索没有expand的子节点。如果子节点已经expand了,则通过select_child选择下一个 + current_tree_depth += 1 + action, node = self.select_child(node, min_max_stats) #选取ucb最大的一个action,如果有多个action得分相同,随机选取一个 + search_path.append(node) #把节点添加到搜索队列 + + # Players play turn by turn + if virtual_to_play + 1 < len(self.config.players): + virtual_to_play = self.config.players[virtual_to_play + 1] + else: + virtual_to_play = self.config.players[0] + + # 在搜索树内部,我们使用动态函数来获取给定动作的下一个hidden_state和previous hidden state + # Inside the search tree we use the dynamics function to obtain the next hidden + # state given an action and the previous hidden state + parent = search_path[-2] # 选择倒数第二个节点,因为当前的node是-1,则-2是它的parent + value, reward, policy_logits, hidden_state = model.recurrent_inference( + parent.hidden_state, + torch.tensor([[action]]).to(parent.hidden_state.device), + ) + value = models.support_to_scalar(value, self.config.support_size).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + node.expand( + self.config.action_space, + virtual_to_play, + reward, + policy_logits, + hidden_state, + ) + + self.backpropagate(search_path, value, virtual_to_play, min_max_stats) + + max_tree_depth = max(max_tree_depth, current_tree_depth) + + extra_info = { + "max_tree_depth": max_tree_depth, + "root_predicted_value": root_predicted_value, + } + return root, extra_info + + # MCTS 的select child和之前SelfPlay的select action逻辑是不一样的 + # 1. select child是根据UCB选取的,select action是根据各个动作的visit count和temperature选取的 + # 2. select child 选择的对象是Node,Node是由当前的state执行action后生成的新Node形成的。select action单纯的是选action + def select_child(self, node, min_max_stats): + """ + Select the child with the highest UCB score. 
+ """ + # max_ucb = max( + # self.ucb_score(node, child, min_max_stats) + # for action, child in node.children.items() + # ) + # action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) + # [ + # action + # for action, child in node.children.items() + # if self.ucb_score(node, child, min_max_stats) == max_ucb + # ] + # ) + action = numpy.random.choice([action for action,child in node.children.items()]) + return action, node.children[action] + + # def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 + # """ + # The score for a node is based on its value, plus an exploration bonus based on the prior. + # """ + # pb_c = ( + # math.log( + # (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 + # ) + # + self.config.pb_c_init + # ) + # pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) + # + # prior_score = pb_c * child.prior # prior 之前的p_value + # # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) + # # prior_score = pbc * prior + # + # if child.visit_count > 0: + # # Mean value Q + # value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 + # child.reward + # + self.config.discount # 衰减系数, 之后乘以子节点的值 + # * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 + # ) + # else: + # value_score = 0 + # + # return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 + + # 反向传播算法 + # 对路径上的所有访问次数+1,value值加reward + def backpropagate(self, search_path, value, to_play, min_max_stats): # MCTS反向传播,visit count加1 + """ + At the end of a simulation, we propagate the evaluation all the way up the tree + to the root. + """ + if len(self.config.players) == 1: + for node in reversed(search_path): + node.value_sum += value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * node.value()) + + value = node.reward + self.config.discount * value + + elif len(self.config.players) == 2: + for node in reversed(search_path): + node.value_sum += value if node.to_play == to_play else -value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * -node.value()) + + value = ( + -node.reward if node.to_play == to_play else node.reward + ) + self.config.discount * value + + else: + raise NotImplementedError("More than two player mode not implemented.") + + +class Node: + def __init__(self, prior): + self.visit_count = 0 #visit count默认是0,只有经过反向传播之后才能变成增加 + self.to_play = -1 + self.prior = prior + self.value_sum = 0 + self.children = {} + self.hidden_state = None + self.reward = 0 + + def expanded(self): + return len(self.children) > 0 + + def value(self): + if self.visit_count == 0: + return 0 + return self.value_sum / self.visit_count + + def expand(self, actions, to_play, reward, policy_logits, hidden_state): + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + """ + We expand a node using the value, reward and policy prediction obtained from the + neural network. 
+ """ + self.to_play = to_play + self.reward = reward + self.hidden_state = hidden_state + + policy_values = torch.softmax( + torch.tensor([policy_logits[0][a] for a in actions]), dim=0 + ).tolist() + policy = {a: policy_values[i] for i, a in enumerate(actions)} # 列出所有的合法动作及对于的value值 + for action, p in policy.items(): + self.children[action] = Node(p) + + def add_exploration_noise(self, dirichlet_alpha, exploration_fraction): + """ + At the start of each search, we add dirichlet noise to the prior of the root to + encourage the search to explore new actions. + """ + actions = list(self.children.keys()) + noise = numpy.random.dirichlet([dirichlet_alpha] * len(actions)) + frac = exploration_fraction + for a, n in zip(actions, noise): + self.children[a].prior = self.children[a].prior * (1 - frac) + n * frac + + +class GameHistory: + """ + Store only usefull information of a self-play game. + """ + + def __init__(self): + self.observation_history = [] + self.action_history = [] + self.reward_history = [] + self.to_play_history = [] + self.child_visits = [] + self.root_values = [] + self.reanalysed_predicted_root_values = None + # For PER + self.priorities = None + self.game_priority = None + + def store_search_statistics(self, root, action_space): + # Turn visit count from root into a policy + if root is not None: + sum_visits = sum(child.visit_count for child in root.children.values()) + self.child_visits.append( + [ + root.children[a].visit_count / sum_visits + if a in root.children + else 0 + for a in action_space + ] + ) + + self.root_values.append(root.value()) + else: + self.root_values.append(None) + + def get_stacked_observations( + self, index, num_stacked_observations, action_space_size + ): #根据索引index获取observation序列 + """ + Generate a new observation with the observation at the index position + and num_stacked_observations past observations and actions stacked. + """ + # Convert to positive index + index = index % len(self.observation_history) + + stacked_observations = self.observation_history[index].copy() #分为两部分,一部分是当前(current)观察值,一部分是之前的(previous)观察值 + for past_observation_index in reversed( + range(index - num_stacked_observations, index) + ): + if 0 <= past_observation_index: + previous_observation = numpy.concatenate( # np.concatenate将第一个参数的list组合起来,方法是依次拆开每个元素,拼接 + ( + self.observation_history[past_observation_index], + [ + numpy.ones_like(stacked_observations[0]) + * self.action_history[past_observation_index + 1] + / action_space_size + ], + ) + ) + else: + previous_observation = numpy.concatenate( + ( + numpy.zeros_like(self.observation_history[index]), + [numpy.zeros_like(stacked_observations[0])], + ) + ) + + stacked_observations = numpy.concatenate( # 向stoacked_observtions添加内容 + (stacked_observations, previous_observation) + ) + + return stacked_observations + + +class MinMaxStats: + """ + A class that holds the min-max values of the tree. + """ + + def __init__(self): + self.maximum = -float("inf") # 最大是-∞ + self.minimum = float("inf") # 最小是+∞ + # 跟类一定要update至少两次才能产生正确的范围。第一次更新掉max self.minimum: # 如果最大大于最小,说明至少更新了两次(第一次更新掉max0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config. 
+ """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action \ No newline at end of file diff --git a/simplifiedMuZero/without_rb/play_buffer.py b/simplifiedMuZero/without_rb/play_buffer.py new file mode 100644 index 00000000..ad13a67f --- /dev/null +++ b/simplifiedMuZero/without_rb/play_buffer.py @@ -0,0 +1,214 @@ +import numpy +import torch +import copy +class PlayBuffer: + """ + Class which run in a dedicated thread to store played games and generate batch. + """ + + def __init__(self, initial_checkpoint, initial_buffer, config): + self.config = config + self.buffer = copy.deepcopy(initial_buffer) # initial_buffer默认为{} + self.num_played_games = initial_checkpoint["num_played_games"] + self.num_played_steps = initial_checkpoint["num_played_steps"] + self.total_samples = sum( + [len(game_history.root_values) for game_history in self.buffer.values()] + ) + if self.total_samples != 0: + print( + f"Replay buffer initialized with {self.total_samples} samples ({self.num_played_games} games).\n" + ) + + # Fix random generator seed + numpy.random.seed(self.config.seed) + + def save_game(self, game_history): + self.buffer[self.num_played_games] = game_history + self.num_played_games += 1 + self.num_played_steps += len(game_history.root_values) + self.total_samples += len(game_history.root_values) + + if self.config.replay_buffer_size < len(self.buffer): + del_id = self.num_played_games - len(self.buffer) + self.total_samples -= len(self.buffer[del_id].root_values) + del self.buffer[del_id] + + def get_buffer(self): + return self.buffer + + def get_batch(self): + ( + index_batch, + observation_batch, + action_batch, + reward_batch, + value_batch, + policy_batch, + gradient_scale_batch, + ) = ([], [], [], [], [], [], []) + weight_batch = None + + for game_id, game_history, game_prob in self.sample_n_games( + self.config.batch_size + ): + game_pos, pos_prob = self.sample_position(game_history) + + values, rewards, policies, actions = self.make_target( + game_history, game_pos + ) + + index_batch.append([game_id, game_pos]) + observation_batch.append( + game_history.get_stacked_observations( + game_pos, + self.config.stacked_observations, + len(self.config.action_space), + ) + ) + action_batch.append(actions) + value_batch.append(values) + reward_batch.append(rewards) + policy_batch.append(policies) + gradient_scale_batch.append( + [ + min( + self.config.num_unroll_steps, + len(game_history.action_history) - game_pos, + ) + ] + * len(actions) + ) + + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1 + # value_batch: batch, num_unroll_steps+1 + # reward_batch: batch, num_unroll_steps+1 + # policy_batch: batch, num_unroll_steps+1, len(action_space) + # weight_batch: batch + # gradient_scale_batch: batch, num_unroll_steps+1 + return ( + index_batch, + ( + observation_batch, + action_batch, + value_batch, + reward_batch, + policy_batch, + weight_batch, + gradient_scale_batch, + ), + ) + + def sample_game(self, 
force_uniform=True): #将force_uniform 设置为True,强制安装平均分布选取 + """ + Sample game from buffer either uniformly or according to some priority. + See paper appendix Training. + """ + game_prob = None + + game_index = numpy.random.choice(len(self.buffer)) + game_id = self.num_played_games - len(self.buffer) + game_index + + return game_id, self.buffer[game_id], game_prob + + def sample_n_games(self, n_games): + selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) + game_prob_dict = {} + ret = [ + (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) + for game_id in selected_games + ] + return ret + + def sample_position(self, game_history): + """ + Sample position from game either uniformly or according to some priority. + See paper appendix Training. + """ + position_prob = None + + position_index = numpy.random.choice(len(game_history.root_values)) + + return position_index, position_prob + + def update_game_history(self, game_id, game_history): + # The element could have been removed since its selection and update + # if next(iter(self.buffer)) <= game_id: + # self.buffer[game_id] = game_history + + self.buffer[game_id] = game_history + + def compute_target_value(self, game_history, index): + # The value target is the discounted root value of the search tree td_steps into the + # future, plus the discounted sum of all rewards until then. + bootstrap_index = index + self.config.td_steps + if bootstrap_index < len(game_history.root_values): + root_values = ( + game_history.root_values + if game_history.reanalysed_predicted_root_values is None + else game_history.reanalysed_predicted_root_values + ) + last_step_value = ( + root_values[bootstrap_index] + if game_history.to_play_history[bootstrap_index] + == game_history.to_play_history[index] + else -root_values[bootstrap_index] + ) + + value = last_step_value * self.config.discount**self.config.td_steps + else: + value = 0 + + for i, reward in enumerate( + game_history.reward_history[index + 1 : bootstrap_index + 1] + ): + # The value is oriented from the perspective of the current player + value += ( + reward + if game_history.to_play_history[index] + == game_history.to_play_history[index + i] + else -reward + ) * self.config.discount**i + + return value + + def make_target(self, game_history, state_index): + """ + Generate targets for every unroll steps. 
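`compute_target_value` above forms the usual n-step return: discounted rewards up to `td_steps` ahead plus the discounted root value at the bootstrap position, with signs flipped whenever a reward or value belongs to the opponent. A single-player numeric sketch with a hypothetical history (so no sign flips):

discount, td_steps = 0.997, 3
reward_history = [0, 0.0, 1.0, 0.0, 1.0]  # index 0 is the dummy reward stored at reset
root_values = [0.5, 0.6, 0.7, 0.8]

index = 0
bootstrap_index = index + td_steps
value = root_values[bootstrap_index] * discount ** td_steps  # bootstrap term
for i, reward in enumerate(reward_history[index + 1: bootstrap_index + 1]):
    value += reward * discount ** i  # discounted intermediate rewards

print(value)  # 1.0 * 0.997 + 0.8 * 0.997 ** 3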
+ """ + target_values, target_rewards, target_policies, actions = [], [], [], [] + for current_index in range( + state_index, state_index + self.config.num_unroll_steps + 1 + ): + value = self.compute_target_value(game_history, current_index) + + if current_index < len(game_history.root_values): + target_values.append(value) + target_rewards.append(game_history.reward_history[current_index]) + target_policies.append(game_history.child_visits[current_index]) + actions.append(game_history.action_history[current_index]) + elif current_index == len(game_history.root_values): + target_values.append(0) + target_rewards.append(game_history.reward_history[current_index]) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(game_history.action_history[current_index]) + else: + # States past the end of games are treated as absorbing states + target_values.append(0) + target_rewards.append(0) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(numpy.random.choice(self.config.action_space)) + + return target_values, target_rewards, target_policies, actions diff --git a/simplifiedMuZero/without_rb/trainer.py b/simplifiedMuZero/without_rb/trainer.py new file mode 100644 index 00000000..265b13c5 --- /dev/null +++ b/simplifiedMuZero/without_rb/trainer.py @@ -0,0 +1,243 @@ +import numpy +import torch +import models + +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. + """ + + def __init__(self, model_cls, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = model_cls(self.config) + # self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." + ) + + # if initial_checkpoint["optimizer_state"] is not None: + # print("Loading optimizer...\n") + # self.optimizer.load_state_dict( + # copy.deepcopy(initial_checkpoint["optimizer_state"]) + # ) + + # # update weights 与 continuous update weights 的区别 + # # 1. update weights 是实际计算更新network的权重 + # # 2. 
continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + # def continuous_update_weights(self, play_buffer, terminate): # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + # next_batch = play_buffer.get_batch() + # # Training loop + # while self.training_step < self.config.training_steps and not terminate: + # index_batch, batch = next_batch + # next_batch = play_buffer.get_batch() + # self.update_lr() + # ( + # priorities, + # total_loss, + # value_loss, + # reward_loss, + # policy_loss, + # ) = self.update_weights(batch) + + def update_weights(self, batch): + """ + Perform one training step. + """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, 
len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum(1) + + return value_loss, reward_loss, policy_loss diff --git a/simplifiedMuZero/without_rb/trainer_no_PV.py b/simplifiedMuZero/without_rb/trainer_no_PV.py new file mode 100644 index 00000000..265b13c5 --- /dev/null +++ b/simplifiedMuZero/without_rb/trainer_no_PV.py @@ -0,0 +1,243 @@ +import numpy +import torch +import models + +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. 
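`loss_function` above treats the value, reward and policy heads uniformly as categorical cross-entropies over the support distributions produced by `scalar_to_support`; the core step in isolation, with random tensors standing in for the network outputs and targets:

import torch

batch_size, support_size = 4, 10
logits = torch.randn(batch_size, 2 * support_size + 1)  # raw head output
target = torch.softmax(torch.randn(batch_size, 2 * support_size + 1), dim=1)

loss = (-target * torch.nn.LogSoftmax(dim=1)(logits)).sum(1)
print(loss.shape)  # torch.Size([4]); update_weights later averages over the batch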
+ """ + + def __init__(self, model_cls, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = model_cls(self.config) + # self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." + ) + + # if initial_checkpoint["optimizer_state"] is not None: + # print("Loading optimizer...\n") + # self.optimizer.load_state_dict( + # copy.deepcopy(initial_checkpoint["optimizer_state"]) + # ) + + # # update weights 与 continuous update weights 的区别 + # # 1. update weights 是实际计算更新network的权重 + # # 2. continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + # def continuous_update_weights(self, play_buffer, terminate): # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + # next_batch = play_buffer.get_batch() + # # Training loop + # while self.training_step < self.config.training_steps and not terminate: + # index_batch, batch = next_batch + # next_batch = play_buffer.get_batch() + # self.update_lr() + # ( + # priorities, + # total_loss, + # value_loss, + # reward_loss, + # policy_loss, + # ) = self.update_weights(batch) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) 
+ + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum(1) + + return value_loss, reward_loss, policy_loss diff --git a/simplified_muzero.py b/simplified_muzero.py new file mode 100644 index 00000000..11cf7591 --- /dev/null +++ b/simplified_muzero.py @@ -0,0 +1,108 @@ +from simplifiedMuZero.net2.models2 import MuZeroNetwork_2net +from muzero_general import MuZeroGeneral +from muzero import load_model_menu, hyperparameter_search + +import json +import sys +import pathlib +import time +import nevergrad + +if __name__ == "__main__": + # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") + # start_time = time.time() + # muzero.train() + # end_time = time.time() + # print("耗时: {:.2f}秒".format(end_time - start_time)) + model_cls = MuZeroNetwork_2net + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. 
{games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZeroGeneral(game_name, model_cls=model_cls) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) + else: + break + print("\nDone") \ No newline at end of file diff --git a/test/Simple_grid_test.py b/test/Simple_grid_test.py new file mode 100644 index 00000000..501ac2df --- /dev/null +++ b/test/Simple_grid_test.py @@ -0,0 +1,23 @@ +import numpy as np + +from games.simple_grid import Game +import random +import time + +g = Game() +observation = g.env.get_observation() + +# print(observer) +for i in range(1000): + actions = g.legal_actions() + observation, reward, done = g.step(random.choice(actions)) + # g.render() + print(np.array(observation).shape) + + if done: + break + + + # time.sleep(10) + +g.close() diff --git a/test/deap_test.py b/test/deap_test.py new file mode 100644 index 00000000..51b930c8 --- /dev/null +++ b/test/deap_test.py @@ -0,0 +1,120 @@ +import copy +import random + +import deap +from games.tictactoe import Game, MuZeroConfig +import numpy as np + +config = MuZeroConfig() + +from deap import base, creator, tools +import numpy as np +# 定义问题 +# creator创建的是类,第一个参数是类名,第二个参数是基类,后面的是其它参数 +creator.create('FitnessMax', base.Fitness, weights=(1.0,)) +creator.create('Individual', list, fitness = creator.FitnessMax) + +legal_actions = 9 + +toolbox = base.Toolbox() +# 注册生成基因的函数。第一个参数是函数名,因此下面的调用是toolbox.Actions。 +# 第二鸽参数是生成action的函数。 +# 后边的参数是生成函数的参数,如此为np.random.choice(range(n), N, replace=False) 
+toolbox.register("Actions", np.random.choice, range(legal_actions), config.max_moves, replace=False) +# tools.initIterate返回一个生成的动作序列 +toolbox.register("Individual", tools.initIterate, creator.Individual, toolbox.Actions) + +# ind1 = toolbox.Individual() +# print(ind1) + +# 重复生成动作序列 +toolbox.register("population", tools.initRepeat, list, toolbox.Individual) + +# pop = toolbox.population(n=36) +# print(len(pop)) + + + +game = Game(0) +game2 = copy.deepcopy(game) +game.reset() +game2.reset() + +actions = game.legal_actions() +np.random.shuffle(actions) + +# for i in range(config.max_moves): +# # game.render() +# print(game.legal_actions()) +# observation, reward, done = game.step(np.random.choice(game.legal_actions())) +# +# if done: +# break + +def evaluate(actions): + game = Game(1) + game.reset() + + for i in range(len(actions)): + player = game.to_play() + observation, reward, done = game.step(actions[i]) + if done: + break + + game.close() + reward = reward if 0 == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + + +def game_evaluate(actions, game=None, play_id=None): + game = copy.deepcopy(game) + game.reset() + + for i in range(len(actions)): + player = game.to_play() + observation, reward, done = game.step(actions[i]) + if done: + break + + game.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i+1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + # print(actions[i]) + # game.render() + +toolbox.register("evaluate", game_evaluate, game=game, play_id = 0) +# toolbox.register("evaluate", evaluate) +toolbox.register("mate", tools.cxTwoPoint) +toolbox.register("mutate", tools.mutFlipBit, indpb=0.05) +# toolbox.register("select", tools.selTournament, tournsize=2000) +# toolbox.register("select", tools.selBest) +toolbox.register("select", tools.selStochasticUniversalSampling) + +pop = toolbox.population(n=100) + +# from deap import algorithms +# pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False) +# # print(logbook) +# result = tools.selBest(pop, k=1) + +results = [[0, 6, 8, 7, 4, 5, 2, 1, 3]] +print(results) +print(evaluate(results[0])) +reward = game_evaluate(results[0],game,0) +print(reward) + +# reward = game_evaluate([0,1,3,4,6,7,2,5,9],game,0) +# print(reward) +# +# for i in range(20): +# print(game_evaluate(pop[i], game, 0)) + +# print(evaluate(actions, game, 0)) + +# print(actions[:i]) +# game.render() +# game2.render() diff --git a/test/deap_test2.py b/test/deap_test2.py new file mode 100644 index 00000000..ad6de6bc --- /dev/null +++ b/test/deap_test2.py @@ -0,0 +1,119 @@ +import copy +import random + +import deap +from games.tictactoe import Game, MuZeroConfig +import numpy as np +from functools import partial + +config = MuZeroConfig() + +from deap import base, creator, tools +import numpy as np +# 定义问题 +# creator创建的是类,第一个参数是类名,第二个参数是基类,后面的是其它参数 +creator.create('FitnessMax', base.Fitness, weights=(1.0,)) +creator.create('Individual', list, fitness = creator.FitnessMax) + +legal_actions = 9 + +toolbox = base.Toolbox() +# 注册生成基因的函数。第一个参数是函数名,因此下面的调用是toolbox.Actions。 +# 第二鸽参数是生成action的函数。 +# 后边的参数是生成函数的参数,如此为np.random.choice(range(n), N, replace=False) +# toolbox.register("Actions", np.random.choice, range(legal_actions), config.max_moves, replace=False) +# # tools.initIterate返回一个生成的动作序列 +# toolbox.register("Individual", tools.initIterate, creator.Individual, toolbox.Actions) + +def individual(actions, max_moves, 
replace=False): + max_moves = max_moves if replace else len(actions) + return tools.initIterate(creator.Individual, partial(np.random.choice, actions, max_moves, replace=replace)) + +# print(individual([0,1,2,3,4], 9, replace=False)) +# print(individual([0,1,2,3,4], 9, replace=True)) +# exit() + +def population(actions, max_moves, N, replace=False): + return tools.initRepeat(list, partial(individual, actions, max_moves, replace), N) + +pop = population(range(9),9, N=4, replace=False) +print(pop) + +# exit() +# +# # 重复生成动作序列 +# toolbox.register("population", tools.initRepeat, list, toolbox.Individual) + +game = Game(0) + +actions = game.legal_actions() +np.random.shuffle(actions) + +def evaluate(actions): + game = Game(1) + game.reset() + + for i in range(len(actions)): + player = game.to_play() + observation, reward, done = game.step(actions[i]) + if done: + break + + game.close() + reward = reward if 0 == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + + +def game_evaluate(actions, game=None, play_id=None): + game = copy.deepcopy(game) + game.reset() + + for i in range(len(actions)): + player = game.to_play() + observation, reward, done = game.step(actions[i]) + if done: + break + + game.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i+1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + # print(actions[i]) + # game.render() + +toolbox.register("evaluate", game_evaluate, game=game, play_id = 0) +# toolbox.register("evaluate", evaluate) +toolbox.register("mate", tools.cxTwoPoint) +toolbox.register("mutate", tools.mutFlipBit, indpb=0.05) +# toolbox.register("select", tools.selTournament, tournsize=2000) +# toolbox.register("select", tools.selBest) +toolbox.register("select", tools.selStochasticUniversalSampling) + +# pop = toolbox.population(n=100) +# pop = [[0, 6, 8, 7, 4, 5, 2, 1, 3], [0, 6, 3, 7, 4, 5, 2, 1, 8]] + +from deap import algorithms +pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False) +# # print(logbook) +results = tools.selBest(pop, k=1) + +# results = [[0, 6, 8, 7, 4, 5, 2, 1, 3]] +print(results) +print(evaluate(results[0])) +reward = game_evaluate(results[0],game,0) +print(reward) + +# reward = game_evaluate([0,1,3,4,6,7,2,5,9],game,0) +# print(reward) +# +# for i in range(20): +# print(game_evaluate(pop[i], game, 0)) + +# print(evaluate(actions, game, 0)) + +# print(actions[:i]) +# game.render() +# game2.render() diff --git a/test/game_play_test.py b/test/game_play_test.py new file mode 100644 index 00000000..78fdc4a5 --- /dev/null +++ b/test/game_play_test.py @@ -0,0 +1,704 @@ +from self_play import MCTS, GameHistory +from games.simple_grid import MuZeroConfig, Game +# from games.tictactoe import MuZeroConfig, Game +import models + +import numpy +import torch + +import math +import time +import copy + +class MySelfPlay: + """ + Class which run in a dedicated thread to play games and save them to the replay-buffer. 
+ """ + + def __init__(self, model, initial_checkpoint, Game, config, seed): + self.config = config + self.game = Game(seed) + + # Fix random generator seed + numpy.random.seed(seed) + torch.manual_seed(seed) + + # Initialize the network + # self.model = models.MuZeroNetwork(self.config) + # self.model.set_weights(initial_checkpoint["weights"]) + self.model = model + self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) + self.model.eval() + self.trained_steps = initial_checkpoint["training_step"] + self.terminate = False + + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory + def play_game( + self, temperature, temperature_threshold, render, opponent, muzero_player + ): + """ + Play one game with actions based on the Monte Carlo tree search at each moves. + """ + game_history = GameHistory() + observation = self.game.reset() + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + game_id = None + + if render: + self.game.render() + + game_id = self.game.to_play() + + with torch.no_grad(): + while ( + not done and len(game_history.action_history) <= self.config.max_moves + ): # 游戏没有结束且运行步数小于最大移动步长 + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
+ stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + + # 一下的if-else部分主要是为了选择一个动作 + # Choose the action + if opponent == "self" or muzero_player == self.game.to_play(): + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + action = self.select_action( + root, + temperature + if not temperature_threshold + or len(game_history.action_history) < temperature_threshold + else 0, + ) # 根据temperature选择动作 + + if render: + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print( + f"Root value for player {self.game.to_play()}: {root.value():.2f}" + ) + else: + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 + opponent, stacked_observations + ) + + observation, reward, done = self.game.step(action) # 运行游戏 + + if render: + print(f"Played action: {self.game.action_to_string(action)}") + self.game.render() + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + return game_id, game_history + + def close_game(self): + self.game.close() + + def select_opponent_action(self, opponent, stacked_observations): + """ + Select opponent action for evaluating MuZero level. + """ + if opponent == "human": + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), + True, + ) + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print(f"Root value for player {self.game.to_play()}: {root.value():.2f}") + print( + f"Player {self.game.to_play()} turn. MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}" + ) + return self.game.human_to_action(), root + elif opponent == "expert": + return self.game.expert_agent(), None + elif opponent == "random": + assert ( + self.game.legal_actions() + ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}." + assert set(self.game.legal_actions()).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + + return numpy.random.choice(self.game.legal_actions()), None + else: + raise NotImplementedError( + 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' + ) + + # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 + # 公式为 c^(1/t)。可以看到: + # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 + # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config. 
+ """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action + +class PlayBuffer: + """ + Class which run in a dedicated thread to store played games and generate batch. + """ + + def __init__(self, initial_checkpoint, initial_buffer, config): + self.config = config + self.buffer = copy.deepcopy(initial_buffer) # initial_buffer默认为{} + self.num_played_games = initial_checkpoint["num_played_games"] + self.num_played_steps = initial_checkpoint["num_played_steps"] + self.total_samples = sum( + [len(game_history.root_values) for game_history in self.buffer.values()] + ) + if self.total_samples != 0: + print( + f"Replay buffer initialized with {self.total_samples} samples ({self.num_played_games} games).\n" + ) + + # Fix random generator seed + numpy.random.seed(self.config.seed) + + def save_game(self, game_history): + self.buffer[self.num_played_games] = game_history + self.num_played_games += 1 + self.num_played_steps += len(game_history.root_values) + self.total_samples += len(game_history.root_values) + + if self.config.replay_buffer_size < len(self.buffer): + del_id = self.num_played_games - len(self.buffer) + self.total_samples -= len(self.buffer[del_id].root_values) + del self.buffer[del_id] + + def get_buffer(self): + return self.buffer + + def get_batch(self): + ( + index_batch, + observation_batch, + action_batch, + reward_batch, + value_batch, + policy_batch, + gradient_scale_batch, + ) = ([], [], [], [], [], [], []) + weight_batch = None + + for game_id, game_history, game_prob in self.sample_n_games( + self.config.batch_size + ): + game_pos, pos_prob = self.sample_position(game_history) + + values, rewards, policies, actions = self.make_target( + game_history, game_pos + ) + + index_batch.append([game_id, game_pos]) + observation_batch.append( + game_history.get_stacked_observations( + game_pos, + self.config.stacked_observations, + len(self.config.action_space), + ) + ) + action_batch.append(actions) + value_batch.append(values) + reward_batch.append(rewards) + policy_batch.append(policies) + gradient_scale_batch.append( + [ + min( + self.config.num_unroll_steps, + len(game_history.action_history) - game_pos, + ) + ] + * len(actions) + ) + + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1 + # value_batch: batch, num_unroll_steps+1 + # reward_batch: batch, num_unroll_steps+1 + # policy_batch: batch, num_unroll_steps+1, len(action_space) + # weight_batch: batch + # gradient_scale_batch: batch, num_unroll_steps+1 + return ( + index_batch, + ( + observation_batch, + action_batch, + value_batch, + reward_batch, + policy_batch, + weight_batch, + gradient_scale_batch, + ), + ) + + def sample_game(self, force_uniform=True): #将force_uniform 设置为True,强制安装平均分布选取 + """ + Sample game from buffer either uniformly or according to some priority. + See paper appendix Training. 
+ """ + game_prob = None + + game_index = numpy.random.choice(len(self.buffer)) + game_id = self.num_played_games - len(self.buffer) + game_index + + return game_id, self.buffer[game_id], game_prob + + def sample_n_games(self, n_games): + selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) + game_prob_dict = {} + ret = [ + (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) + for game_id in selected_games + ] + return ret + + def sample_position(self, game_history): + """ + Sample position from game either uniformly or according to some priority. + See paper appendix Training. + """ + position_prob = None + + position_index = numpy.random.choice(len(game_history.root_values)) + + return position_index, position_prob + + def update_game_history(self, game_id, game_history): + # The element could have been removed since its selection and update + # if next(iter(self.buffer)) <= game_id: + # self.buffer[game_id] = game_history + + self.buffer[game_id] = game_history + + def compute_target_value(self, game_history, index): + # The value target is the discounted root value of the search tree td_steps into the + # future, plus the discounted sum of all rewards until then. + bootstrap_index = index + self.config.td_steps + if bootstrap_index < len(game_history.root_values): + root_values = ( + game_history.root_values + if game_history.reanalysed_predicted_root_values is None + else game_history.reanalysed_predicted_root_values + ) + last_step_value = ( + root_values[bootstrap_index] + if game_history.to_play_history[bootstrap_index] + == game_history.to_play_history[index] + else -root_values[bootstrap_index] + ) + + value = last_step_value * self.config.discount**self.config.td_steps + else: + value = 0 + + for i, reward in enumerate( + game_history.reward_history[index + 1 : bootstrap_index + 1] + ): + # The value is oriented from the perspective of the current player + value += ( + reward + if game_history.to_play_history[index] + == game_history.to_play_history[index + i] + else -reward + ) * self.config.discount**i + + return value + + def make_target(self, game_history, state_index): + """ + Generate targets for every unroll steps. 
+ """ + target_values, target_rewards, target_policies, actions = [], [], [], [] + for current_index in range( + state_index, state_index + self.config.num_unroll_steps + 1 + ): + value = self.compute_target_value(game_history, current_index) + + if current_index < len(game_history.root_values): + target_values.append(value) + target_rewards.append(game_history.reward_history[current_index]) + target_policies.append(game_history.child_visits[current_index]) + actions.append(game_history.action_history[current_index]) + elif current_index == len(game_history.root_values): + target_values.append(0) + target_rewards.append(game_history.reward_history[current_index]) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(game_history.action_history[current_index]) + else: + # States past the end of games are treated as absorbing states + target_values.append(0) + target_rewards.append(0) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(numpy.random.choice(self.config.action_space)) + + return target_values, target_rewards, target_policies, actions + +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. + """ + + def __init__(self, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + # self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." + ) + + # if initial_checkpoint["optimizer_state"] is not None: + # print("Loading optimizer...\n") + # self.optimizer.load_state_dict( + # copy.deepcopy(initial_checkpoint["optimizer_state"]) + # ) + + # # update weights 与 continuous update weights 的区别 + # # 1. update weights 是实际计算更新network的权重 + # # 2. continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + # def continuous_update_weights(self, play_buffer, terminate): # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + # next_batch = play_buffer.get_batch() + # # Training loop + # while self.training_step < self.config.training_steps and not terminate: + # index_batch, batch = next_batch + # next_batch = play_buffer.get_batch() + # self.update_lr() + # ( + # priorities, + # total_loss, + # value_loss, + # reward_loss, + # policy_loss, + # ) = self.update_weights(batch) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) 
+ + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( + 1 + ) + return value_loss, reward_loss, policy_loss + +if __name__ == "__main__": + config = MuZeroConfig() + + checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + + trainer = Trainer(checkpoint, config) + selfplay = MySelfPlay(trainer.model, checkpoint, Game, config, config.seed) + buffer = {} + play_buffer = PlayBuffer(checkpoint, buffer, config) + for i in range(config.training_steps): + game_id, game_history = selfplay.play_game(selfplay.config.visit_softmax_temperature_fn(0), selfplay.config.temperature_threshold, False, "self",0) + + # print(game_id) + # print(game_history.action_history) + print(game_history.reward_history) + muzero_reward = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == config.muzero_player + ) + + print(muzero_reward) + # print(game_history.to_play_history) + # # print(game_history.observation_history) + # print("child visits", game_history.child_visits) + # print(game_history.root_values) # root value指的是root节点的UCB值 + + # buffer[game_id] = game_history + + play_buffer.update_game_history(game_id, game_history) + + for i in range(10): + index_batch, batch = play_buffer.get_batch() + # print(batch[1]) + trainer.update_lr() + trainer.update_weights(batch) + + selfplay.close_game() + + diff --git a/test/load_model.py b/test/load_model.py new file mode 100644 index 00000000..88e83520 --- /dev/null +++ b/test/load_model.py @@ -0,0 +1,12 @@ +import torch + +import simplifiedMuZero.net2.models2 as models +from games.tictactoe import Game, MuZeroConfig + +from game_tournament import 
load_model + +config = MuZeroConfig() + +muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" +muzero_2net_model = load_model(models.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config) + diff --git a/test/mcts_test.py b/test/mcts_test.py new file mode 100644 index 00000000..d3edc0f3 --- /dev/null +++ b/test/mcts_test.py @@ -0,0 +1,245 @@ +import models +from self_play import MCTS, GameHistory, Node, MinMaxStats +from games.tictactoe import MuZeroConfig, Game + +import torch +import numpy +import math + +class MCTS1: + """ + Core Monte Carlo Tree Search algorithm. + To decide on an action, we run N simulations, always starting at the root of + the search tree and traversing the tree according to the UCB formula until we + reach a leaf node. + """ + + def __init__(self, config): + self.config = config + + # How run() works: + # 1. Obtain the root node + # (1) if a node is passed in via override_root_with, use it as the root; + # (2) otherwise: + # i. create a new Node(0) + # ii. call initial_inference on the observation to obtain the predicted value, reward, policy logits and hidden state + # iii. expand the newly created root with the data obtained in ii + # Note: in case (1) initial_inference does not need to be called + # 2. Add exploration noise to the root if requested + # 3. Simulate games in a loop, num_simulations times: + # (1) set node to the root and append it to the search path + # (2) if the node is already expanded, select a child by its UCB score, make it the current node and append it to the search path + # (3) repeat (2) until a node that has not been expanded is reached + # (4) take search_path[-2] as the parent (the last entry is the node itself) + # (5) call recurrent_inference to obtain the value, reward, policy logits and next hidden state + # (6) expand the node, i.e. create its children + # (7) backpropagate: increment the visit count and accumulate the value along the search path + # Note: each simulation expands exactly one new node, so the tree grows layer by layer + # 4. Return the expanded root so that the caller can pick an action from it + def run( + self, + model, + observation, + legal_actions, + to_play, + add_exploration_noise, + override_root_with=None, + ): + """ + At the root of the search tree we use the representation function to obtain a + hidden state given the current observation. + We then run a Monte Carlo Tree Search using only action sequences and the model + learned by the network. + """ + print(override_root_with) + if override_root_with: # use the provided node as the root if one is given, otherwise create a new one + root = override_root_with + root_predicted_value = None + else: + root = Node(0) + observation = ( + torch.tensor(observation) + .float() + .unsqueeze(0) + .to(next(model.parameters()).device) + ) # convert the observation to a tensor and add a batch dimension; the number of stacked previous observations is set by config.stacked_observations (use 0 to keep only the current observation) + ( + root_predicted_value, + reward, + policy_logits, + hidden_state, + ) = model.initial_inference(observation) + root_predicted_value = models.support_to_scalar( + root_predicted_value, self.config.support_size + ).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + assert ( + legal_actions + ), f"Legal actions should not be an empty array. Got {legal_actions}." + assert set(legal_actions).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space."
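+            # Descriptive note: expand() below creates one child per legal action, using the softmax of the
+            # predicted policy logits (restricted to the legal actions) as the children's priors; this is based
+            # on the Node.expand implementation in self_play.py.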
+ root.expand( + legal_actions, + to_play, + reward, + policy_logits, + hidden_state, + ) + + if add_exploration_noise: + root.add_exploration_noise( + dirichlet_alpha=self.config.root_dirichlet_alpha, + exploration_fraction=self.config.root_exploration_fraction, + ) + + min_max_stats = MinMaxStats() + + max_tree_depth = 0 + for _ in range(self.config.num_simulations): # 开始模拟游戏 + virtual_to_play = to_play + node = root + search_path = [node] + current_tree_depth = 0 + + # expanded根据node的子节点个数判断是否已经扩展了,如果没有子节点,说明没被扩展 + while node.expanded(): #这个循环一直在搜索没有expand的子节点。如果子节点已经expand了,则通过select_child选择下一个 + current_tree_depth += 1 + action, node = self.select_child(node, min_max_stats) #选取ucb最大的一个action,如果有多个action得分相同,随机选取一个 + search_path.append(node) #把节点添加到搜索队列 + + # Players play turn by turn + if virtual_to_play + 1 < len(self.config.players): + virtual_to_play = self.config.players[virtual_to_play + 1] + else: + virtual_to_play = self.config.players[0] + + # 在搜索树内部,我们使用动态函数来获取给定动作的下一个hidden_state和previous hidden state + # Inside the search tree we use the dynamics function to obtain the next hidden + # state given an action and the previous hidden state + parent = search_path[-2] # 选择倒数第二个节点,因为当前的node是-1,则-2是它的parent + value, reward, policy_logits, hidden_state = model.recurrent_inference( + parent.hidden_state, + torch.tensor([[action]]).to(parent.hidden_state.device), + ) + value = models.support_to_scalar(value, self.config.support_size).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + node.expand( + self.config.action_space, + virtual_to_play, + reward, + policy_logits, + hidden_state, + ) + + self.backpropagate(search_path, value, virtual_to_play, min_max_stats) + + max_tree_depth = max(max_tree_depth, current_tree_depth) + + extra_info = { + "max_tree_depth": max_tree_depth, + "root_predicted_value": root_predicted_value, + } + return root, extra_info + + # MCTS 的select child和之前SelfPlay的select action逻辑是不一样的 + # 1. select child是根据UCB选取的,select action是根据各个动作的visit count和temperature选取的 + # 2. select child 选择的对象是Node,Node是由当前的state执行action后生成的新Node形成的。select action单纯的是选action + def select_child(self, node, min_max_stats): + """ + Select the child with the highest UCB score. + """ + max_ucb = max( + self.ucb_score(node, child, min_max_stats) + for action, child in node.children.items() + ) + action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) + [ + action + for action, child in node.children.items() + if self.ucb_score(node, child, min_max_stats) == max_ucb + ] + ) + return action, node.children[action] + + def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 + """ + The score for a node is based on its value, plus an exploration bonus based on the prior. 
+ """ + pb_c = ( + math.log( + (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 + ) + + self.config.pb_c_init + ) + pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) + + prior_score = pb_c * child.prior # prior 之前的p_value + # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) + # prior_score = pbc * prior + + if child.visit_count > 0: + # Mean value Q + value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 + child.reward + + self.config.discount # 衰减系数, 之后乘以子节点的值 + * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 + ) + else: + value_score = 0 + + return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 + + # 反向传播算法 + # 对路径上的所有访问次数+1,value值加reward + def backpropagate(self, search_path, value, to_play, min_max_stats): # MCTS反向传播,visit count加1 + """ + At the end of a simulation, we propagate the evaluation all the way up the tree + to the root. + """ + if len(self.config.players) == 1: + for node in reversed(search_path): + node.value_sum += value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * node.value()) + + value = node.reward + self.config.discount * value + + elif len(self.config.players) == 2: + for node in reversed(search_path): + node.value_sum += value if node.to_play == to_play else -value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * -node.value()) + + value = ( + -node.reward if node.to_play == to_play else node.reward + ) + self.config.discount * value + + else: + raise NotImplementedError("More than two player mode not implemented.") + +config = MuZeroConfig() +game = Game(config.seed) + +game_history = GameHistory() + +observation = game.reset() + +game_history.action_history.append(0) +game_history.observation_history.append(observation) # 添加reset之后的observation +game_history.reward_history.append(0) +game_history.to_play_history.append(game.to_play()) + +stacked_observations = game_history.get_stacked_observations( -1, config.stacked_observations, len(config.action_space)) + +done = False + +model = models.MuZeroNetwork(config) + +root, mcts_info = MCTS1(config).run(model, stacked_observations, game.legal_actions(), game.to_play(), True) + +print(root) + +game.close() \ No newline at end of file diff --git a/test/muzero_config_test.py b/test/muzero_config_test.py new file mode 100644 index 00000000..1b5fc135 --- /dev/null +++ b/test/muzero_config_test.py @@ -0,0 +1,6 @@ +from games.simple_grid import MuZeroConfig + +if __name__ == "__main__": + config = MuZeroConfig() + config.results_path /= "config_test" + print(config.results_path) \ No newline at end of file diff --git a/test/ray_test.py b/test/ray_test.py new file mode 100644 index 00000000..7d7f0cf6 --- /dev/null +++ b/test/ray_test.py @@ -0,0 +1,20 @@ +import ray +import time + +ray.init() + +@ray.remote +def hello(): + return "Hello world!" 
+ + object_id = hello.remote() + + hello = ray.get(object_id) + + print(hello) + + # time.sleep(100) + results_ids = [ray.put(i) for i in range(10)] + print(ray.get(results_ids)) + + ray.shutdown() \ No newline at end of file diff --git a/trainer.py b/trainer.py index faa5f941..849beaa2 100644 --- a/trainer.py +++ b/trainer.py @@ -66,7 +66,7 @@ def continuous_update_weights(self, replay_buffer, shared_storage): next_batch = replay_buffer.get_batch.remote() # Training loop while self.training_step < self.config.training_steps and not ray.get( - shared_storage.get_info.remote("terminate") + shared_storage.get_info.remote("terminate") # terminate records whether the replay buffer and the other workers have stopped; it is independent of the game state ): index_batch, batch = ray.get(next_batch) next_batch = replay_buffer.get_batch.remote() @@ -117,7 +117,7 @@ def continuous_update_weights(self, replay_buffer, shared_storage): ) > self.config.ratio and self.training_step < self.config.training_steps - and not ray.get(shared_storage.get_info.remote("terminate")) + and not ray.get(shared_storage.get_info.remote("terminate")) # terminate records whether the replay buffer and the other workers have stopped; it is independent of the game state ): time.sleep(0.5) @@ -279,7 +279,7 @@ def update_lr(self): lr = self.config.lr_init * self.config.lr_decay_rate ** ( self.training_step / self.config.lr_decay_steps ) - for param_group in self.optimizer.param_groups: + for param_group in self.optimizer.param_groups: # apply the updated learning rate to the optimizer param_group["lr"] = lr @staticmethod