fix: fixed ray's error 'No module named aiohttp.signals' #218

Open · wants to merge 9 commits into base: master

4 changes: 3 additions & 1 deletion .gitignore
@@ -90,4 +90,6 @@ venv.bak/
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
dmypy.json

results/
392 changes: 392 additions & 0 deletions game_tournament.py

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions games/simple_grid.py
@@ -23,6 +23,8 @@ def __init__(self):
self.players = list(range(1)) # List of players. You should only edit the length
self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation

self.action_replace = True

# Evaluate
self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second)
self.opponent = None # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class
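The new `action_replace` flag is only declared in the game configs in this PR (True here, False in games/tictactoe.py below); it is presumably consumed by the new game_tournament.py, which is not rendered above. A minimal, hypothetical sketch of how such a flag could be used — the `candidate_actions`, `legal_actions`, and `played` names are illustrative and not taken from this PR:

```python
# Hypothetical helper (not part of this PR): restrict the candidate actions
# when the config says actions may not be repeated (action_replace = False).
def candidate_actions(legal_actions, played, action_replace):
    """Return the actions a player may still choose from this turn."""
    if action_replace:
        # Actions may be chosen again, e.g. moving around a grid world.
        return list(legal_actions)
    # Each action may only be taken once, e.g. claiming a square in tic-tac-toe.
    return [a for a in legal_actions if a not in played]


print(candidate_actions(range(9), played={0, 4}, action_replace=False))
# -> [1, 2, 3, 5, 6, 7, 8]
```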
24 changes: 17 additions & 7 deletions games/tictactoe.py
@@ -27,7 +27,8 @@ def __init__(self):
self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second)
self.opponent = "expert" # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class


# Whether an action can be repeated
self.action_replace = False

### Self-Play
self.num_workers = 1 # Number of simultaneous threads/workers self-playing to feed the replay buffer
@@ -48,7 +49,8 @@ def __init__(self):


### Network
self.network = "resnet" # "resnet" / "fullyconnected"
# self.network = "resnet" # "resnet" / "fullyconnected"
self.network = "fullyconnected"
self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. Choose it so that support_size <= sqrt(max(abs(discounted reward)))

# Residual Network
@@ -63,19 +65,27 @@ def __init__(self):
self.resnet_fc_policy_layers = [8] # Define the hidden layers in the policy head of the prediction network

# Fully Connected Network
# self.encoding_size = 32
# self.fc_representation_layers = [] # Define the hidden layers in the representation network
# self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network
# self.fc_reward_layers = [16] # Define the hidden layers in the reward network
# self.fc_value_layers = [] # Define the hidden layers in the value network
# self.fc_policy_layers = [] # Define the hidden layers in the policy network

self.encoding_size = 32
self.fc_representation_layers = [] # Define the hidden layers in the representation network
self.fc_representation_layers = [16] # Define the hidden layers in the representation network
self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network
self.fc_reward_layers = [16] # Define the hidden layers in the reward network
self.fc_value_layers = [] # Define the hidden layers in the value network
self.fc_policy_layers = [] # Define the hidden layers in the policy network

self.fc_value_layers = [16] # Define the hidden layers in the value network
self.fc_policy_layers = [16]


### Training
self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs
self.save_model = True # Save the checkpoint in results_path as model.checkpoint
self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch)
# self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch)
# self.training_steps = 50000
self.training_steps = 500000
self.batch_size = 64 # Number of parts of games to train on at each training step
self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing
self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
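For orientation, the fully connected settings above feed the `mlp` builder shown later in this diff (models.py). Below is a small sketch of the layer stacks that configuration implies, assuming the standard tic-tac-toe observation of 3x3x3 = 27 inputs with no stacked observations, 9 actions, and support_size = 10 (those sizes are assumptions from the usual game config, not changes made by this PR):

```python
import torch

def mlp(input_size, layer_sizes, output_size,
        activation=torch.nn.ELU, output_activation=torch.nn.Identity):
    # Same pattern as models.py: Linear layers, an activation after every layer,
    # and output_activation after the last one.
    sizes = [input_size] + layer_sizes + [output_size]
    layers = []
    for i in range(len(sizes) - 1):
        act = activation if i < len(sizes) - 2 else output_activation
        layers += [torch.nn.Linear(sizes[i], sizes[i + 1]), act()]
    return torch.nn.Sequential(*layers)

encoding_size, action_space, full_support = 32, 9, 2 * 10 + 1
representation = mlp(27, [16], encoding_size)                            # observation -> hidden state
dynamics_state = mlp(encoding_size + action_space, [16], encoding_size)  # (state, action) -> next state
reward_head = mlp(encoding_size, [16], full_support)                     # next state -> reward distribution
value_head = mlp(encoding_size, [16], full_support)                      # state -> value distribution
policy_head = mlp(encoding_size, [16], action_space)                     # state -> policy logits
print(representation)  # Linear(27, 16) -> ELU -> Linear(16, 32) -> Identity
```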
47 changes: 27 additions & 20 deletions models.py
@@ -94,6 +94,7 @@ def __init__(
super().__init__()
self.action_space_size = action_space_size
self.full_support_size = 2 * support_size + 1
# support_size defines the value range [-support_size, support_size]; the +1 is so the support covers all 2 * support_size + 1 integers in that range

self.representation_network = torch.nn.DataParallel(
mlp(
@@ -107,6 +108,7 @@ def __init__(
)
)

# The dynamics network's input is encoding_size + action_space_size (encoded state concatenated with the one-hot action)
self.dynamics_encoded_state_network = torch.nn.DataParallel(
mlp(
encoding_size + self.action_space_size,
@@ -115,14 +117,14 @@ def __init__(
)
)
self.dynamics_reward_network = torch.nn.DataParallel(
mlp(encoding_size, fc_reward_layers, self.full_support_size)
mlp(encoding_size, fc_reward_layers, self.full_support_size) # output size is full_support_size, since rewards are encoded over [-support_size, support_size]
)

self.prediction_policy_network = torch.nn.DataParallel(
mlp(encoding_size, fc_policy_layers, self.action_space_size)
mlp(encoding_size, fc_policy_layers, self.action_space_size) # outputs the policy logits over the actions
)
self.prediction_value_network = torch.nn.DataParallel(
mlp(encoding_size, fc_value_layers, self.full_support_size)
mlp(encoding_size, fc_value_layers, self.full_support_size) # output size is full_support_size, since values are encoded over [-support_size, support_size]
)

def prediction(self, encoded_state):
@@ -134,35 +136,39 @@ def representation(self, observation):
encoded_state = self.representation_network(
observation.view(observation.shape[0], -1)
)

# Min-max normalization
# Scale encoded state between [0, 1] (See appendix paper Training)
min_encoded_state = encoded_state.min(1, keepdim=True)[0]
max_encoded_state = encoded_state.max(1, keepdim=True)[0]
scale_encoded_state = max_encoded_state - min_encoded_state
scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5
scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5 # avoid division by zero, which would produce NaN
encoded_state_normalized = (
encoded_state - min_encoded_state
) / scale_encoded_state
return encoded_state_normalized

# dynamics differs from representation only in that it takes the encoded state concatenated with the action as input; representation does not involve an action
def dynamics(self, encoded_state, action):
# Stack encoded_state with a game specific one hot encoded action (See paper appendix Network Architecture)
action_one_hot = (
torch.zeros((action.shape[0], self.action_space_size))
.to(action.device)
.float()
)
action_one_hot.scatter_(1, action.long(), 1.0)
action_one_hot.scatter_(1, action.long(), 1.0) # set the entry at the action index to 1
x = torch.cat((encoded_state, action_one_hot), dim=1)

next_encoded_state = self.dynamics_encoded_state_network(x)

reward = self.dynamics_reward_network(next_encoded_state)

# Min-max normalization
# Scale encoded state between [0, 1] (See paper appendix Training)
min_next_encoded_state = next_encoded_state.min(1, keepdim=True)[0]
max_next_encoded_state = next_encoded_state.max(1, keepdim=True)[0]
scale_next_encoded_state = max_next_encoded_state - min_next_encoded_state
scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5
scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5 # avoid division by zero, which would produce NaN
next_encoded_state_normalized = (
next_encoded_state - min_next_encoded_state
) / scale_next_encoded_state
@@ -172,7 +178,7 @@ def dynamics(self, encoded_state, action):
def initial_inference(self, observation):
encoded_state = self.representation(observation)
policy_logits, value = self.prediction(encoded_state)
# reward equal to 0 for consistency
# reward equal to 0 for consistency
reward = torch.log(
(
torch.zeros(1, self.full_support_size)
Expand All @@ -181,6 +187,7 @@ def initial_inference(self, observation):
.to(observation.device)
)
)
# reward looks like [[0, 0, ..., 0, 1, 0, ..., 0, 0], ...]: a 1 in the middle bin, 0 elsewhere, repeated once per observation row

return (
value,
@@ -605,8 +612,8 @@ def initial_inference(self, observation):
reward = torch.log(
(
torch.zeros(1, self.full_support_size)
.scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0)
.repeat(len(observation), 1)
.scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0) # put a 1 in the middle bin (index support_size)
.repeat(len(observation), 1) # repeat per observation so the reward has the same batch dimension as the observation
.to(observation.device)
)
)
@@ -637,29 +644,29 @@ def mlp(
sizes = [input_size] + layer_sizes + [output_size]
layers = []
for i in range(len(sizes) - 1):
act = activation if i < len(sizes) - 2 else output_activation
act = activation if i < len(sizes) - 2 else output_activation # every layer uses activation except the last, which uses output_activation
layers += [torch.nn.Linear(sizes[i], sizes[i + 1]), act()]
return torch.nn.Sequential(*layers)


def support_to_scalar(logits, support_size):
def support_to_scalar(logits, support_size): # logits: categorical value logits; support_size: half-width of the decoded range
"""
Transform a categorical representation to a scalar
See paper appendix Network Architecture
"""
# Decode to a scalar
probabilities = torch.softmax(logits, dim=1)
probabilities = torch.softmax(logits, dim=1) # softmax makes each row sum to 1; shape [stacked_size, full_support_size]
support = (
torch.tensor([x for x in range(-support_size, support_size + 1)])
torch.tensor([x for x in range(-support_size, support_size + 1)]) # support values from -support_size to support_size inclusive
.expand(probabilities.shape)
.float()
.to(device=probabilities.device)
)
x = torch.sum(support * probabilities, dim=1, keepdim=True)
) # shape [stacked_size, full_support_size]
x = torch.sum(support * probabilities, dim=1, keepdim=True) # expected value over the support; keepdim=True gives shape [stacked_size, 1] rather than [stacked_size]

# Invert the scaling (defined in https://arxiv.org/abs/1805.11593)
x = torch.sign(x) * (
((torch.sqrt(1 + 4 * 0.001 * (torch.abs(x) + 1 + 0.001)) - 1) / (2 * 0.001))
x = torch.sign(x) * ( # sign(x) is -1 for x < 0, 1 for x > 0, 0 for x == 0; it restores the sign of x
((torch.sqrt(1 + 4 * 0.001 * (torch.abs(x) + 1 + 0.001)) - 1) / (2 * 0.001)) # (sqrt(1 + 0.004 * (|x| + 1.001)) - 1) / 0.002
** 2
- 1
)
@@ -675,9 +682,9 @@ def scalar_to_support(x, support_size):
x = torch.sign(x) * (torch.sqrt(torch.abs(x) + 1) - 1) + 0.001 * x

# Encode on a vector
x = torch.clamp(x, -support_size, support_size)
floor = x.floor()
prob = x - floor
x = torch.clamp(x, -support_size, support_size) # clamp x into [-support_size, support_size]
floor = x.floor() # round down to the nearest integer (ceil would round up)
prob = x - floor # keep the fractional part, since support_to_scalar multiplies each bin index by its probability
logits = torch.zeros(x.shape[0], x.shape[1], 2 * support_size + 1).to(x.device)
logits.scatter_(
2, (floor + support_size).long().unsqueeze(-1), (1 - prob).unsqueeze(-1)
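Several of the new comments document support_to_scalar / scalar_to_support. The following standalone sketch re-implements that categorical value encoding and checks the round trip; the tail of scalar_to_support is not visible in the hunk above, so the version below is a reconstruction that may differ in detail from the repository's code:

```python
import torch

def support_to_scalar(logits, support_size):
    # Expected value over the integer support [-support_size, support_size],
    # then invert the scaling h(x) = sign(x) * (sqrt(|x| + 1) - 1) + 0.001 * x.
    probabilities = torch.softmax(logits, dim=1)
    support = (
        torch.arange(-support_size, support_size + 1)
        .expand(probabilities.shape)
        .float()
    )
    x = torch.sum(support * probabilities, dim=1, keepdim=True)
    x = torch.sign(x) * (
        ((torch.sqrt(1 + 4 * 0.001 * (torch.abs(x) + 1 + 0.001)) - 1) / (2 * 0.001)) ** 2
        - 1
    )
    return x

def scalar_to_support(x, support_size):
    # Scale, clamp into [-support_size, support_size], then spread each value
    # over its two neighbouring integer bins according to the fractional part.
    x = torch.sign(x) * (torch.sqrt(torch.abs(x) + 1) - 1) + 0.001 * x
    x = torch.clamp(x, -support_size, support_size)
    floor = x.floor()
    prob = x - floor
    logits = torch.zeros(x.shape[0], x.shape[1], 2 * support_size + 1)
    logits.scatter_(2, (floor + support_size).long().unsqueeze(-1), (1 - prob).unsqueeze(-1))
    indexes = floor + support_size + 1
    prob = prob.masked_fill_(2 * support_size < indexes, 0.0)        # upper bin out of range
    indexes = indexes.masked_fill_(2 * support_size < indexes, 0.0)  # redirect harmlessly to bin 0
    logits.scatter_(2, indexes.long().unsqueeze(-1), prob.unsqueeze(-1))
    return logits

# Round trip: encode a scalar, then decode it back (approximately).
value = torch.tensor([[3.7]])
encoded = scalar_to_support(value, support_size=10)                  # shape [1, 1, 21]
decoded = support_to_scalar(torch.log(encoded[0] + 1e-12), support_size=10)
print(value.item(), decoded.item())                                  # 3.7 vs. roughly 3.7
```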
4 changes: 4 additions & 0 deletions muzero.py
@@ -43,6 +43,7 @@ def __init__(self, game_name, config=None, split_resources_in=1):
# Load the game and the config from the module with the game name
try:
game_module = importlib.import_module("games." + game_name)
print("games." + game_name)
self.Game = game_module.Game
self.config = game_module.MuZeroConfig()
except ModuleNotFoundError as err:
@@ -671,7 +672,10 @@ def load_model_menu(muzero, game_name):
choice = input("Invalid input, enter a number listed above: ")
choice = int(choice)
if choice == 0:
start_time = time.time()
muzero.train()
end_time = time.time()
print("耗时: {:.2f}秒".format(end_time - start_time))
elif choice == 1:
load_model_menu(muzero, game_name)
elif choice == 2:
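A brief aside on the timing change above: time.time() measures wall-clock time and can jump if the system clock is adjusted, so a monotonic clock is usually preferable for pure elapsed-time measurement. A small sketch (the time module is assumed to already be imported in muzero.py; the sleep below is just a placeholder for the long-running train() call):

```python
import time

start = time.perf_counter()   # monotonic, unaffected by system clock adjustments
time.sleep(0.5)               # placeholder for muzero.train()
elapsed = time.perf_counter() - start
print("Elapsed time: {:.2f}s".format(elapsed))
```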