diff --git a/.gitignore b/.gitignore
index f106bb6b..844f676b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -90,4 +90,6 @@ venv.bak/
 # mypy
 .mypy_cache/
 .dmypy.json
-dmypy.json
\ No newline at end of file
+dmypy.json
+
+results/
\ No newline at end of file
diff --git a/game_tournament.py b/game_tournament.py
new file mode 100644
index 00000000..81b1e363
--- /dev/null
+++ b/game_tournament.py
@@ -0,0 +1,392 @@
+import pickle
+
+import torch
+import copy
+import numpy
+
+from games.tictactoe import MuZeroConfig, Game
+import models
+import simplifiedMuZero.net2.models2 as models2
+from self_play import MCTS, GameHistory, SelfPlay
+
+
+class GameTournament:
+    def __init__(self, config: MuZeroConfig):
+        self.models = []
+        self.game = Game(config.seed)
+        self.config = config
+        self.board = numpy.zeros((3, 3), dtype="int32")
+        self.player = 0
+
+    def have_winner(self):
+        # Horizontal and vertical checks
+        for i in range(3):
+            if (self.board[i, :] == self.player * numpy.ones(3, dtype="int32")).all():
+                return True
+            if (self.board[:, i] == self.player * numpy.ones(3, dtype="int32")).all():
+                return True
+
+        # Diagonal checks
+        if (
+            self.board[0, 0] == self.player
+            and self.board[1, 1] == self.player
+            and self.board[2, 2] == self.player
+        ):
+            return True
+        if (
+            self.board[2, 0] == self.player
+            and self.board[1, 1] == self.player
+            and self.board[0, 2] == self.player
+        ):
+            return True
+
+        return False
+
+    def play_competition(self, model1, search_policy1, model2, search_policy2):
+        game_history = GameHistory()
+
+        observation = self.game.reset()
+
+        game_history.action_history.append(0)
+        game_history.observation_history.append(observation)  # store the observation returned by reset()
+        game_history.reward_history.append(0)
+        game_history.to_play_history.append(self.game.to_play())
+
+        done = False
+
+        model1.eval()
+        model2.eval()
+
+        is_model1 = True
+        while not done:
+            assert (
+                len(numpy.array(observation).shape) == 3
+            ), f"Observation should be 3 dimensional instead of {len(numpy.array(observation).shape)} dimensional. Got observation of shape: {numpy.array(observation).shape}"
+            assert (
+                numpy.array(observation).shape == self.config.observation_shape
+            ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}."
+            stacked_observations = game_history.get_stacked_observations(
+                -1, self.config.stacked_observations, len(self.config.action_space)
+            )
+
+            model = model1 if is_model1 else model2
+            search_policy = search_policy1 if is_model1 else search_policy2
+
+            root, mcts_info = search_policy(self.config).run(
+                model,
+                stacked_observations,
+                self.game.legal_actions(),
+                self.game.to_play(),  # to_play returns the id of the player whose turn it is (0 by default)
+                True,
+            )
+
+            action = SelfPlay.select_action(root, 0)  # temperature 0 means no exploration noise: always pick the most visited action
+            observation, reward, done = self.game.step(action)
+
+            game_history.store_search_statistics(root, self.config.action_space)
+
+            # Next batch
+            game_history.action_history.append(action)
+            game_history.observation_history.append(observation)  # append to the observation history; get_stacked_observations later reads it from the end backwards
+            game_history.reward_history.append(reward)
+            game_history.to_play_history.append(self.game.to_play())
+
+            # if the game is not over, the other model moves next
+            if not done:
+                is_model1 = not is_model1
+
+        # print("is model", is_model1, "reward is ", reward)
+
+        # Restore the player id to its previous value, otherwise the winner check (have_winner) would be wrong
+        self.game.env.player *= -1
+
+        # Return value:
+        # | is_model1 | reward > 0 | returned flag |
+        # |-----------|------------|---------------|
+        # |   True    |   True     |     True      |  model1 made the last move and won  -> model1 is the winner
+        # |   True    |   False    |     False     |  model1 made the last move and lost -> model2 is the winner
+        # |   False   |   True     |     False     |  model2 made the last move and won  -> model2 is the winner
+        # |   False   |   False    |     True      |  model2 made the last move and lost -> model1 is the winner
+        return self.game.env.have_winner(), is_model1 == (reward > 0)
+
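A reading aid, not part of the patch: the tuple returned by play_competition above is consumed as in the hypothetical snippet below (tournament, model_a and model_b are placeholder names), mirroring what play_tournament does further down in this file.

    have_winner, model1_won = tournament.play_competition(model_a, MCTS, model_b, MCTS)
    if not have_winner:
        result = "draw"
    else:
        result = "model_a wins" if model1_won else "model_b wins"
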
+    def play_with_expert(self, model, search_policy, expert_first=True):
+        game_history = GameHistory()
+
+        observation = self.game.reset()
+
+        game_history.action_history.append(0)
+        game_history.observation_history.append(observation)  # store the observation returned by reset()
+        game_history.reward_history.append(0)
+        game_history.to_play_history.append(self.game.to_play())
+
+        done = False
+
+        model.eval()
+
+        is_model = not expert_first
+        while not done:
+            assert (
+                len(numpy.array(observation).shape) == 3
+            ), f"Observation should be 3 dimensional instead of {len(numpy.array(observation).shape)} dimensional. Got observation of shape: {numpy.array(observation).shape}"
+            assert (
+                numpy.array(observation).shape == self.config.observation_shape
+            ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}."
+            stacked_observations = game_history.get_stacked_observations(
+                -1, self.config.stacked_observations, len(self.config.action_space)
+            )
+
+            if is_model:
+                root, mcts_info = search_policy(self.config).run(
+                    model,
+                    stacked_observations,
+                    self.game.legal_actions(),
+                    self.game.to_play(),  # to_play returns the id of the player whose turn it is (0 by default)
+                    True,
+                )
+                action = SelfPlay.select_action(root, 0)  # temperature 0 means no exploration noise: always pick the most visited action
+            else:
+                action = self.game.expert_agent()
+                root = None
+
+            observation, reward, done = self.game.step(action)
+
+            game_history.store_search_statistics(root, self.config.action_space)
+
+            # Next batch
+            game_history.action_history.append(action)
+            game_history.observation_history.append(observation)  # append to the observation history; get_stacked_observations later reads it from the end backwards
+            game_history.reward_history.append(reward)
+            game_history.to_play_history.append(self.game.to_play())
+
+            # if the game is not over, alternate between the model and the expert
+            if not done:
+                is_model = not is_model
+
+        # print("is model", is_model1, "reward is ", reward)
+
+        # Restore the player id to its previous value, otherwise the winner check (have_winner) would be wrong
+        self.game.env.player *= -1
+
+        # Return value (same convention as play_competition):
+        # | is_model | reward > 0 | returned flag |
+        # |----------|------------|---------------|
+        # |   True   |   True     |     True      |  the model made the last move and won  -> the model is the winner
+        # |   True   |   False    |     False     |  the model made the last move and lost -> the expert is the winner
+        # |   False  |   True     |     False     |  the expert made the last move and won -> the expert is the winner
+        # |   False  |   False    |     True      |  the expert made the last move and lost -> the model is the winner
+        return self.game.env.have_winner(), is_model == (reward > 0)
+
+    def close_game(self):
+        self.game.close()
+
+    def play_tournament(self, models, rollnum=1000):
+        model_num = len(models)
+
+        for i in range(model_num):
+            for j in range(i + 1, model_num):
+                model1 = models[i]["model"]
+                model2 = models[j]["model"]
+
+                # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)])
+                model1_win_num = 0
+                model2_win_num = 0
+                no_winner_num = 0
+
+                for _ in range(rollnum):
+                    have_winner, is_model1 = self.play_competition(model1, MCTS, model2, MCTS)
+
+                    if have_winner:
+                        if is_model1:
+                            model1_win_num += 1
+                        else:
+                            model2_win_num += 1
+                    else:
+                        no_winner_num += 1
+
+                # # swap the order and play again
+                # for _ in range(rollnum):
+                #     have_winner, is_model1 = self.play_competition(model2, MCTS, model1, MCTS)
+                #
+                #     if have_winner:
+                #         if is_model1:
+                #             model2_win_num += 1
+                #         else:
+                #             model1_win_num += 1
+                #     else:
+                #         no_winner_num += 1
+
+                # print(is_model1)
+
+                print(models[i]["name"], " ,", models[j]["name"], " : ")
+
+                print(models[i]["name"], " win : ", model1_win_num)
+                print(models[j]["name"], " win : ", model2_win_num)
+                print("No Winner", no_winner_num)
+                print("===================================")
+
+        model1_win_num = 0
+        model2_win_num = 0
+        no_winner_num = 0
+        for i in range(model_num):
+            for j in range(i + 1, model_num):
+                model1 = models[i]["model"]
+                model2 = models[j]["model"]
+
+                # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)])
+                model1_win_num = 0
+                model2_win_num = 0
+                no_winner_num = 0
+
+                # second pass with the play order swapped, so the labels printed below (models[j] first) match the counters
+                for _ in range(rollnum):
+                    have_winner, is_model1 = self.play_competition(model2, MCTS, model1, MCTS)
+
+                    if have_winner:
+                        if is_model1:
+                            model1_win_num += 1
+                        else:
+                            model2_win_num += 1
+                    else:
+                        no_winner_num += 1
+
+                print(models[j]["name"], " ,", models[i]["name"], " : ")
+
+                print(models[j]["name"], " win : ", model1_win_num)
+                print(models[i]["name"], " win : ", model2_win_num)
+                print("No Winner", no_winner_num)
+                print("===================================")
+
+    def play_tournament_with_expert(self, models, rollnum=1000):
+        model_num = len(models)
+
+        for i in range(model_num):
+            model = models[i]["model"]
+
+            # model1_win_num = sum([game_tournament.play_tournament(model2, "", model1, "") for i in range(rollnum)])
+            model_win_num = 0
+            expert_win_num = 0
+            no_winner_num = 0
+
+            for _ in range(rollnum):
+                have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False)
+
+                if have_winner:
+                    if is_model:
+                        model_win_num += 1
+                    else:
+                        expert_win_num += 1
+                else:
+                    no_winner_num += 1
+
+                # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True)
+                #
+                # if have_winner:
+                #     if is_model:
+                #         model_win_num += 1
+                #     else:
+                #         expert_win_num += 1
+                # else:
+                #     no_winner_num += 1
+
+            print(models[i]["name"], " ,", "expert : ")
+
+            print(models[i]["name"], " win : ", model_win_num)
+            print("expert win : ", expert_win_num)
+            print("No Winner", no_winner_num)
+            print("===================================")
+
+            model_win_num = 0
+            expert_win_num = 0
+            no_winner_num = 0
+            for _ in range(rollnum):
+                # have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=False)
+                #
+                # if have_winner:
+                #     if is_model:
+                #         model_win_num += 1
+                #     else:
+                #         expert_win_num += 1
+                # else:
+                #     no_winner_num += 1
+
+                have_winner, is_model = self.play_with_expert(model, MCTS, expert_first=True)
+
+                if have_winner:
+                    if is_model:
+                        model_win_num += 1
+                    else:
+                        expert_win_num += 1
+                else:
+                    no_winner_num += 1
+
+            print("expert : ", " ,", models[i]["name"])
+
+            print("expert win : ", expert_win_num)
+            print(models[i]["name"], " win : ", model_win_num)
+            print("No Winner", no_winner_num)
+            print("===================================")
+
+
+def load_model(model_cls, model_path, config):
+    checkpoint = torch.load(model_path)
+    model = model_cls(config)
+    model.set_weights(checkpoint["weights"])
+
+    return model
+
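A hedged variant of the load_model helper above, not part of the patch: if a checkpoint was saved during a GPU run and is later loaded on a CPU-only machine, torch.load may need an explicit map_location. The "weights" key and set_weights call match the checkpoint format used elsewhere in this repository; load_model_cpu is a hypothetical name.

    def load_model_cpu(model_cls, model_path, config):
        checkpoint = torch.load(model_path, map_location="cpu")  # keep all tensors on the CPU
        model = model_cls(config)
        model.set_weights(checkpoint["weights"])
        return model
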
+
+if __name__ == "__main__":
+    config = MuZeroConfig()
+
+    # config.network = "fullyconnected"
+    # checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-10--20-03-39\model.checkpoint"
+    checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe2\2023-08-23--16-24-04\model.checkpoint"
+    checkpoint_path1 = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe2\2023-08-23--17-12-53\model.checkpoint"
+    muzero_model = load_model(models.MuZeroNetwork, checkpoint_path1, config)
+
+    # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint"
+    # muzero_2net_model = load_model(models.MuZeroNetwork, muzero_2net_checkpoint_path, config)
+
+    config2 = MuZeroConfig()
+    # config2.network = "resnet"
+    # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint"
+    # muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-22--20-25-51\muzero_2net\model.checkpoint"
+    muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe2\2023-08-24--02-55-21\muzero_2net\model.checkpoint"
+    muzero_2net_model = load_model(models2.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config2)
+
+    # uniform_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--08-20-50\muzero_uniform\model.checkpoint"
+    # uniform_model = load_model(models.MuZeroNetwork, uniform_checkpoint_path, config)
+    #
+    # without_rb_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-16--04-35-40\muzero_without_rb\model.checkpoint"
+    # without_rb_model = load_model(models.MuZeroNetwork, without_rb_checkpoint_path, config)
+    #
+    # muzero_no_policy_value_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint"
+    # muzero_no_policy_model = load_model(models.MuZeroNetwork, muzero_no_policy_value_checkpoint_path, config)
+    #
+    #
+    # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-15--11-08-42\muzero_2net\model.checkpoint"
+    # simplified_muzero = load_model(models.MuZeroNetwork, simplified_muzero_checkpoint_path, config)
+    #
+    # # simplified_muzero_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-18--03-02-10\MuZeroNetwork_2net\model.checkpoint"
+    # # simplified_muzero = load_model(models_2net.SimplifiedMuZeroNetwork, simplified_muzero_checkpoint_path, config)
+
+    game_tournament = GameTournament(config)
+
+    models = [
+        {"name": "muzero_2net", "model": muzero_2net_model},
+        # {"name": "uniform", "model": uniform_model},
+        {"name": "muzero", "model": muzero_model},
+        # {"name": "muzero2", "model": muzero_model},
+        # {"name": "without_rb", "model": without_rb_model},
+        # {"name": "no policy value", "model": muzero_no_policy_model},
+        # {"name": "simplified_muzero", "model": without_rb_model},
+    ]
+
+    # game_tournament.play_tournament(models, rollnum=1000)
+    # game_tournament.play_tournament(models, rollnum=1000)
+    game_tournament.play_tournament_with_expert(models, rollnum=500)
+
+    game_tournament.close_game()
+
diff --git a/games/simple_grid.py b/games/simple_grid.py
index f26ae429..d163d7de 100644
--- a/games/simple_grid.py
+++ b/games/simple_grid.py
@@ -23,6 +23,8 @@ def __init__(self):
         self.players = list(range(1))  # List of players. You should only edit the length
         self.stacked_observations = 0  # Number of previous observations and previous actions to add to the current observation
 
+        self.action_replace = True
+
         # Evaluate
         self.muzero_player = 0  # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second)
         self.opponent = None  # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class
diff --git a/games/tictactoe.py b/games/tictactoe.py
index f331a9ae..ff9a90bf 100644
--- a/games/tictactoe.py
+++ b/games/tictactoe.py
@@ -27,7 +27,8 @@ def __init__(self):
         self.muzero_player = 0  # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second)
         self.opponent = "expert"  # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class
-
+        # Whether an action may be selected more than once
+        self.action_replace = False
 
         ### Self-Play
         self.num_workers = 1  # Number of simultaneous threads/workers self-playing to feed the replay buffer
@@ -48,7 +49,8 @@ def __init__(self):
 
         ### Network
-        self.network = "resnet"  # "resnet" / "fullyconnected"
+        # self.network = "resnet"  # "resnet" / "fullyconnected"
+        self.network = "fullyconnected"
         self.support_size = 10  # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. Choose it so that support_size <= sqrt(max(abs(discounted reward)))
 
         # Residual Network
@@ -63,19 +65,27 @@ def __init__(self):
         self.resnet_fc_policy_layers = [8]  # Define the hidden layers in the policy head of the prediction network
 
         # Fully Connected Network
+        # self.encoding_size = 32
+        # self.fc_representation_layers = []  # Define the hidden layers in the representation network
+        # self.fc_dynamics_layers = [16]  # Define the hidden layers in the dynamics network
+        # self.fc_reward_layers = [16]  # Define the hidden layers in the reward network
+        # self.fc_value_layers = []  # Define the hidden layers in the value network
+        # self.fc_policy_layers = []  # Define the hidden layers in the policy network
+
         self.encoding_size = 32
-        self.fc_representation_layers = []  # Define the hidden layers in the representation network
+        self.fc_representation_layers = [16]  # Define the hidden layers in the representation network
         self.fc_dynamics_layers = [16]  # Define the hidden layers in the dynamics network
         self.fc_reward_layers = [16]  # Define the hidden layers in the reward network
-        self.fc_value_layers = []  # Define the hidden layers in the value network
-        self.fc_policy_layers = []  # Define the hidden layers in the policy network
-
+        self.fc_value_layers = [16]  # Define the hidden layers in the value network
+        self.fc_policy_layers = [16]  # Define the hidden layers in the policy network
 
         ### Training
         self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")  # Path to store the model weights and TensorBoard logs
         self.save_model = True  # Save the checkpoint in results_path as model.checkpoint
-        self.training_steps = 1000000  # Total number of training steps (ie weights update according to a batch)
+        # self.training_steps = 1000000  # Total number of training steps (ie weights update according to a batch)
+        # self.training_steps = 50000
+        self.training_steps = 500000
         self.batch_size = 64  # Number of parts of games to train on at each training step
         self.checkpoint_interval = 10  # Number of training steps before using the model for self-playing
         self.value_loss_weight = 0.25  # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
diff --git a/models.py b/models.py
index be847fef..d4b8bc2f 100644
--- a/models.py
+++ b/models.py
@@ -94,6 +94,7 @@ def __init__(
         super().__init__()
         self.action_space_size = action_space_size
         self.full_support_size = 2 * support_size + 1
+        # the support covers every integer in [-support_size, support_size], hence 2 * support_size + 1 entries (the +1 accounts for zero)
 
         self.representation_network = torch.nn.DataParallel(
             mlp(
@@ -107,6 +108,7 @@
             )
         )
 
+        # the dynamics network takes the encoded state concatenated with the one-hot action, hence encoding_size + action_space_size inputs
         self.dynamics_encoded_state_network = torch.nn.DataParallel(
             mlp(
                 encoding_size + self.action_space_size,
@@ -115,14 +117,14 @@
             )
         )
         self.dynamics_reward_network = torch.nn.DataParallel(
-            mlp(encoding_size, fc_reward_layers, self.full_support_size)
+            mlp(encoding_size, fc_reward_layers, self.full_support_size)  # output size is full_support_size because rewards are encoded over [-support_size, support_size]
         )
 
         self.prediction_policy_network = torch.nn.DataParallel(
-            mlp(encoding_size, fc_policy_layers, self.action_space_size)
+            mlp(encoding_size, fc_policy_layers, self.action_space_size)  # one logit per action
         )
         self.prediction_value_network = torch.nn.DataParallel(
-            mlp(encoding_size, fc_value_layers, self.full_support_size)
+            mlp(encoding_size, fc_value_layers, self.full_support_size)  # output size is full_support_size because values are encoded over [-support_size, support_size]
         )
 
     def prediction(self, encoded_state):
@@ -134,16 +136,19 @@ def representation(self, observation):
         encoded_state = self.representation_network(
             observation.view(observation.shape[0], -1)
         )
+
+        # Min-max normalization
         # Scale encoded state between [0, 1] (See appendix paper Training)
         min_encoded_state = encoded_state.min(1, keepdim=True)[0]
         max_encoded_state = encoded_state.max(1, keepdim=True)[0]
         scale_encoded_state = max_encoded_state - min_encoded_state
-        scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5
+        scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5  # avoid division by zero (NaN)
         encoded_state_normalized = (
             encoded_state - min_encoded_state
         ) / scale_encoded_state
         return encoded_state_normalized
 
+    # dynamics differs from representation only in that it concatenates the encoded state with the action before the forward pass; representation is not conditioned on an action
     def dynamics(self, encoded_state, action):
         # Stack encoded_state with a game specific one hot encoded action (See paper appendix Network Architecture)
         action_one_hot = (
@@ -151,18 +156,19 @@ def dynamics(self, encoded_state, action):
             torch.zeros((action.shape[0], self.action_space_size))
             .to(action.device)
             .float()
         )
-        action_one_hot.scatter_(1, action.long(), 1.0)
+        action_one_hot.scatter_(1, action.long(), 1.0)  # set the chosen action's index to 1
         x = torch.cat((encoded_state, action_one_hot), dim=1)
 
         next_encoded_state = self.dynamics_encoded_state_network(x)
 
         reward = self.dynamics_reward_network(next_encoded_state)
 
+        # Min-max normalization
         # Scale encoded state between [0, 1] (See paper appendix Training)
         min_next_encoded_state = next_encoded_state.min(1, keepdim=True)[0]
         max_next_encoded_state = next_encoded_state.max(1, keepdim=True)[0]
         scale_next_encoded_state = max_next_encoded_state - min_next_encoded_state
-        scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5
+        scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5  # avoid division by zero (NaN)
         next_encoded_state_normalized = (
             next_encoded_state - min_next_encoded_state
         ) / scale_next_encoded_state
@@ -172,7 +178,7 @@ def dynamics(self, encoded_state, action):
     def initial_inference(self, observation):
         encoded_state = self.representation(observation)
         policy_logits, value = self.prediction(encoded_state)
-        # reward equal to 0 for consistency
+        # reward equal to 0 for consistency (the initial step has no reward)
         reward = torch.log(
             (
                 torch.zeros(1, self.full_support_size)
                 .scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0)
                 .repeat(len(observation), 1)
                 .to(observation.device)
            )
        )
+        # reward looks like [[0, ..., 0, 1, 0, ..., 0], ...]: a 1 in the middle of the support (reward 0), repeated once per observation in the batch
 
         return (
             value,
@@ -605,8 +612,8 @@ def initial_inference(self, observation):
         reward = torch.log(
             (
                 torch.zeros(1, self.full_support_size)
-                .scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0)
-                .repeat(len(observation), 1)
+                .scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0)  # set index support_size (the middle of the support, i.e. reward 0) to 1
+                .repeat(len(observation), 1)  # repeat once per observation so the reward tensor matches the batch size
                 .to(observation.device)
             )
         )
@@ -637,29 +644,29 @@ def mlp(
     sizes = [input_size] + layer_sizes + [output_size]
     layers = []
    for i in range(len(sizes) - 1):
-        act = activation if i < len(sizes) - 2 else output_activation
+        act = activation if i < len(sizes) - 2 else output_activation  # output_activation for the last layer, activation everywhere else
         layers += [torch.nn.Linear(sizes[i], sizes[i + 1]), act()]
     return torch.nn.Sequential(*layers)
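The support_to_scalar / scalar_to_support changes in the rest of this hunk only add comments. As a reading aid, here is a small hypothetical round trip through the two helpers (not part of the patch): a scalar is first squashed by the invertible scaling h(x) = sign(x) * (sqrt(|x| + 1) - 1) + 0.001 * x, then split between the two neighbouring integers of the [-support_size, support_size] support.

    import torch

    # h(8) = sqrt(9) - 1 + 0.008 = 2.008, so the mass lands on support indices
    # support_size + 2 and support_size + 3 with weights 0.992 and 0.008.
    target = scalar_to_support(torch.tensor([[8.0]]), support_size=10)      # shape [1, 1, 21]
    value = support_to_scalar(torch.log(target[:, 0, :]), support_size=10)  # value is approximately 8.0 after inverting h
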
 
-def support_to_scalar(logits, support_size):
+def support_to_scalar(logits, support_size):  # logits is the categorical (log) representation of the value; support_size defines the target range
     """
     Transform a categorical representation to a scalar
     See paper appendix Network Architecture
     """
     # Decode to a scalar
-    probabilities = torch.softmax(logits, dim=1)
+    probabilities = torch.softmax(logits, dim=1)  # softmax over dim=1 so each row sums to 1; shape [batch, full_support_size]
     support = (
-        torch.tensor([x for x in range(-support_size, support_size + 1)])
+        torch.tensor([x for x in range(-support_size, support_size + 1)])  # the support is the integer range [-support_size, support_size]
         .expand(probabilities.shape)
         .float()
         .to(device=probabilities.device)
-    )
-    x = torch.sum(support * probabilities, dim=1, keepdim=True)
+    )  # shape [batch, full_support_size]
+    x = torch.sum(support * probabilities, dim=1, keepdim=True)  # expectation over the support; keepdim=True keeps shape [batch, 1]
 
     # Invert the scaling (defined in https://arxiv.org/abs/1805.11593)
-    x = torch.sign(x) * (
-        ((torch.sqrt(1 + 4 * 0.001 * (torch.abs(x) + 1 + 0.001)) - 1) / (2 * 0.001))
+    x = torch.sign(x) * (  # torch.sign extracts the sign of x (-1, 0 or +1)
+        ((torch.sqrt(1 + 4 * 0.001 * (torch.abs(x) + 1 + 0.001)) - 1) / (2 * 0.001))  # (sqrt(1 + 0.004 * (|x| + 1.001)) - 1) / 0.002, squared minus 1 on the next lines
         ** 2
         - 1
     )
@@ -675,9 +682,9 @@ def scalar_to_support(x, support_size):
     x = torch.sign(x) * (torch.sqrt(torch.abs(x) + 1) - 1) + 0.001 * x
 
     # Encode on a vector
-    x = torch.clamp(x, -support_size, support_size)
-    floor = x.floor()
-    prob = x - floor
+    x = torch.clamp(x, -support_size, support_size)  # clamp x into [-support_size, support_size]
+    floor = x.floor()  # round down (ceil would round up)
+    prob = x - floor  # keep the fractional part; it is the weight split between the two neighbouring support indices (support_to_scalar multiplies each index by its probability)
     logits = torch.zeros(x.shape[0], x.shape[1], 2 * support_size + 1).to(x.device)
     logits.scatter_(
         2, (floor + support_size).long().unsqueeze(-1), (1 - prob).unsqueeze(-1)
diff --git a/muzero.py b/muzero.py
index f7601c9b..3e075e96 100644
--- a/muzero.py
+++ b/muzero.py
@@ -43,6 +43,7 @@ def __init__(self, game_name, config=None, split_resources_in=1):
         # Load the game and the config from the module with the game name
         try:
             game_module = importlib.import_module("games." + game_name)
+            print("games." + game_name)
             self.Game = game_module.Game
             self.config = game_module.MuZeroConfig()
         except ModuleNotFoundError as err:
@@ -671,7 +672,10 @@ def load_model_menu(muzero, game_name):
             choice = input("Invalid input, enter a number listed above: ")
         choice = int(choice)
         if choice == 0:
+            start_time = time.time()
             muzero.train()
+            end_time = time.time()
+            print("Elapsed time: {:.2f} s".format(end_time - start_time))
         elif choice == 1:
             load_model_menu(muzero, game_name)
         elif choice == 2:
diff --git a/muzero_2net.py b/muzero_2net.py
new file mode 100644
index 00000000..fe9f6478
--- /dev/null
+++ b/muzero_2net.py
@@ -0,0 +1,723 @@
+import copy
+import importlib
+import json
+import math
+import pathlib
+import pickle
+import sys
+import time
+
+import nevergrad
+import numpy
+import ray
+import torch
+from torch.utils.tensorboard import SummaryWriter
+
+sys.path.append("")
+
+import diagnose_model
+# import simplifiedMuZero.net2.models_2net as models
+import models
+from simplifiedMuZero.net2.models2 import MuZeroNetwork_2net
+import simplifiedMuZero.net2.replay_buffer_2net as replay_buffer
+import simplifiedMuZero.net2.self_play_2net as self_play
+import shared_storage
+import simplifiedMuZero.net2.trainer_2net as trainer
+
+
+class MuZero_2Net:
+    """
+    Main class to manage MuZero.
+
+    Args:
+        game_name (str): Name of the game module, it should match the name of a .py file
+        in the "./games" directory.
+
+        config (dict, MuZeroConfig, optional): Override the default config of the game.
+ + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + + Example: + >>> muzero = MuZero_2Net("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # 重命名路径,以便区分不同的模型 + self.config.results_path /= "muzero_2net" + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
+ """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # Launch workers + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. 
+ """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + # model = models.SimplifiedMuZeroNetwork(config) + model = MuZeroNetwork_2net(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_2Net(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_2Net(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero_2Net): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero_2Net(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero_2Net(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero_2Net(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero_2Net(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/muzero_general.py b/muzero_general.py new file mode 100644 index 00000000..b3fb9411 --- /dev/null +++ b/muzero_general.py @@ -0,0 +1,416 @@ +import importlib +import ray +import pathlib + +import numpy +import torch +from torch.utils.tensorboard import SummaryWriter + +import math +import copy + +from simplifiedMuZero.without_rb.game_play import GamePlay +from simplifiedMuZero.without_rb.play_buffer import PlayBuffer +from 
simplifiedMuZero.without_rb.trainer_no_PV import Trainer +from muzero import load_model_menu, hyperparameter_search + +import models + + +class CPUActorWithClass: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config, model_cls): + model = model_cls(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + +class MuZeroGeneral: + def __init__(self, game_name, model_cls, config=None, split_resources_in=1, save_path_ex=None): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + if save_path_ex: + self.config.results_path /= save_path_ex + else: + self.config.results_path /= model_cls.__name__ + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + self.model_cls = model_cls + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # using random search instand of MCTS + self.config.temperature_threshold = 0 + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." 
+ ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActorWithClass() + cpu_weights = cpu_actor.get_initial_weights(self.config, self.model_cls) + self.checkpoint["weights"], self.summary = copy.deepcopy((cpu_weights)) + + + def logging_loop(self, writer, training_steps): + + # print( + # "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + # ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # # Save model representation + # writer.add_text( + # "Model summary", + # str(model).replace("\n", " \n\n") # self.summary, 换成其它的 + # ) + # Loop for updating the training performance + counter = training_steps + + try: + if True: + # while checkpoint["training_step"] < config.training_steps: + writer.add_scalar( + "1.Total_reward/1.Total_reward", + self.checkpoint["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + self.checkpoint["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + self.checkpoint["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + self.checkpoint["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + self.checkpoint["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + self.checkpoint["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", self.checkpoint["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", self.checkpoint["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + self.checkpoint["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + self.checkpoint["training_step"] / max(1, self.checkpoint["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", self.checkpoint["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", self.checkpoint["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", self.checkpoint["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", self.checkpoint["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", self.checkpoint["policy_loss"], counter) + print( + f'Last test reward: {self.checkpoint["total_reward"]:.2f}. Training step: {self.checkpoint["training_step"]}/{self.config.training_steps}. 
Played games: {self.checkpoint["num_played_games"]}. Loss: {self.checkpoint["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + # time.sleep(0.5) + except KeyboardInterrupt: + pass + + # if config.save_model: + # # Persist replay buffer to disk + # path = config.results_path / "replay_buffer.pkl" + # print(f"\n\nPersisting replay buffer games to disk at {path}") + # pickle.dump( + # { + # "buffer": buffer, + # "num_played_games": checkpoint["num_played_games"], + # "num_played_steps": checkpoint["num_played_steps"], + # "num_reanalysed_games": checkpoint["num_reanalysed_games"], + # }, + # open(path, "wb"), + # ) + + def update_gameplay_checkpoint(self, game_history): + self.checkpoint["episode_length"] = len(game_history.action_history) - 1 + self.checkpoint["total_reward"] = sum(game_history.reward_history) + self.checkpoint["mean_value"] = numpy.mean( [value for value in game_history.root_values if value]) + + if 1 < len(self.config.players): + self.checkpoint["muzero_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == self.config.muzero_player + ) + self.checkpoint["opponent_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != self.config.muzero_player + ) + + def save_checkpoint(self, path=None): #将模型存储在文件中 + if not path: + path = self.config.results_path / "model.checkpoint" + + torch.save(self.checkpoint, path) + + def train(self, log_in_tensorboard=True): + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + + trainer = Trainer(self.model_cls, self.checkpoint, self.config) + game_play = GamePlay(trainer.model, self.checkpoint, self.Game, self.config, self.config.seed) + buffer = {} + play_buffer = PlayBuffer(self.checkpoint, buffer, self.config) + + step = 1 # 间隔,即每次模拟后训练多少次 + max_steps = int(self.config.training_steps/step) + # max_steps = 2000 + + writer = SummaryWriter(self.config.results_path) + + for episode in range(max_steps): + game_id, game_history = game_play.play_game(game_play.config.visit_softmax_temperature_fn(0), game_play.config.temperature_threshold, False, "self",0) + + # print(game_id) + # print(game_history.action_history) + # print(game_history.reward_history) + # print(game_history.to_play_history) + # # print(game_history.observation_history) + # print("child visits", game_history.child_visits) + # print(game_history.root_values) # root value指的是root节点的UCB值 + + play_buffer.update_game_history(game_id, game_history) + self.update_gameplay_checkpoint( game_history) + + for i in range(step): + index_batch, batch = play_buffer.get_batch() + # print(batch[1]) + trainer.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = trainer.update_weights(batch) + + + training_step = episode * step + i + if training_step % self.config.checkpoint_interval == 0: + self.checkpoint["weights"] = copy.deepcopy(trainer.model.get_weights()) + self.checkpoint["optimizer_state"] =copy.deepcopy(models.dict_to_cpu(trainer.optimizer.state_dict()) ) + + if self.config.save_model: + self.save_checkpoint() + self.checkpoint["training_step"] = training_step + self.checkpoint["lr"] = trainer.optimizer.param_groups[0]["lr"] + self.checkpoint["total_loss"] = total_loss + self.checkpoint["value_loss"] = value_loss + self.checkpoint["reward_loss"] = reward_loss + self.checkpoint["policy_loss"] = policy_loss + + # print(training_step) + # if 
training_step % 500 == 0: + # if training_step % config.checkpoint_interval == 0: + # # print(training_step) + # logging_loop(config, checkpoint, writer) + + self.logging_loop(writer, training_step) + + + writer.close() + + game_play.close_game() + +# if __name__ == "__main__": +# # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") +# # start_time = time.time() +# # muzero.train() +# # end_time = time.time() +# # print("耗时: {:.2f}秒".format(end_time - start_time)) +# model_cls = models.MuZeroNetwork +# if len(sys.argv) == 2: +# # Train directly with: python muzero.py cartpole +# muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) +# muzero.train() +# elif len(sys.argv) == 3: +# # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' +# config = json.loads(sys.argv[2]) +# muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) +# muzero.train() +# else: +# print("\nWelcome to MuZero! Here's a list of games:") +# # Let user pick a game +# games = [ +# filename.stem +# for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) +# if filename.name != "abstract_game.py" +# ] +# for i in range(len(games)): +# print(f"{i}. {games[i]}") +# choice = input("Enter a number to choose the game: ") +# valid_inputs = [str(i) for i in range(len(games))] +# while choice not in valid_inputs: +# choice = input("Invalid input, enter a number listed above: ") +# +# # Initialize MuZero +# choice = int(choice) +# game_name = games[choice] +# muzero = MuZeroGeneral(game_name, model_cls=model_cls) +# +# while True: +# # Configure running options +# options = [ +# "Train", +# "Load pretrained model", +# "Diagnose model", +# "Render some self play games", +# "Play against MuZero", +# "Test the game manually", +# "Hyperparameter search", +# "Exit", +# ] +# print() +# for i in range(len(options)): +# print(f"{i}. 
{options[i]}") +# +# choice = input("Enter a number to choose an action: ") +# valid_inputs = [str(i) for i in range(len(options))] +# while choice not in valid_inputs: +# choice = input("Invalid input, enter a number listed above: ") +# choice = int(choice) +# if choice == 0: +# start_time = time.time() +# muzero.train() +# end_time = time.time() +# print("耗时: {:.2f}秒".format(end_time - start_time)) +# elif choice == 1: +# load_model_menu(muzero, game_name) +# elif choice == 2: +# muzero.diagnose_model(30) +# elif choice == 3: +# muzero.test(render=True, opponent="self", muzero_player=None) +# elif choice == 4: +# muzero.test(render=True, opponent="human", muzero_player=0) +# elif choice == 5: +# env = muzero.Game() +# env.reset() +# env.render() +# +# done = False +# while not done: +# action = env.human_to_action() +# observation, reward, done = env.step(action) +# print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") +# env.render() +# elif choice == 6: +# # Define here the parameters to tune +# # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html +# muzero.terminate_workers() +# del muzero +# budget = 20 +# parallel_experiments = 2 +# lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) +# discount = nevergrad.p.Log(lower=0.95, upper=0.9999) +# parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) +# best_hyperparameters = hyperparameter_search( +# game_name, parametrization, budget, parallel_experiments, 20 +# ) +# muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) +# else: +# break +# print("\nDone") diff --git a/muzero_no_pv.py b/muzero_no_pv.py new file mode 100644 index 00000000..e94789ed --- /dev/null +++ b/muzero_no_pv.py @@ -0,0 +1,716 @@ +import copy +import importlib +import json +import math +import pathlib +import pickle +import sys +import time + +import nevergrad +import numpy +import ray +import torch +from torch.utils.tensorboard import SummaryWriter + +import diagnose_model +import models +import replay_buffer +import self_play +import shared_storage +import simplifiedMuZero.no_pv.trainer_no_pv as trainer + + +class MuZero: + """ + Main class to manage MuZero. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + config (dict, MuZeroConfig, optional): Override the default config of the game. + + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + + Example: + >>> muzero = MuZero("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." 
+ ) + else: + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
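+
+ Note (illustrative, with hypothetical config values): the GPU budget is split
+ below across the training worker, the self-play workers, the optional test
+ worker and the optional reanalyse worker. For example, with self.num_gpus = 1,
+ train_on_gpu and selfplay_on_gpu both True, num_workers = 2,
+ log_in_tensorboard = True and use_last_model_value = False, each GPU-bound
+ worker is allocated 1 / (1 + 2*1 + 1*1 + 0) = 0.25 of a GPU.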
+ """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # Launch workers + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. 
+ """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + model = models.MuZeroNetwork(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
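+
+ Example (a sketch mirroring the interactive menu at the bottom of this file;
+ "cartpole" is the same placeholder game used in the class docstring):
+
+ >>> parametrization = nevergrad.p.Dict(
+ ... lr_init=nevergrad.p.Log(lower=0.0001, upper=0.1),
+ ... discount=nevergrad.p.Log(lower=0.95, upper=0.9999),
+ ... )
+ >>> best = hyperparameter_search("cartpole", parametrization, 20, 2, 20)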
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/muzero_rhea.py b/muzero_rhea.py new file mode 100644 index 00000000..07ceee18 --- /dev/null +++ b/muzero_rhea.py @@ -0,0 +1,719 @@ +import copy +import importlib +import json +import math +import pathlib +import pickle +import sys +import time + +import nevergrad +import numpy +import ray +import torch +from torch.utils.tensorboard import SummaryWriter + +import diagnose_model +import models +import replay_buffer +import 
simplifiedMuZero.search_policy.rhea_self_play as self_play +import shared_storage +import trainer + + +class MuZero_Rhea: + """ + Main class to manage MuZero. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + config (dict, MuZeroConfig, optional): Override the default config of the game. + + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + + Example: + >>> muzero = MuZero_Rhea("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # 重命名路径,以便区分不同的模型 + self.config.results_path /= self.__class__.__name__ + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
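+
+ Note: self-play in this variant is driven by SelfPlayRhea actors from
+ simplifiedMuZero.search_policy.rhea_self_play (imported above as self_play);
+ config.num_workers of them are spawned below, each seeded with
+ config.seed + its worker index.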
+ """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlayRhea.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # Launch workers + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. 
+ """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlayRhea.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlayRhea.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + model = models.MuZeroNetwork(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
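+
+ Note: Nevergrad's OnePlusOne optimizer minimizes its objective, which is why
+ each finished experiment reports the negated test reward:
+
+ >>> optimizer.tell(experiment.param, -result)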
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_Rhea(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_Rhea(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero_Rhea): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero_Rhea(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero_Rhea(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero_Rhea(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero_Rhea(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/muzero_uniform.py b/muzero_uniform.py new file mode 100644 index 00000000..53d4a0b9 --- /dev/null +++ b/muzero_uniform.py @@ -0,0 +1,721 @@ +import copy +import importlib +import json +import math +import pathlib +import pickle +import sys +import time + +import nevergrad +import numpy +import ray +import torch +from torch.utils.tensorboard import SummaryWriter + +import diagnose_model +import models +import replay_buffer +import 
self_play +# import simplifiedMuZero.search_policy.self_play_uniform_search as self_play +import shared_storage +import trainer + + +class MuZero_uniform: + """ + Main class to manage MuZero. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + config (dict, MuZeroConfig, optional): Override the default config of the game. + + split_resources_in (int, optional): Split the GPU usage when using concurent muzero instances. + + Example: + >>> muzero = MuZero_uniform("cartpole") + >>> muzero.train() + >>> muzero.test(render=True) + """ + + def __init__(self, game_name, config=None, split_resources_in=1): + # Load the game and the config from the module with the game name + try: + game_module = importlib.import_module("games." + game_name) + print("games." + game_name) + self.Game = game_module.Game + self.config = game_module.MuZeroConfig() + except ModuleNotFoundError as err: + print( + f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.' + ) + raise err + + # Overwrite the config + if config: + if type(config) is dict: + for param, value in config.items(): + if hasattr(self.config, param): + setattr(self.config, param, value) + else: + raise AttributeError( + f"{game_name} config has no attribute '{param}'. Check the config file for the complete list of parameters." + ) + else: + self.config = config + + # 重命名路径,以便区分不同的模型 + self.config.results_path /= "muzero_uniform" + self.config.temperature_threshold = 0 + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Manage GPUs + if self.config.max_num_gpus == 0 and ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + raise ValueError( + "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu." + ) + if ( + self.config.selfplay_on_gpu + or self.config.train_on_gpu + or self.config.reanalyse_on_gpu + ): + total_gpus = ( + self.config.max_num_gpus + if self.config.max_num_gpus is not None + else torch.cuda.device_count() + ) + else: + total_gpus = 0 + self.num_gpus = total_gpus / split_resources_in + if 1 < self.num_gpus: + self.num_gpus = math.floor(self.num_gpus) + + ray.init(num_gpus=total_gpus, ignore_reinit_error=True) + + # Checkpoint and replay buffer used to initialize workers + self.checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + self.replay_buffer = {} + + cpu_actor = CPUActor.remote() + cpu_weights = cpu_actor.get_initial_weights.remote(self.config) + self.checkpoint["weights"], self.summary = copy.deepcopy(ray.get(cpu_weights)) + + # Workers + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def train(self, log_in_tensorboard=True): + """ + Spawn ray workers and launch the training. + + Args: + log_in_tensorboard (bool): Start a testing worker and log its performance in TensorBoard. 
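+
+ Note: __init__ above redirects config.results_path into a "muzero_uniform"
+ subdirectory (keeping this variant's checkpoints and TensorBoard logs separate
+ from standard MuZero runs) and forces config.temperature_threshold to 0.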
+ """ + if log_in_tensorboard or self.config.save_model: + self.config.results_path.mkdir(parents=True, exist_ok=True) + + # Manage GPUs + if 0 < self.num_gpus: + num_gpus_per_worker = self.num_gpus / ( + self.config.train_on_gpu + + self.config.num_workers * self.config.selfplay_on_gpu + + log_in_tensorboard * self.config.selfplay_on_gpu + + self.config.use_last_model_value * self.config.reanalyse_on_gpu + ) + if 1 < num_gpus_per_worker: + num_gpus_per_worker = math.floor(num_gpus_per_worker) + else: + num_gpus_per_worker = 0 + + # Initialize workers + self.training_worker = trainer.Trainer.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.train_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.shared_storage_worker = shared_storage.SharedStorage.remote( + self.checkpoint, + self.config, + ) + self.shared_storage_worker.set_info.remote("terminate", False) + + self.replay_buffer_worker = replay_buffer.ReplayBuffer.remote( + self.checkpoint, self.replay_buffer, self.config + ) + + if self.config.use_last_model_value: + self.reanalyse_worker = replay_buffer.Reanalyse.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.reanalyse_on_gpu else 0, + ).remote(self.checkpoint, self.config) + + self.self_play_workers = [ + self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + seed, + ) + for seed in range(self.config.num_workers) + ] + + # Launch workers + [ + self_play_worker.continuous_self_play.remote( + self.shared_storage_worker, self.replay_buffer_worker + ) + for self_play_worker in self.self_play_workers + ] + self.training_worker.continuous_update_weights.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + if self.config.use_last_model_value: + self.reanalyse_worker.reanalyse.remote( + self.replay_buffer_worker, self.shared_storage_worker + ) + + if log_in_tensorboard: + self.logging_loop( + num_gpus_per_worker if self.config.selfplay_on_gpu else 0, + ) + + def logging_loop(self, num_gpus): + """ + Keep track of the training performance. 
+ """ + # Launch the test worker to get performance metrics + self.test_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote( + self.checkpoint, + self.Game, + self.config, + self.config.seed + self.config.num_workers, + ) + self.test_worker.continuous_self_play.remote( + self.shared_storage_worker, None, True + ) + + # Write everything in TensorBoard + writer = SummaryWriter(self.config.results_path) + + print( + "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in self.config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # Save model representation + writer.add_text( + "Model summary", + self.summary, + ) + # Loop for updating the training performance + counter = 0 + keys = [ + "total_reward", + "muzero_reward", + "opponent_reward", + "episode_length", + "mean_value", + "training_step", + "lr", + "total_loss", + "value_loss", + "reward_loss", + "policy_loss", + "num_played_games", + "num_played_steps", + "num_reanalysed_games", + ] + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + try: + while info["training_step"] < self.config.training_steps: + info = ray.get(self.shared_storage_worker.get_info.remote(keys)) + writer.add_scalar( + "1.Total_reward/1.Total_reward", + info["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + info["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + info["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + info["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + info["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + info["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", info["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", info["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + info["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + info["training_step"] / max(1, info["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", info["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", info["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", info["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", info["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", info["policy_loss"], counter) + print( + f'Last test reward: {info["total_reward"]:.2f}. Training step: {info["training_step"]}/{self.config.training_steps}. Played games: {info["num_played_games"]}. 
Loss: {info["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + time.sleep(0.5) + except KeyboardInterrupt: + pass + + self.terminate_workers() + + if self.config.save_model: + # Persist replay buffer to disk + path = self.config.results_path / "replay_buffer.pkl" + print(f"\n\nPersisting replay buffer games to disk at {path}") + pickle.dump( + { + "buffer": self.replay_buffer, + "num_played_games": self.checkpoint["num_played_games"], + "num_played_steps": self.checkpoint["num_played_steps"], + "num_reanalysed_games": self.checkpoint["num_reanalysed_games"], + }, + open(path, "wb"), + ) + + def terminate_workers(self): + """ + Softly terminate the running tasks and garbage collect the workers. + """ + if self.shared_storage_worker: + self.shared_storage_worker.set_info.remote("terminate", True) + self.checkpoint = ray.get( + self.shared_storage_worker.get_checkpoint.remote() + ) + if self.replay_buffer_worker: + self.replay_buffer = ray.get(self.replay_buffer_worker.get_buffer.remote()) + + print("\nShutting down workers...") + + self.self_play_workers = None + self.test_worker = None + self.training_worker = None + self.reanalyse_worker = None + self.replay_buffer_worker = None + self.shared_storage_worker = None + + def test( + self, render=True, opponent=None, muzero_player=None, num_tests=1, num_gpus=0 + ): + """ + Test the model in a dedicated thread. + + Args: + render (bool): To display or not the environment. Defaults to True. + + opponent (str): "self" for self-play, "human" for playing against MuZero and "random" + for a random agent, None will use the opponent in the config. Defaults to None. + + muzero_player (int): Player number of MuZero in case of multiplayer + games, None let MuZero play all players turn by turn, None will use muzero_player in + the config. Defaults to None. + + num_tests (int): Number of games to average. Defaults to 1. + + num_gpus (int): Number of GPUs to use, 0 forces to use the CPU. Defaults to 0. + """ + opponent = opponent if opponent else self.config.opponent + muzero_player = muzero_player if muzero_player else self.config.muzero_player + self_play_worker = self_play.SelfPlay.options( + num_cpus=0, + num_gpus=num_gpus, + ).remote(self.checkpoint, self.Game, self.config, numpy.random.randint(10000)) + results = [] + for i in range(num_tests): + print(f"Testing {i+1}/{num_tests}") + results.append( + ray.get( + self_play_worker.play_game.remote( + 0, + 0, + render, + opponent, + muzero_player, + ) + ) + ) + self_play_worker.close_game.remote() + + if len(self.config.players) == 1: + result = numpy.mean([sum(history.reward_history) for history in results]) + else: + result = numpy.mean( + [ + sum( + reward + for i, reward in enumerate(history.reward_history) + if history.to_play_history[i - 1] == muzero_player + ) + for history in results + ] + ) + return result + + def load_model(self, checkpoint_path=None, replay_buffer_path=None): + """ + Load a model and/or a saved replay buffer. + + Args: + checkpoint_path (str): Path to model.checkpoint or model.weights. 
+ + replay_buffer_path (str): Path to replay_buffer.pkl + """ + # Load checkpoint + if checkpoint_path: + checkpoint_path = pathlib.Path(checkpoint_path) + self.checkpoint = torch.load(checkpoint_path) + print(f"\nUsing checkpoint from {checkpoint_path}") + + # Load replay buffer + if replay_buffer_path: + replay_buffer_path = pathlib.Path(replay_buffer_path) + with open(replay_buffer_path, "rb") as f: + replay_buffer_infos = pickle.load(f) + self.replay_buffer = replay_buffer_infos["buffer"] + self.checkpoint["num_played_steps"] = replay_buffer_infos[ + "num_played_steps" + ] + self.checkpoint["num_played_games"] = replay_buffer_infos[ + "num_played_games" + ] + self.checkpoint["num_reanalysed_games"] = replay_buffer_infos[ + "num_reanalysed_games" + ] + + print(f"\nInitializing replay buffer with {replay_buffer_path}") + else: + print(f"Using empty buffer.") + self.replay_buffer = {} + self.checkpoint["training_step"] = 0 + self.checkpoint["num_played_steps"] = 0 + self.checkpoint["num_played_games"] = 0 + self.checkpoint["num_reanalysed_games"] = 0 + + def diagnose_model(self, horizon): + """ + Play a game only with the learned model then play the same trajectory in the real + environment and display information. + + Args: + horizon (int): Number of timesteps for which we collect information. + """ + game = self.Game(self.config.seed) + obs = game.reset() + dm = diagnose_model.DiagnoseModel(self.checkpoint, self.config) + dm.compare_virtual_with_real_trajectories(obs, game, horizon) + input("Press enter to close all plots") + dm.close_all() + + +@ray.remote(num_cpus=0, num_gpus=0) +class CPUActor: + # Trick to force DataParallel to stay on CPU to get weights on CPU even if there is a GPU + def __init__(self): + pass + + def get_initial_weights(self, config): + model = models.MuZeroNetwork(config) + weigths = model.get_weights() + summary = str(model).replace("\n", " \n\n") + return weigths, summary + + +def hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, num_tests +): + """ + Search for hyperparameters by launching parallel experiments. + + Args: + game_name (str): Name of the game module, it should match the name of a .py file + in the "./games" directory. + + parametrization : Nevergrad parametrization, please refer to nevergrad documentation. + + budget (int): Number of experiments to launch in total. + + parallel_experiments (int): Number of experiments to launch in parallel. + + num_tests (int): Number of games to average for evaluating an experiment. 
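+
+ Note: the checkpoint written to results_path/model.checkpoint at the end of
+ the search belongs to the single best-scoring experiment, while
+ best_parameters.txt stores the optimizer's recommendation; as the in-code
+ comment points out, the two do not necessarily coincide.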
+ """ + optimizer = nevergrad.optimizers.OnePlusOne( + parametrization=parametrization, budget=budget + ) + + running_experiments = [] + best_training = None + try: + # Launch initial experiments + for i in range(parallel_experiments): + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_uniform(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments.append(muzero) + budget -= 1 + + while 0 < budget or any(running_experiments): + for i, experiment in enumerate(running_experiments): + if experiment and experiment.config.training_steps <= ray.get( + experiment.shared_storage_worker.get_info.remote("training_step") + ): + experiment.terminate_workers() + result = experiment.test(False, num_tests=num_tests) + if not best_training or best_training["result"] < result: + best_training = { + "result": result, + "config": experiment.config, + "checkpoint": experiment.checkpoint, + } + print(f"Parameters: {experiment.param.value}") + print(f"Result: {result}") + optimizer.tell(experiment.param, -result) + + if 0 < budget: + param = optimizer.ask() + print(f"Launching new experiment: {param.value}") + muzero = MuZero_uniform(game_name, param.value, parallel_experiments) + muzero.param = param + muzero.train(False) + running_experiments[i] = muzero + budget -= 1 + else: + running_experiments[i] = None + + except KeyboardInterrupt: + for experiment in running_experiments: + if isinstance(experiment, MuZero_uniform): + experiment.terminate_workers() + + recommendation = optimizer.provide_recommendation() + print("Best hyperparameters:") + print(recommendation.value) + if best_training: + # Save best training weights (but it's not the recommended weights) + best_training["config"].results_path.mkdir(parents=True, exist_ok=True) + torch.save( + best_training["checkpoint"], + best_training["config"].results_path / "model.checkpoint", + ) + # Save the recommended hyperparameters + text_file = open( + best_training["config"].results_path / "best_parameters.txt", + "w", + ) + text_file.write(str(recommendation.value)) + text_file.close() + return recommendation.value + + +def load_model_menu(muzero, game_name): + # Configure running options + options = ["Specify paths manually"] + sorted( + (pathlib.Path("results") / game_name).glob("*/") + ) + options.reverse() + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose a model to load: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + + if choice == (len(options) - 1): + # manual path option + checkpoint_path = input( + "Enter a path to the model.checkpoint, or ENTER if none: " + ) + while checkpoint_path and not pathlib.Path(checkpoint_path).is_file(): + checkpoint_path = input("Invalid checkpoint path. Try again: ") + replay_buffer_path = input( + "Enter a path to the replay_buffer.pkl, or ENTER if none: " + ) + while replay_buffer_path and not pathlib.Path(replay_buffer_path).is_file(): + replay_buffer_path = input("Invalid replay buffer path. 
Try again: ") + else: + checkpoint_path = options[choice] / "model.checkpoint" + replay_buffer_path = options[choice] / "replay_buffer.pkl" + + muzero.load_model( + checkpoint_path=checkpoint_path, + replay_buffer_path=replay_buffer_path, + ) + + +if __name__ == "__main__": + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZero_uniform(sys.argv[1]) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZero_uniform(sys.argv[1], config) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZero_uniform(game_name) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZero_uniform(game_name, best_hyperparameters) + else: + break + print("\nDone") + + ray.shutdown() diff --git a/muzero_without_replay_buffer.py b/muzero_without_replay_buffer.py new file mode 100644 index 00000000..4b87fc7b --- /dev/null +++ b/muzero_without_replay_buffer.py @@ -0,0 +1,108 @@ +import models +from muzero_general import MuZeroGeneral +from muzero import load_model_menu, hyperparameter_search + +import json +import sys +import pathlib +import time +import nevergrad + +if __name__ == "__main__": + # muzero = 
MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") + # start_time = time.time() + # muzero.train() + # end_time = time.time() + # print("耗时: {:.2f}秒".format(end_time - start_time)) + model_cls = models.MuZeroNetwork + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. {games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZeroGeneral(game_name, model_cls=model_cls) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) + else: + break + print("\nDone") \ No newline at end of file diff --git a/muzero_without_replay_buffer_tictactoe.py b/muzero_without_replay_buffer_tictactoe.py new file mode 100644 index 00000000..f64413ab --- /dev/null +++ b/muzero_without_replay_buffer_tictactoe.py @@ -0,0 +1,242 @@ +from self_play import MCTS, GameHistory +from games.tictactoe import MuZeroConfig, Game +# from games.tictactoe import MuZeroConfig, Game +import models + +import 
numpy +import torch +from torch.utils.tensorboard import SummaryWriter +import pickle + +import math +import time +import copy + +from simplifiedMuZero.without_rb.game_play import GamePlay +from simplifiedMuZero.without_rb.play_buffer import PlayBuffer +from simplifiedMuZero.without_rb.trainer import Trainer + +def logging_loop(config, checkpoint, writer, training_steps): + # writer = SummaryWriter(config.results_path) + + # print( + # "\nTraining...\nRun tensorboard --logdir ./results and go to http://localhost:6006/ to see in real time the training performance.\n" + # ) + + # Save hyperparameters to TensorBoard + hp_table = [ + f"| {key} | {value} |" for key, value in config.__dict__.items() + ] + writer.add_text( + "Hyperparameters", + "| Parameter | Value |\n|-------|-------|\n" + "\n".join(hp_table), + ) + # # Save model representation + # writer.add_text( + # "Model summary", + # str(model).replace("\n", " \n\n") # self.summary, 换成其它的 + # ) + # Loop for updating the training performance + counter = training_steps + + try: + if True: + # while checkpoint["training_step"] < config.training_steps: + writer.add_scalar( + "1.Total_reward/1.Total_reward", + checkpoint["total_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/2.Mean_value", + checkpoint["mean_value"], + counter, + ) + writer.add_scalar( + "1.Total_reward/3.Episode_length", + checkpoint["episode_length"], + counter, + ) + writer.add_scalar( + "1.Total_reward/4.MuZero_reward", + checkpoint["muzero_reward"], + counter, + ) + writer.add_scalar( + "1.Total_reward/5.Opponent_reward", + checkpoint["opponent_reward"], + counter, + ) + writer.add_scalar( + "2.Workers/1.Self_played_games", + checkpoint["num_played_games"], + counter, + ) + writer.add_scalar( + "2.Workers/2.Training_steps", checkpoint["training_step"], counter + ) + writer.add_scalar( + "2.Workers/3.Self_played_steps", checkpoint["num_played_steps"], counter + ) + writer.add_scalar( + "2.Workers/4.Reanalysed_games", + checkpoint["num_reanalysed_games"], + counter, + ) + writer.add_scalar( + "2.Workers/5.Training_steps_per_self_played_step_ratio", + checkpoint["training_step"] / max(1, checkpoint["num_played_steps"]), + counter, + ) + writer.add_scalar("2.Workers/6.Learning_rate", checkpoint["lr"], counter) + writer.add_scalar( + "3.Loss/1.Total_weighted_loss", checkpoint["total_loss"], counter + ) + writer.add_scalar("3.Loss/Value_loss", checkpoint["value_loss"], counter) + writer.add_scalar("3.Loss/Reward_loss", checkpoint["reward_loss"], counter) + writer.add_scalar("3.Loss/Policy_loss", checkpoint["policy_loss"], counter) + print( + f'Last test reward: {checkpoint["total_reward"]:.2f}. Training step: {checkpoint["training_step"]}/{config.training_steps}. Played games: {checkpoint["num_played_games"]}. 
Loss: {checkpoint["total_loss"]:.2f}', + end="\r", + ) + counter += 1 + # time.sleep(0.5) + except KeyboardInterrupt: + pass + + # if config.save_model: + # # Persist replay buffer to disk + # path = config.results_path / "replay_buffer.pkl" + # print(f"\n\nPersisting replay buffer games to disk at {path}") + # pickle.dump( + # { + # "buffer": buffer, + # "num_played_games": checkpoint["num_played_games"], + # "num_played_steps": checkpoint["num_played_steps"], + # "num_reanalysed_games": checkpoint["num_reanalysed_games"], + # }, + # open(path, "wb"), + # ) + +def update_gameplay_checkpoint(config, checkpoint, game_history): + checkpoint["episode_length"] = len(game_history.action_history) - 1 + checkpoint["total_reward"] = sum(game_history.reward_history) + checkpoint["mean_value"] = numpy.mean( [value for value in game_history.root_values if value]) + + if 1 < len(config.players): + checkpoint["muzero_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == config.muzero_player + ) + checkpoint["opponent_reward"] = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != config.muzero_player + ) + +def save_checkpoint(config, checkpoint, path=None): #将模型存储在文件中 + if not path: + path = config.results_path / "model.checkpoint" + + torch.save(checkpoint, path) + +def train(log_in_tensorboard=True): + config = MuZeroConfig() + config.results_path /= "muzero_without_rb" + + if log_in_tensorboard or config.save_model: + config.results_path.mkdir(parents=True, exist_ok=True) + + checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + + trainer = Trainer(models.MuZeroNetwork, checkpoint, config) + selfplay = GamePlay(trainer.model, checkpoint, Game, config, config.seed) + buffer = {} + play_buffer = PlayBuffer(checkpoint, buffer, config) + + step = 1 # 间隔,即每次模拟后训练多少次 + max_steps = int(config.training_steps/step) + # max_steps = 2000 + + writer = SummaryWriter(config.results_path) + + for episode in range(max_steps): + game_id, game_history = selfplay.play_game(selfplay.config.visit_softmax_temperature_fn(0), selfplay.config.temperature_threshold, False, "self",0) + + # print(game_id) + # print(game_history.action_history) + # print(game_history.reward_history) + # print(game_history.to_play_history) + # # print(game_history.observation_history) + # print("child visits", game_history.child_visits) + # print(game_history.root_values) # root value指的是root节点的UCB值 + + play_buffer.update_game_history(game_id, game_history) + update_gameplay_checkpoint(config, checkpoint, game_history) + + for i in range(step): + index_batch, batch = play_buffer.get_batch() + # print(batch[1]) + trainer.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = trainer.update_weights(batch) + + + training_step = episode * step + i + if training_step % config.checkpoint_interval == 0: + checkpoint["weights"] = copy.deepcopy(trainer.model.get_weights()) + checkpoint["optimizer_state"] =copy.deepcopy(models.dict_to_cpu(trainer.optimizer.state_dict()) ) + + if config.save_model: + save_checkpoint(config, checkpoint) + 
checkpoint["training_step"] = training_step + checkpoint["lr"] = trainer.optimizer.param_groups[0]["lr"] + checkpoint["total_loss"] = total_loss + checkpoint["value_loss"] = value_loss + checkpoint["reward_loss"] = reward_loss + checkpoint["policy_loss"] = policy_loss + + # print(training_step) + # if training_step % 500 == 0: + # if training_step % config.checkpoint_interval == 0: + # # print(training_step) + # logging_loop(config, checkpoint, writer) + + logging_loop(config, checkpoint, writer, training_step) + + + writer.close() + + selfplay.close_game() + +if __name__ == "__main__": + start_time = time.time() + train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) \ No newline at end of file diff --git a/replay_buffer.py b/replay_buffer.py index 81bc813e..cc1115db 100644 --- a/replay_buffer.py +++ b/replay_buffer.py @@ -16,7 +16,7 @@ class ReplayBuffer: def __init__(self, initial_checkpoint, initial_buffer, config): self.config = config - self.buffer = copy.deepcopy(initial_buffer) + self.buffer = copy.deepcopy(initial_buffer) # buffer是一个字典,key是game id,value是game_history self.num_played_games = initial_checkpoint["num_played_games"] self.num_played_steps = initial_checkpoint["num_played_steps"] self.total_samples = sum( @@ -79,11 +79,14 @@ def get_batch(self): ) = ([], [], [], [], [], [], []) weight_batch = [] if self.config.PER else None + # 从buffer里抽取n鸽样本,有probs的话安装probs的概率抽取,没有的话按照uniform抽取 for game_id, game_history, game_prob in self.sample_n_games( self.config.batch_size ): + # 每个game_history都是一个游戏运行的序列,使用sample_position从这些序列里随机抽取一个位置 game_pos, pos_prob = self.sample_position(game_history) + # 计算从该位置开始的值,rewards等数据 values, rewards, policies, actions = self.make_target( game_history, game_pos ) @@ -165,11 +168,11 @@ def sample_n_games(self, n_games, force_uniform=False): game_id_list.append(game_id) game_probs.append(game_history.game_priority) game_probs = numpy.array(game_probs, dtype="float32") - game_probs /= numpy.sum(game_probs) + game_probs /= numpy.sum(game_probs) # 每一个都除以game_probs的总和,可以看成是归一化 game_prob_dict = dict( [(game_id, prob) for game_id, prob in zip(game_id_list, game_probs)] ) - selected_games = numpy.random.choice(game_id_list, n_games, p=game_probs) + selected_games = numpy.random.choice(game_id_list, n_games, p=game_probs) # 抽取n个样本, 抽取的概率是根据game_probs确定的 else: selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) game_prob_dict = {} @@ -177,10 +180,11 @@ def sample_n_games(self, n_games, force_uniform=False): (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) for game_id in selected_games ] - return ret + return ret # ret格式为[game_id, game_history, game_prob] def sample_position(self, game_history, force_uniform=False): """ + 统一或根据某些优先级从游戏中采样位置。 Sample position from game either uniformly or according to some priority. See paper appendix Training. """ @@ -230,6 +234,8 @@ def update_priorities(self, priorities, index_info): def compute_target_value(self, game_history, index): # The value target is the discounted root value of the search tree td_steps into the # future, plus the discounted sum of all rewards until then. 
+ # 价值目标是未来搜索树 td_steps 的折扣根值,加上到那时为止的所有奖励的折扣总和。 + # 计算公式 ∑r*γ^n bootstrap_index = index + self.config.td_steps if bootstrap_index < len(game_history.root_values): root_values = ( @@ -237,6 +243,8 @@ def compute_target_value(self, game_history, index): if game_history.reanalysed_predicted_root_values is None else game_history.reanalysed_predicted_root_values ) + + # 检查当前的id和目标id是否一致,如果不一致则取负 last_step_value = ( root_values[bootstrap_index] if game_history.to_play_history[bootstrap_index] @@ -244,13 +252,15 @@ def compute_target_value(self, game_history, index): else -root_values[bootstrap_index] ) + # 计算公式 r*γ^n value = last_step_value * self.config.discount**self.config.td_steps - else: + else: # 因为终点的长度超过了数据,因此设为0 value = 0 for i, reward in enumerate( - game_history.reward_history[index + 1 : bootstrap_index + 1] + game_history.reward_history[index + 1 : bootstrap_index + 1] # 获取reward,从index+1到最大(如果长度不够则只会取到最后) ): + # 根据对手决定正负号,只会累计到value上 # The value is oriented from the perspective of the current player value += ( reward @@ -259,12 +269,13 @@ def compute_target_value(self, game_history, index): else -reward ) * self.config.discount**i - return value + return value # 返回value def make_target(self, game_history, state_index): """ Generate targets for every unroll steps. """ + # target policies 是 策略选择的概率序列,如[[0.4,0.6], [0.5,0.5],...] target_values, target_rewards, target_policies, actions = [], [], [], [] for current_index in range( state_index, state_index + self.config.num_unroll_steps + 1 @@ -280,6 +291,7 @@ def make_target(self, game_history, state_index): target_values.append(0) target_rewards.append(game_history.reward_history[current_index]) # Uniform policy + # 因为是游戏结束的状态,因此选择各个策略的概率是平均分布的 target_policies.append( [ 1 / len(game_history.child_visits[0]) @@ -287,8 +299,9 @@ def make_target(self, game_history, state_index): ] ) actions.append(game_history.action_history[current_index]) - else: + else: # 如果current index 大于 game_history的长度 # States past the end of games are treated as absorbing states + # 游戏结束后的状态被视为吸收状态,因此都为0 target_values.append(0) target_rewards.append(0) # Uniform policy diff --git a/requirements.lock b/requirements.lock index 742f745f..4d7ba441 100644 --- a/requirements.lock +++ b/requirements.lock @@ -6,7 +6,7 @@ # absl-py==1.0.0 # via tensorboard -aiohttp==3.8.1 +aiohttp==3.7.4 # via # aiohttp-cors # ray @@ -16,7 +16,7 @@ aioredis==1.3.1 # via ray aiosignal==1.2.0 # via aiohttp -async-timeout==4.0.1 +async-timeout==3.0.1 # via # aiohttp # aioredis @@ -171,7 +171,7 @@ pytz==2021.3 # via pandas pyyaml==6.0 # via ray -ray==1.5.2 +ray==1.2 # via -r requirements.in redis==4.0.1 # via ray diff --git a/self_play.py b/self_play.py index d90fe5db..c62802f7 100644 --- a/self_play.py +++ b/self_play.py @@ -33,8 +33,8 @@ def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): shared_storage.get_info.remote("training_step") ) < self.config.training_steps and not ray.get( shared_storage.get_info.remote("terminate") - ): - self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) + ): # 如果当前的训练步数低于训练总步数,并且没有终止的话,继续进行训练 + self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) # 从shared_storage中获取当前的参数 if not test_mode: game_history = self.play_game( @@ -107,6 +107,16 @@ def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): self.close_game() + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 
检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory def play_game( self, temperature, temperature_threshold, render, opponent, muzero_player ): @@ -116,9 +126,9 @@ def play_game( game_history = GameHistory() observation = self.game.reset() game_history.action_history.append(0) - game_history.observation_history.append(observation) + game_history.observation_history.append(observation) # 添加reset之后的observation game_history.reward_history.append(0) - game_history.to_play_history.append(self.game.to_play()) + game_history.to_play_history.append(self.game.to_play()) # to_play_history是用来存放玩家id的 done = False @@ -128,7 +138,7 @@ def play_game( with torch.no_grad(): while ( not done and len(game_history.action_history) <= self.config.max_moves - ): + ): # 游戏没有结束且运行步数小于最大移动步长 assert ( len(numpy.array(observation).shape) == 3 ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" @@ -138,14 +148,17 @@ def play_game( stacked_observations = game_history.get_stacked_observations( -1, self.config.stacked_observations, len(self.config.action_space) ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + # 一下的if-else部分主要是为了选择一个动作 # Choose the action if opponent == "self" or muzero_player == self.game.to_play(): root, mcts_info = MCTS(self.config).run( self.model, stacked_observations, self.game.legal_actions(), - self.game.to_play(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 True, ) action = self.select_action( @@ -154,7 +167,7 @@ def play_game( if not temperature_threshold or len(game_history.action_history) < temperature_threshold else 0, - ) + ) # 根据temperature选择动作 if render: print(f'Tree depth: {mcts_info["max_tree_depth"]}') @@ -162,11 +175,11 @@ def play_game( f"Root value for player {self.game.to_play()}: {root.value():.2f}" ) else: - action, root = self.select_opponent_action( + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 opponent, stacked_observations ) - observation, reward, done = self.game.step(action) + observation, reward, done = self.game.step(action) # 运行游戏 if render: print(f"Played action: {self.game.action_to_string(action)}") @@ -176,7 +189,7 @@ def play_game( # Next batch game_history.action_history.append(action) - game_history.observation_history.append(observation) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 game_history.reward_history.append(reward) game_history.to_play_history.append(self.game.to_play()) @@ -219,7 +232,12 @@ def select_opponent_action(self, opponent, stacked_observations): 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' ) - @staticmethod + # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 + # 公式为 c^(1/t)。可以看到: + # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 + # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 def select_action(node, temperature): """ Select action according to the visit count distribution and the temperature. @@ -257,6 +275,25 @@ class MCTS: def __init__(self, config): self.config = config + # run函数运行流程: + # 1. 获取root节点 + # (1)如果由指定节点这将root赋值为该节点; + # (2)如果没有,则 + # i. 
创建新的节点Node(0) + # ii. 使用initial_inference函数通过observation获取相应的reward,hidden state,legal actions等数据 + # iii. 将ii中获取的数据赋值到创建的root节点中取 + # PS. 可以看到,在(1)的情况下不需要调用initial_inference函数 + # 2. 检查是否需要添加探索噪音 + # 3. 开始循环模拟游戏,模拟的次数由num simulation决定 + # (1) 将初始节点node设置为root,并将节点node加入search tree中 + # (2) 检查该节点是否已经扩展,如果已经扩展,则通过ucb值来选择子节点expand. 并将node 设置为选中的节点。并将节点node加入search tree中 + # (3) 重复2,直到找到expanded为false的node为止 + # (4) 选择search_tree[-2]为parent(因为最后一个是node) + # (5) 运行recurrent_inference函数,获得reward,hidden state,legal actions等数据 + # (6) 扩展node,即为node创建子节点,使node展开。 + # (7) 反向传播算法,对路径上的所有访问次数+1,value值加reward + # PS: 可以看到,通过不停的模拟,节点被一层层的扩展(每次模拟扩展一个节点)。 + # 4. 返回扩展过后的节点树root,以便之后的程序根据它选择动作action def run( self, model, @@ -272,7 +309,7 @@ def run( We then run a Monte Carlo Tree Search using only action sequences and the model learned by the network. """ - if override_root_with: + if override_root_with: #检查有没有提供Node,如果有,则指定;如果没有,则自己创建一个 root = override_root_with root_predicted_value = None else: @@ -282,7 +319,7 @@ def run( .float() .unsqueeze(0) .to(next(model.parameters()).device) - ) + ) # observation转tensor,外面包一层形成一个batch。 Observation的长度由参数stacked_observation配置,主要存储之前的previous。不要之前privious的配置为0 ( root_predicted_value, reward, @@ -316,16 +353,17 @@ def run( min_max_stats = MinMaxStats() max_tree_depth = 0 - for _ in range(self.config.num_simulations): + for _ in range(self.config.num_simulations): # 开始模拟游戏 virtual_to_play = to_play node = root search_path = [node] current_tree_depth = 0 - while node.expanded(): + # expanded根据node的子节点个数判断是否已经扩展了,如果没有子节点,说明没被扩展 + while node.expanded(): #这个循环一直在搜索没有expand的子节点。如果子节点已经expand了,则通过select_child选择下一个 current_tree_depth += 1 - action, node = self.select_child(node, min_max_stats) - search_path.append(node) + action, node = self.select_child(node, min_max_stats) #选取ucb最大的一个action,如果有多个action得分相同,随机选取一个 + search_path.append(node) #把节点添加到搜索队列 # Players play turn by turn if virtual_to_play + 1 < len(self.config.players): @@ -333,15 +371,18 @@ def run( else: virtual_to_play = self.config.players[0] + # 在搜索树内部,我们使用动态函数来获取给定动作的下一个hidden_state和previous hidden state # Inside the search tree we use the dynamics function to obtain the next hidden # state given an action and the previous hidden state - parent = search_path[-2] + parent = search_path[-2] # 选择倒数第二个节点,因为当前的node是-1,则-2是它的parent value, reward, policy_logits, hidden_state = model.recurrent_inference( parent.hidden_state, torch.tensor([[action]]).to(parent.hidden_state.device), ) value = models.support_to_scalar(value, self.config.support_size).item() reward = models.support_to_scalar(reward, self.config.support_size).item() + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 node.expand( self.config.action_space, virtual_to_play, @@ -360,6 +401,9 @@ def run( } return root, extra_info + # MCTS 的select child和之前SelfPlay的select action逻辑是不一样的 + # 1. select child是根据UCB选取的,select action是根据各个动作的visit count和temperature选取的 + # 2. select child 选择的对象是Node,Node是由当前的state执行action后生成的新Node形成的。select action单纯的是选action def select_child(self, node, min_max_stats): """ Select the child with the highest UCB score. 
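For readers following the new annotations, here is a minimal, self-contained sketch of the pUCT score that `ucb_score` (annotated in the hunks below) computes. The helper name `puct_score` is illustrative only; the default constants mirror the values commonly used for `pb_c_base` and `pb_c_init` in MuZero configurations, and the normalised Q value is assumed to be produced by a MinMaxStats-style normaliser as in the surrounding code:

import math

def puct_score(parent_visits, child_visits, child_prior, normalized_q,
               pb_c_base=19652, pb_c_init=1.25):
    # Exploration weight grows slowly with the parent's visits and shrinks with the child's.
    pb_c = math.log((parent_visits + pb_c_base + 1) / pb_c_base) + pb_c_init
    pb_c *= math.sqrt(parent_visits) / (child_visits + 1)
    prior_score = pb_c * child_prior  # prior-weighted exploration bonus
    value_score = normalized_q if child_visits > 0 else 0  # normalised Q, 0 for unvisited children
    return prior_score + value_score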
@@ -368,7 +412,7 @@ def select_child(self, node, min_max_stats): self.ucb_score(node, child, min_max_stats) for action, child in node.children.items() ) - action = numpy.random.choice( + action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) [ action for action, child in node.children.items() @@ -377,33 +421,37 @@ def select_child(self, node, min_max_stats): ) return action, node.children[action] - def ucb_score(self, parent, child, min_max_stats): + def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 """ The score for a node is based on its value, plus an exploration bonus based on the prior. """ pb_c = ( math.log( - (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base + (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 ) + self.config.pb_c_init ) pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) - prior_score = pb_c * child.prior + prior_score = pb_c * child.prior # prior 之前的p_value + # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) + # prior_score = pbc * prior if child.visit_count > 0: # Mean value Q - value_score = min_max_stats.normalize( + value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 child.reward - + self.config.discount - * (child.value() if len(self.config.players) == 1 else -child.value()) + + self.config.discount # 衰减系数, 之后乘以子节点的值 + * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 ) else: value_score = 0 - return prior_score + value_score + return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 - def backpropagate(self, search_path, value, to_play, min_max_stats): + # 反向传播算法 + # 对路径上的所有访问次数+1,value值加reward + def backpropagate(self, search_path, value, to_play, min_max_stats): # MCTS反向传播,visit count加1 """ At the end of a simulation, we propagate the evaluation all the way up the tree to the root. @@ -432,7 +480,7 @@ def backpropagate(self, search_path, value, to_play, min_max_stats): class Node: def __init__(self, prior): - self.visit_count = 0 + self.visit_count = 0 #visit count默认是0,只有经过反向传播之后才能变成增加 self.to_play = -1 self.prior = prior self.value_sum = 0 @@ -449,6 +497,8 @@ def value(self): return self.value_sum / self.visit_count def expand(self, actions, to_play, reward, policy_logits, hidden_state): + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 """ We expand a node using the value, reward and policy prediction obtained from the neural network. @@ -460,7 +510,7 @@ def expand(self, actions, to_play, reward, policy_logits, hidden_state): policy_values = torch.softmax( torch.tensor([policy_logits[0][a] for a in actions]), dim=0 ).tolist() - policy = {a: policy_values[i] for i, a in enumerate(actions)} + policy = {a: policy_values[i] for i, a in enumerate(actions)} # 列出所有的合法动作及对于的value值 for action, p in policy.items(): self.children[action] = Node(p) @@ -512,7 +562,7 @@ def store_search_statistics(self, root, action_space): def get_stacked_observations( self, index, num_stacked_observations, action_space_size - ): + ): #根据索引index获取observation序列 """ Generate a new observation with the observation at the index position and num_stacked_observations past observations and actions stacked. 
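To complement the comments added to `get_stacked_observations` in the next hunk: the stacked tensor concatenates the current frame with, for each of the `stacked_observations` previous steps, that past frame plus one broadcast action plane. A small hypothetical helper (not part of the repository) showing only the resulting channel count:

def stacked_obs_channels(obs_channels, num_stacked):
    # current frame + (past frame + 1 action plane) for every stacked step
    return obs_channels * (num_stacked + 1) + num_stacked

# e.g. tic-tac-toe observations of shape (3, 3, 3) with stacked_observations = 0 remain (3, 3, 3)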
@@ -520,12 +570,12 @@ def get_stacked_observations( # Convert to positive index index = index % len(self.observation_history) - stacked_observations = self.observation_history[index].copy() + stacked_observations = self.observation_history[index].copy() #分为两部分,一部分是当前(current)观察值,一部分是之前的(previous)观察值 for past_observation_index in reversed( range(index - num_stacked_observations, index) ): if 0 <= past_observation_index: - previous_observation = numpy.concatenate( + previous_observation = numpy.concatenate( # np.concatenate将第一个参数的list组合起来,方法是依次拆开每个元素,拼接 ( self.observation_history[past_observation_index], [ @@ -543,7 +593,7 @@ def get_stacked_observations( ) ) - stacked_observations = numpy.concatenate( + stacked_observations = numpy.concatenate( # 向stoacked_observtions添加内容 (stacked_observations, previous_observation) ) @@ -556,15 +606,16 @@ class MinMaxStats: """ def __init__(self): - self.maximum = -float("inf") - self.minimum = float("inf") + self.maximum = -float("inf") # 最大是-∞ + self.minimum = float("inf") # 最小是+∞ + # 跟类一定要update至少两次才能产生正确的范围。第一次更新掉max self.minimum: + def normalize(self, value): #对value规范化,公式为(x-a)/(a-b) 当x∈[a,b]时 + if self.maximum > self.minimum: # 如果最大大于最小,说明至少更新了两次(第一次更新掉max0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config. + """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action + + +# Game independent +class MCTS: + """ + Core Monte Carlo Tree Search algorithm. + To decide on an action, we run N simulations, always starting at the root of + the search tree and traversing the tree according to the UCB formula until we + reach a leaf node. + """ + + def __init__(self, config): + self.config = config + + # run函数运行流程: + # 1. 获取root节点 + # (1)如果由指定节点这将root赋值为该节点; + # (2)如果没有,则 + # i. 创建新的节点Node(0) + # ii. 使用initial_inference函数通过observation获取相应的reward,hidden state,legal actions等数据 + # iii. 将ii中获取的数据赋值到创建的root节点中取 + # PS. 可以看到,在(1)的情况下不需要调用initial_inference函数 + # 2. 检查是否需要添加探索噪音 + # 3. 开始循环模拟游戏,模拟的次数由num simulation决定 + # (1) 将初始节点node设置为root,并将节点node加入search tree中 + # (2) 检查该节点是否已经扩展,如果已经扩展,则通过ucb值来选择子节点expand. 并将node 设置为选中的节点。并将节点node加入search tree中 + # (3) 重复2,直到找到expanded为false的node为止 + # (4) 选择search_tree[-2]为parent(因为最后一个是node) + # (5) 运行recurrent_inference函数,获得reward,hidden state,legal actions等数据 + # (6) 扩展node,即为node创建子节点,使node展开。 + # (7) 反向传播算法,对路径上的所有访问次数+1,value值加reward + # PS: 可以看到,通过不停的模拟,节点被一层层的扩展(每次模拟扩展一个节点)。 + # 4. 返回扩展过后的节点树root,以便之后的程序根据它选择动作action + def run( + self, + model, + observation, + legal_actions, + to_play, + add_exploration_noise, + override_root_with=None, + ): + """ + At the root of the search tree we use the representation function to obtain a + hidden state given the current observation. 
+ We then run a Monte Carlo Tree Search using only action sequences and the model + learned by the network. + """ + if override_root_with: #检查有没有提供Node,如果有,则指定;如果没有,则自己创建一个 + root = override_root_with + root_predicted_value = None + else: + root = Node(0) + observation = ( + torch.tensor(observation) + .float() + .unsqueeze(0) + .to(next(model.parameters()).device) + ) # observation转tensor,外面包一层形成一个batch。 Observation的长度由参数stacked_observation配置,主要存储之前的previous。不要之前privious的配置为0 + ( + root_predicted_value, + reward, + policy_logits, + hidden_state, + ) = model.initial_inference(observation) + root_predicted_value = models.support_to_scalar( + root_predicted_value, self.config.support_size + ).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + assert ( + legal_actions + ), f"Legal actions should not be an empty array. Got {legal_actions}." + assert set(legal_actions).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + root.expand( + legal_actions, + to_play, + reward, + policy_logits, + hidden_state, + ) + + if add_exploration_noise: + root.add_exploration_noise( + dirichlet_alpha=self.config.root_dirichlet_alpha, + exploration_fraction=self.config.root_exploration_fraction, + ) + + min_max_stats = MinMaxStats() + + max_tree_depth = 0 + for _ in range(self.config.num_simulations): # 开始模拟游戏 + virtual_to_play = to_play + node = root + search_path = [node] + current_tree_depth = 0 + + # expanded根据node的子节点个数判断是否已经扩展了,如果没有子节点,说明没被扩展 + while node.expanded(): #这个循环一直在搜索没有expand的子节点。如果子节点已经expand了,则通过select_child选择下一个 + current_tree_depth += 1 + action, node = self.select_child(node, min_max_stats) #选取ucb最大的一个action,如果有多个action得分相同,随机选取一个 + search_path.append(node) #把节点添加到搜索队列 + + # Players play turn by turn + if virtual_to_play + 1 < len(self.config.players): + virtual_to_play = self.config.players[virtual_to_play + 1] + else: + virtual_to_play = self.config.players[0] + + # 在搜索树内部,我们使用动态函数来获取给定动作的下一个hidden_state和previous hidden state + # Inside the search tree we use the dynamics function to obtain the next hidden + # state given an action and the previous hidden state + parent = search_path[-2] # 选择倒数第二个节点,因为当前的node是-1,则-2是它的parent + value, reward, policy_logits, hidden_state = model.recurrent_inference( + parent.hidden_state, + torch.tensor([[action]]).to(parent.hidden_state.device), + ) + value = models.support_to_scalar(value, self.config.support_size).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + node.expand( + self.config.action_space, + virtual_to_play, + reward, + policy_logits, + hidden_state, + ) + + self.backpropagate(search_path, value, virtual_to_play, min_max_stats) + + max_tree_depth = max(max_tree_depth, current_tree_depth) + + extra_info = { + "max_tree_depth": max_tree_depth, + "root_predicted_value": root_predicted_value, + } + return root, extra_info + + # MCTS 的select child和之前SelfPlay的select action逻辑是不一样的 + # 1. select child是根据UCB选取的,select action是根据各个动作的visit count和temperature选取的 + # 2. select child 选择的对象是Node,Node是由当前的state执行action后生成的新Node形成的。select action单纯的是选action + def select_child(self, node, min_max_stats): + """ + Select the child with the highest UCB score. 
+ """ + max_ucb = max( + self.ucb_score(node, child, min_max_stats) + for action, child in node.children.items() + ) + action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) + [ + action + for action, child in node.children.items() + if self.ucb_score(node, child, min_max_stats) == max_ucb + ] + ) + return action, node.children[action] + + def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 + """ + The score for a node is based on its value, plus an exploration bonus based on the prior. + """ + pb_c = ( + math.log( + (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 + ) + + self.config.pb_c_init + ) + pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) + + prior_score = pb_c * child.prior # prior 之前的p_value + # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) + # prior_score = pbc * prior + + if child.visit_count > 0: + # Mean value Q + value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 + child.reward + + self.config.discount # 衰减系数, 之后乘以子节点的值 + * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 + ) + else: + value_score = 0 + + return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 + + # 反向传播算法 + # 对路径上的所有访问次数+1,value值加reward + def backpropagate(self, search_path, value, to_play, min_max_stats): # MCTS反向传播,visit count加1 + """ + At the end of a simulation, we propagate the evaluation all the way up the tree + to the root. + """ + if len(self.config.players) == 1: + for node in reversed(search_path): + node.value_sum += value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * node.value()) + + value = node.reward + self.config.discount * value + + elif len(self.config.players) == 2: + for node in reversed(search_path): + node.value_sum += value if node.to_play == to_play else -value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * -node.value()) + + value = ( + -node.reward if node.to_play == to_play else node.reward + ) + self.config.discount * value + + else: + raise NotImplementedError("More than two player mode not implemented.") + + +class Node: + def __init__(self, prior): + self.visit_count = 0 #visit count默认是0,只有经过反向传播之后才能变成增加 + self.to_play = -1 + self.prior = prior + self.value_sum = 0 + self.children = {} + self.hidden_state = None + self.reward = 0 + + def expanded(self): + return len(self.children) > 0 + + def value(self): + if self.visit_count == 0: + return 0 + return self.value_sum / self.visit_count + + def expand(self, actions, to_play, reward, policy_logits, hidden_state): + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + """ + We expand a node using the value, reward and policy prediction obtained from the + neural network. + """ + self.to_play = to_play + self.reward = reward + self.hidden_state = hidden_state + + policy_values = torch.softmax( + torch.tensor([policy_logits[0][a] for a in actions]), dim=0 + ).tolist() + policy = {a: policy_values[i] for i, a in enumerate(actions)} # 列出所有的合法动作及对于的value值 + for action, p in policy.items(): + self.children[action] = Node(p) + + def add_exploration_noise(self, dirichlet_alpha, exploration_fraction): + """ + At the start of each search, we add dirichlet noise to the prior of the root to + encourage the search to explore new actions. 
+ """ + actions = list(self.children.keys()) + noise = numpy.random.dirichlet([dirichlet_alpha] * len(actions)) + frac = exploration_fraction + for a, n in zip(actions, noise): + self.children[a].prior = self.children[a].prior * (1 - frac) + n * frac + + +class GameHistory: + """ + Store only usefull information of a self-play game. + """ + + def __init__(self): + self.observation_history = [] + self.action_history = [] + self.reward_history = [] + self.to_play_history = [] + self.child_visits = [] + self.root_values = [] + self.reanalysed_predicted_root_values = None + # For PER + self.priorities = None + self.game_priority = None + + def store_search_statistics(self, root, action_space): + # Turn visit count from root into a policy + if root is not None: + sum_visits = sum(child.visit_count for child in root.children.values()) + self.child_visits.append( + [ + root.children[a].visit_count / sum_visits + if a in root.children + else 0 + for a in action_space + ] + ) + + self.root_values.append(root.value()) + else: + self.root_values.append(None) + + def get_stacked_observations( + self, index, num_stacked_observations, action_space_size + ): #根据索引index获取observation序列 + """ + Generate a new observation with the observation at the index position + and num_stacked_observations past observations and actions stacked. + """ + # Convert to positive index + index = index % len(self.observation_history) + + stacked_observations = self.observation_history[index].copy() #分为两部分,一部分是当前(current)观察值,一部分是之前的(previous)观察值 + for past_observation_index in reversed( + range(index - num_stacked_observations, index) + ): + if 0 <= past_observation_index: + previous_observation = numpy.concatenate( # np.concatenate将第一个参数的list组合起来,方法是依次拆开每个元素,拼接 + ( + self.observation_history[past_observation_index], + [ + numpy.ones_like(stacked_observations[0]) + * self.action_history[past_observation_index + 1] + / action_space_size + ], + ) + ) + else: + previous_observation = numpy.concatenate( + ( + numpy.zeros_like(self.observation_history[index]), + [numpy.zeros_like(stacked_observations[0])], + ) + ) + + stacked_observations = numpy.concatenate( # 向stoacked_observtions添加内容 + (stacked_observations, previous_observation) + ) + + return stacked_observations + + +class MinMaxStats: + """ + A class that holds the min-max values of the tree. + """ + + def __init__(self): + self.maximum = -float("inf") # 最大是-∞ + self.minimum = float("inf") # 最小是+∞ + # 跟类一定要update至少两次才能产生正确的范围。第一次更新掉max self.minimum: # 如果最大大于最小,说明至少更新了两次(第一次更新掉max self.config.ratio + and self.training_step < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + if self.config.PER: + weight_batch = torch.tensor(weight_batch.copy()).float().to(device) + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + 
) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + if self.config.PER: + # Correct PER bias by using importance-sampling (IS) weights + loss *= weight_batch + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( + 1 + ) + return value_loss, reward_loss, policy_loss diff --git a/simplifiedMuZero/no_pv/trainer_no_pv.py b/simplifiedMuZero/no_pv/trainer_no_pv.py new file mode 100644 index 00000000..f51e3ef8 --- /dev/null +++ b/simplifiedMuZero/no_pv/trainer_no_pv.py @@ -0,0 +1,301 @@ +import copy +import time + +import numpy +import ray +import torch + +import models + + +@ray.remote +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. + """ + + def __init__(self, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." 
+ ) + + if initial_checkpoint["optimizer_state"] is not None: + print("Loading optimizer...\n") + self.optimizer.load_state_dict( + copy.deepcopy(initial_checkpoint["optimizer_state"]) + ) + + def continuous_update_weights(self, replay_buffer, shared_storage): + # Wait for the replay buffer to be filled + while ray.get(shared_storage.get_info.remote("num_played_games")) < 1: + time.sleep(0.1) + + next_batch = replay_buffer.get_batch.remote() + # Training loop + while self.training_step < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + ): + index_batch, batch = ray.get(next_batch) + next_batch = replay_buffer.get_batch.remote() + self.update_lr() + ( + priorities, + total_loss, + value_loss, + reward_loss, + policy_loss, + ) = self.update_weights(batch) + + if self.config.PER: + # Save new priorities in the replay buffer (See https://arxiv.org/abs/1803.00933) + replay_buffer.update_priorities.remote(priorities, index_batch) + + # Save to the shared storage + if self.training_step % self.config.checkpoint_interval == 0: + shared_storage.set_info.remote( + { + "weights": copy.deepcopy(self.model.get_weights()), + "optimizer_state": copy.deepcopy( + models.dict_to_cpu(self.optimizer.state_dict()) + ), + } + ) + if self.config.save_model: + shared_storage.save_checkpoint.remote() + shared_storage.set_info.remote( + { + "training_step": self.training_step, + "lr": self.optimizer.param_groups[0]["lr"], + "total_loss": total_loss, + "value_loss": value_loss, + "reward_loss": reward_loss, + "policy_loss": policy_loss, + } + ) + + # Managing the self-play / training ratio + if self.config.training_delay: + time.sleep(self.config.training_delay) + if self.config.ratio: + while ( + self.training_step + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + > self.config.ratio + and self.training_step < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + ): + time.sleep(0.5) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + if self.config.PER: + weight_batch = torch.tensor(weight_batch.copy()).float().to(device) + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + 
) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + # loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + loss = reward_loss + policy_loss + if self.config.PER: + # Correct PER bias by using importance-sampling (IS) weights + loss *= weight_batch + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: # 更新optimizer的lr + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( + 1 + ) + return value_loss, reward_loss, policy_loss diff --git a/simplifiedMuZero/search_policy/RHEA.py b/simplifiedMuZero/search_policy/RHEA.py new file mode 100644 index 00000000..fe070c8b --- /dev/null +++ b/simplifiedMuZero/search_policy/RHEA.py @@ -0,0 +1,75 @@ +import copy +import numpy as np +from functools import partial + +from deap import base, creator, tools, algorithms + +from games.abstract_game import AbstractGame + +creator.create('FitnessMax', base.Fitness, weights=(1.0,)) +creator.create('Individual', list, fitness = creator.FitnessMax) + +class RHEA: + def __init__(self): + self.game = None + self.play_id = 0 + self.toolbox = base.Toolbox() + self.register("mate", tools.cxTwoPoint) + self.register("mutate", tools.mutFlipBit, indpb=0.05) + self.register("select", tools.selStochasticUniversalSampling) + + def game_evaluate(self, actions, game_stat=None, play_id=None): + game_stat = copy.deepcopy(game_stat) + game_stat.reset() + + for i in range(len(actions)): + player = game_stat.to_play() + observation, reward, done = game_stat.step(actions[i]) + if done: + break + + game_stat.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + + def evaluate(self, actions): + game_stat = copy.deepcopy(self.game) + play_id = self.play_id + + game_stat.reset() + + for i in range(len(actions)): + player = game_stat.to_play() + observation, reward, done = game_stat.step(actions[i]) + if done: + break + + game_stat.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + 
return reward, + + def individual(self, actions, max_moves, replace=False): + max_moves = max_moves if replace else len(actions) + return tools.initIterate(creator.Individual, partial(np.random.choice, actions, max_moves, replace=replace)) + def population(self, actions, max_moves, N, replace=False): + return tools.initRepeat(list, partial(self.individual, actions, max_moves, replace), N) + + def rhea(self, game_state:AbstractGame, config, play_id): + actions = game_state.legal_actions() + pop = self.population(actions. config.max_moves) + self.toolbox.register("evaluate", self.game_evaluate, game=game_state, play_id=play_id) + pop, logbook = algorithms.eaSimple(pop, self.toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False) + + results = tools.selBest(pop, k=1) + + # 返回第一个动作和评分 + return [(r[0],self.game_evaluate(actions, game_state, play_id)[0]) for r in results] # r[0]表示第一个动作 + + + + + diff --git a/simplifiedMuZero/search_policy/RHEA2.py b/simplifiedMuZero/search_policy/RHEA2.py new file mode 100644 index 00000000..73d30799 --- /dev/null +++ b/simplifiedMuZero/search_policy/RHEA2.py @@ -0,0 +1,192 @@ +import copy +import numpy as np +from functools import partial +import torch + +from deap import base, creator, tools, algorithms + +from games.abstract_game import AbstractGame +from self_play import Node +import models + +from games.tictactoe import MuZeroConfig, Game + +creator.create('FitnessMax', base.Fitness, weights=(1.0,)) +creator.create('Individual', list, fitness = creator.FitnessMax) + + +def evaluate(actions, model, observation, config): + ( + root_predicted_value, + reward, + policy_logits, + hidden_state, + ) = model.initial_inference(observation) + + for action in actions: + value, reward, policy_logits, hidden_state = model.recurrent_inference( + hidden_state, + torch.tensor([[action]]).to(observation.device), + ) + + reward = models.support_to_scalar(reward, config.support_size).item() + return reward, + +class RHEA: + def __init__(self, config, game): + self.game = game + self.config = config + self.play_id = -1 + self.toolbox = base.Toolbox() + self.toolbox.register("mate", tools.cxTwoPoint) + self.toolbox.register("mutate", tools.mutFlipBit, indpb=0.05) + self.toolbox.register("select", tools.selStochasticUniversalSampling) + + # def game_evaluate(self, actions, game_stat=None, play_id=None): + # game_stat = copy.deepcopy(game_stat) + # game_stat.reset() + # + # for i in range(len(actions)): + # player = game_stat.to_play() + # observation, reward, done = game_stat.step(actions[i]) + # if done: + # break + # + # game_stat.close() + # reward = reward if play_id == player else -reward + # # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + # reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + # return reward, + # + # def action_evaluate(self, actions): + # game_stat = copy.deepcopy(self.game) + # game_stat.reset() + # + # for i in range(len(actions)): + # player = game_stat.to_play() + # observation, reward, done = game_stat.step(actions[i]) + # if done: + # break + # + # game_stat.close() + # reward = reward if self.play_id == player else -reward + # + # return reward, actions[:(i+1)] + # + def evaluate(self, actions): + game_stat = copy.deepcopy(self.game) + play_id = self.play_id + + game_stat.reset() + + for i in range(len(actions)): + player = game_stat.to_play() + observation, reward, done = game_stat.step(actions[i]) + if done: + break + + game_stat.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + 
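As written, `RHEA.rhea()` above cannot run: `self.population(actions. config.max_moves)` has a dot where a comma belongs and omits the population size `N`, the operators in `__init__` are registered through `self.register` instead of `self.toolbox.register`, and `game_evaluate` is registered with a `game=` keyword although its parameter is named `game_stat`. For reference, a minimal self-contained sketch of the intended rollout-based planning loop with DEAP; the function names, the tournament selection and the shuffle mutation are illustrative substitutions rather than what the repository registers, and at least two legal actions are assumed so the crossover operator is applicable.

import copy
from functools import partial

import numpy as np
from deap import algorithms, base, creator, tools

# Mirrors the FitnessMax / Individual classes created at the top of these modules.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)


def rollout_fitness(actions, game, play_id):
    """Score an action sequence by replaying it in a copy of the game."""
    sim = copy.deepcopy(game)
    sim.reset()
    player, reward = play_id, 0
    for i, action in enumerate(actions):
        player = sim.to_play()
        _, reward, done = sim.step(action)
        if done:
            break
    sim.close()
    reward = reward if play_id == player else -reward
    return (reward / (i + 1),)  # shorter winning lines score higher


def plan_first_action(game, play_id, pop_size=20, ngen=10):
    legal = game.legal_actions()  # assumed to contain at least two actions
    toolbox = base.Toolbox()
    toolbox.register("individual", tools.initIterate, creator.Individual,
                     partial(np.random.choice, legal, len(legal), replace=False))
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutShuffleIndexes, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)
    toolbox.register("evaluate", rollout_fitness, game=game, play_id=play_id)

    pop = toolbox.population(n=pop_size)
    pop, _ = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                 ngen=ngen, verbose=False)
    return tools.selBest(pop, k=1)[0][0]  # first action of the best evolved plan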
return reward, + + def individual(self, actions, max_moves, replace=False): + max_moves = max_moves if replace else min(len(actions), max_moves) + return tools.initIterate(creator.Individual, partial(np.random.choice, actions, max_moves, replace=replace)) + def population(self, actions, max_moves, N, replace=False): + return tools.initRepeat(list, partial(self.individual, actions, max_moves, replace), N) + + # def rhea(self, game_state:AbstractGame): + # self.game = game_state + # self.play_id = game_state.to_play() + # actions = game_state.legal_actions() + # self.toolbox.register("evaluate", evaluate, ) + # pop = self.population(actions. self.config.max_moves) + # + # pop, logbook = algorithms.eaSimple(pop, self.toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False) + # + # results = tools.selBest(pop, k=1) + # + # return self.action_evaluate(results[0]) + + + + # # 返回第一个动作和评分 + # return [(r[0],self.game_evaluate(actions, game_state, play_id)[0]) for r in results] # r[0]表示第一个动作 + + def run(self, + model, + observation, + legal_actions, + to_play, + action_replace, + override_root_with=None, + ): + observation = ( + torch.tensor(observation) + .float() + .unsqueeze(0) + .to(next(model.parameters()).device) + ) + + # 检查可用的动作空间,如果小于等于1,则直接返回。因为进化算法无法杂交,会报错 + if len(legal_actions) <=1: + return legal_actions + else: + # self.toolbox.register("evaluate", evaluate, model=model, observation=observation, config=self.config) + self.toolbox.register("evaluate", self.evaluate) + pop = self.population(legal_actions, self.config.max_moves, self.config.num_simulations, replace=action_replace) + + pop, logbook = algorithms.eaSimple(pop, self.toolbox, cxpb=0.5, mutpb=0.2, ngen=len(legal_actions), verbose=False) + + results = tools.selBest(pop, k=1) + + return results[0] + +if __name__=="__main__": + game = Game() + config = MuZeroConfig() + game.reset() + done = False + + # rhea = RHEA(config, game) + # pop = rhea.population(game.legal_actions(), 9, config.num_simulations, config.action_replace) + # + # print(pop) + # rhea.toolbox.register("evaluate", rhea.evaluate) + # pop, logbook = algorithms.eaSimple(pop, rhea.toolbox, cxpb=0.5, mutpb=0, ngen=9, verbose=False) + # + # results = tools.selBest(pop, k=1) + # print(results) + + legal_actions = game.legal_actions() + while not done and len(legal_actions) >1: + legal_actions = game.legal_actions() + rhea = RHEA(config, game) + rhea.play_id = game.to_play() + + pop = rhea.population(legal_actions, config.max_moves, config.num_simulations, config.action_replace) + + rhea.toolbox.register("evaluate", rhea.evaluate) + + pop, logbook = algorithms.eaSimple(pop, rhea.toolbox, cxpb=0.5, mutpb=0.2, ngen=len(legal_actions), verbose=False) + + print(pop) + results = tools.selBest(pop, k=1) + print(results) + action = results[0][0] + observation, reward, done = game.step(action) + # print(observation) + + + + + + + + + + + + diff --git a/simplifiedMuZero/search_policy/__init__.py b/simplifiedMuZero/search_policy/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/simplifiedMuZero/search_policy/rhea_self_play.py b/simplifiedMuZero/search_policy/rhea_self_play.py new file mode 100644 index 00000000..ca49d875 --- /dev/null +++ b/simplifiedMuZero/search_policy/rhea_self_play.py @@ -0,0 +1,227 @@ +import math +import time + +import numpy +import ray +import torch + +import models +from simplifiedMuZero.search_policy.RHEA2 import RHEA +from self_play import GameHistory + + +@ray.remote +class SelfPlayRhea: + """ + Class which run in a dedicated thread to 
play games and save them to the replay-buffer. + """ + + def __init__(self, initial_checkpoint, Game, config, seed): + self.config = config + self.game = Game(seed) + + # Fix random generator seed + numpy.random.seed(seed) + torch.manual_seed(seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + # self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(initial_checkpoint["weights"]) + self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) + self.model.eval() + + def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): + while ray.get( + shared_storage.get_info.remote("training_step") + ) < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") + ): # 如果当前的训练步数低于训练总步数,并且没有终止的话,继续进行训练 + self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) # 从shared_storage中获取当前的参数 + + if not test_mode: + game_history = self.play_game( + self.config.visit_softmax_temperature_fn( + trained_steps=ray.get( + shared_storage.get_info.remote("training_step") + ) + ), + self.config.temperature_threshold, + False, + "self", + 0, + ) + + replay_buffer.save_game.remote(game_history, shared_storage) + + else: + # Take the best action (no exploration) in test mode + game_history = self.play_game( + 0, + self.config.temperature_threshold, + False, + "self" if len(self.config.players) == 1 else self.config.opponent, + self.config.muzero_player, + ) + + # Save to the shared storage + shared_storage.set_info.remote( + { + "episode_length": len(game_history.action_history) - 1, + "total_reward": sum(game_history.reward_history), + "mean_value": numpy.mean( + [value for value in game_history.root_values if value] + ), + } + ) + if 1 < len(self.config.players): + shared_storage.set_info.remote( + { + "muzero_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == self.config.muzero_player + ), + "opponent_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != self.config.muzero_player + ), + } + ) + + # Managing the self-play / training ratio + if not test_mode and self.config.self_play_delay: + time.sleep(self.config.self_play_delay) + if not test_mode and self.config.ratio: + while ( + ray.get(shared_storage.get_info.remote("training_step")) + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + < self.config.ratio + and ray.get(shared_storage.get_info.remote("training_step")) + < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + self.close_game() + + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory + def play_game( + self, temperature, temperature_threshold, render, opponent, muzero_player + ): + """ + Play one game with actions based on the Monte Carlo tree search at each moves. 
+ """ + game_history = GameHistory() + observation = self.game.reset() + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + + if render: + self.game.render() + + with torch.no_grad(): + while ( + not done and len(game_history.action_history) <= self.config.max_moves + ): # 游戏没有结束且运行步数小于最大移动步长 + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." + stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + + # 一下的if-else部分主要是为了选择一个动作 + # Choose the action + if opponent == "self" or muzero_player == self.game.to_play(): + # root, mcts_info = MCTS(self.config).run( + # self.model, + # stacked_observations, + # self.game.legal_actions(), + # self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + # True, + # ) + # action = self.select_action( + # root, + # temperature + # if not temperature_threshold + # or len(game_history.action_history) < temperature_threshold + # else 0, + # ) # 根据temperature选择动作 + actions = RHEA(self.config, self.game).run(self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), + self.config.action_replace, + ) + action = actions[0] + + else: + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 + opponent, stacked_observations + ) + + observation, reward, done = self.game.step(action) # 运行游戏 + + if render: + print(f"Played action: {self.game.action_to_string(action)}") + self.game.render() + + # game_history.store_search_statistics(root, self.config.action_space) + game_history.root_values.append(reward) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + return game_history + + def close_game(self): + self.game.close() + + def select_opponent_action(self, opponent, stacked_observations): + """ + Select opponent action for evaluating MuZero level. + """ + if opponent == "human": + return self.game.human_to_action(), None + elif opponent == "expert": + return self.game.expert_agent(), None + elif opponent == "random": + assert ( + self.game.legal_actions() + ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}." + assert set(self.game.legal_actions()).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." 
+ + return numpy.random.choice(self.game.legal_actions()), None + else: + raise NotImplementedError( + 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' + ) diff --git a/simplifiedMuZero/search_policy/self_play_uniform_search.py b/simplifiedMuZero/search_policy/self_play_uniform_search.py new file mode 100644 index 00000000..314249f0 --- /dev/null +++ b/simplifiedMuZero/search_policy/self_play_uniform_search.py @@ -0,0 +1,622 @@ +import math +import time + +import numpy +import ray +import torch + +import models + + +@ray.remote +class SelfPlay: + """ + Class which run in a dedicated thread to play games and save them to the replay-buffer. + """ + + def __init__(self, initial_checkpoint, Game, config, seed): + self.config = config + self.game = Game(seed) + + # Fix random generator seed + numpy.random.seed(seed) + torch.manual_seed(seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + self.model.set_weights(initial_checkpoint["weights"]) + self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) + self.model.eval() + + def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False): + while ray.get( + shared_storage.get_info.remote("training_step") + ) < self.config.training_steps and not ray.get( + shared_storage.get_info.remote("terminate") + ): # 如果当前的训练步数低于训练总步数,并且没有终止的话,继续进行训练 + self.model.set_weights(ray.get(shared_storage.get_info.remote("weights"))) # 从shared_storage中获取当前的参数 + + if not test_mode: + game_history = self.play_game( + self.config.visit_softmax_temperature_fn( + trained_steps=ray.get( + shared_storage.get_info.remote("training_step") + ) + ), + self.config.temperature_threshold, + False, + "self", + 0, + ) + + replay_buffer.save_game.remote(game_history, shared_storage) + + else: + # Take the best action (no exploration) in test mode + game_history = self.play_game( + 0, + self.config.temperature_threshold, + False, + "self" if len(self.config.players) == 1 else self.config.opponent, + self.config.muzero_player, + ) + + # Save to the shared storage + shared_storage.set_info.remote( + { + "episode_length": len(game_history.action_history) - 1, + "total_reward": sum(game_history.reward_history), + "mean_value": numpy.mean( + [value for value in game_history.root_values if value] + ), + } + ) + if 1 < len(self.config.players): + shared_storage.set_info.remote( + { + "muzero_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == self.config.muzero_player + ), + "opponent_reward": sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + != self.config.muzero_player + ), + } + ) + + # Managing the self-play / training ratio + if not test_mode and self.config.self_play_delay: + time.sleep(self.config.self_play_delay) + if not test_mode and self.config.ratio: + while ( + ray.get(shared_storage.get_info.remote("training_step")) + / max( + 1, ray.get(shared_storage.get_info.remote("num_played_steps")) + ) + < self.config.ratio + and ray.get(shared_storage.get_info.remote("training_step")) + < self.config.training_steps + and not ray.get(shared_storage.get_info.remote("terminate")) + ): + time.sleep(0.5) + + self.close_game() + + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 
运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory + def play_game( + self, temperature, temperature_threshold, render, opponent, muzero_player + ): + """ + Play one game with actions based on the Monte Carlo tree search at each moves. + """ + game_history = GameHistory() + observation = self.game.reset() + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) # to_play_history是用来存放玩家id的 + + done = False + + if render: + self.game.render() + + with torch.no_grad(): + while ( + not done and len(game_history.action_history) <= self.config.max_moves + ): # 游戏没有结束且运行步数小于最大移动步长 + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." + stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + + # 一下的if-else部分主要是为了选择一个动作 + # Choose the action + if opponent == "self" or muzero_player == self.game.to_play(): + root, mcts_info = UniformSearch(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + action = self.select_action( + root, + temperature + if not temperature_threshold + or len(game_history.action_history) < temperature_threshold + else 0, + ) # 根据temperature选择动作 + + if render: + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print( + f"Root value for player {self.game.to_play()}: {root.value():.2f}" + ) + else: + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 + opponent, stacked_observations + ) + + observation, reward, done = self.game.step(action) # 运行游戏 + + if render: + print(f"Played action: {self.game.action_to_string(action)}") + self.game.render() + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + return game_history + + def close_game(self): + self.game.close() + + def select_opponent_action(self, opponent, stacked_observations): + """ + Select opponent action for evaluating MuZero level. + """ + if opponent == "human": + root, mcts_info = UniformSearch(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), + True, + ) + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print(f"Root value for player {self.game.to_play()}: {root.value():.2f}") + print( + f"Player {self.game.to_play()} turn. 
MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}" + ) + return self.game.human_to_action(), root + elif opponent == "expert": + return self.game.expert_agent(), None + elif opponent == "random": + assert ( + self.game.legal_actions() + ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}." + assert set(self.game.legal_actions()).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + + return numpy.random.choice(self.game.legal_actions()), None + else: + raise NotImplementedError( + 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' + ) + + # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 + # 公式为 c^(1/t)。可以看到: + # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 + # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config. + """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action + + +# Game independent +class UniformSearch: + """ + Core Monte Carlo Tree Search algorithm. + To decide on an action, we run N simulations, always starting at the root of + the search tree and traversing the tree according to the UCB formula until we + reach a leaf node. + """ + + def __init__(self, config): + self.config = config + + # run函数运行流程: + # 1. 获取root节点 + # (1)如果由指定节点这将root赋值为该节点; + # (2)如果没有,则 + # i. 创建新的节点Node(0) + # ii. 使用initial_inference函数通过observation获取相应的reward,hidden state,legal actions等数据 + # iii. 将ii中获取的数据赋值到创建的root节点中取 + # PS. 可以看到,在(1)的情况下不需要调用initial_inference函数 + # 2. 检查是否需要添加探索噪音 + # 3. 开始循环模拟游戏,模拟的次数由num simulation决定 + # (1) 将初始节点node设置为root,并将节点node加入search tree中 + # (2) 检查该节点是否已经扩展,如果已经扩展,则通过ucb值来选择子节点expand. 并将node 设置为选中的节点。并将节点node加入search tree中 + # (3) 重复2,直到找到expanded为false的node为止 + # (4) 选择search_tree[-2]为parent(因为最后一个是node) + # (5) 运行recurrent_inference函数,获得reward,hidden state,legal actions等数据 + # (6) 扩展node,即为node创建子节点,使node展开。 + # (7) 反向传播算法,对路径上的所有访问次数+1,value值加reward + # PS: 可以看到,通过不停的模拟,节点被一层层的扩展(每次模拟扩展一个节点)。 + # 4. 返回扩展过后的节点树root,以便之后的程序根据它选择动作action + def run( + self, + model, + observation, + legal_actions, + to_play, + add_exploration_noise, + override_root_with=None, + ): + """ + At the root of the search tree we use the representation function to obtain a + hidden state given the current observation. + We then run a Monte Carlo Tree Search using only action sequences and the model + learned by the network. 
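The visit-count and temperature rule described in the comments above (counts raised to 1/t and renormalised) can be checked numerically in isolation; a small sketch with made-up visit counts:

import numpy

visit_counts = numpy.array([30, 10, 5, 5], dtype="float64")

for temperature in (0.25, 1.0, 4.0):
    distribution = visit_counts ** (1 / temperature)
    distribution /= distribution.sum()
    print(temperature, distribution.round(3))

# Low temperatures concentrate the mass on the most-visited action
# (temperature 0 falls back to argmax); high temperatures flatten the
# distribution towards the uniform random choice used at temperature = inf.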
+ """ + if override_root_with: #检查有没有提供Node,如果有,则指定;如果没有,则自己创建一个 + root = override_root_with + root_predicted_value = None + else: + root = Node(0) + observation = ( + torch.tensor(observation) + .float() + .unsqueeze(0) + .to(next(model.parameters()).device) + ) # observation转tensor,外面包一层形成一个batch。 Observation的长度由参数stacked_observation配置,主要存储之前的previous。不要之前privious的配置为0 + ( + root_predicted_value, + reward, + policy_logits, + hidden_state, + ) = model.initial_inference(observation) + root_predicted_value = models.support_to_scalar( + root_predicted_value, self.config.support_size + ).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + assert ( + legal_actions + ), f"Legal actions should not be an empty array. Got {legal_actions}." + assert set(legal_actions).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + root.expand( + legal_actions, + to_play, + reward, + policy_logits, + hidden_state, + ) + + if add_exploration_noise: + root.add_exploration_noise( + dirichlet_alpha=self.config.root_dirichlet_alpha, + exploration_fraction=self.config.root_exploration_fraction, + ) + + min_max_stats = MinMaxStats() + + max_tree_depth = 0 + for _ in range(self.config.num_simulations): # 开始模拟游戏 + virtual_to_play = to_play + node = root + search_path = [node] + current_tree_depth = 0 + + # expanded根据node的子节点个数判断是否已经扩展了,如果没有子节点,说明没被扩展 + while node.expanded(): #这个循环一直在搜索没有expand的子节点。如果子节点已经expand了,则通过select_child选择下一个 + current_tree_depth += 1 + action, node = self.select_child(node, min_max_stats) #选取ucb最大的一个action,如果有多个action得分相同,随机选取一个 + search_path.append(node) #把节点添加到搜索队列 + + # Players play turn by turn + if virtual_to_play + 1 < len(self.config.players): + virtual_to_play = self.config.players[virtual_to_play + 1] + else: + virtual_to_play = self.config.players[0] + + # 在搜索树内部,我们使用动态函数来获取给定动作的下一个hidden_state和previous hidden state + # Inside the search tree we use the dynamics function to obtain the next hidden + # state given an action and the previous hidden state + parent = search_path[-2] # 选择倒数第二个节点,因为当前的node是-1,则-2是它的parent + value, reward, policy_logits, hidden_state = model.recurrent_inference( + parent.hidden_state, + torch.tensor([[action]]).to(parent.hidden_state.device), + ) + value = models.support_to_scalar(value, self.config.support_size).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + node.expand( + self.config.action_space, + virtual_to_play, + reward, + policy_logits, + hidden_state, + ) + + self.backpropagate(search_path, value, virtual_to_play, min_max_stats) + + max_tree_depth = max(max_tree_depth, current_tree_depth) + + extra_info = { + "max_tree_depth": max_tree_depth, + "root_predicted_value": root_predicted_value, + } + return root, extra_info + + # MCTS 的select child和之前SelfPlay的select action逻辑是不一样的 + # 1. select child是根据UCB选取的,select action是根据各个动作的visit count和temperature选取的 + # 2. select child 选择的对象是Node,Node是由当前的state执行action后生成的新Node形成的。select action单纯的是选action + def select_child(self, node, min_max_stats): + """ + Select the child with the highest UCB score. 
+ """ + # max_ucb = max( + # self.ucb_score(node, child, min_max_stats) + # for action, child in node.children.items() + # ) + # action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) + # [ + # action + # for action, child in node.children.items() + # if self.ucb_score(node, child, min_max_stats) == max_ucb + # ] + # ) + action = numpy.random.choice([action for action,child in node.children.items()]) + return action, node.children[action] + + # def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 + # """ + # The score for a node is based on its value, plus an exploration bonus based on the prior. + # """ + # pb_c = ( + # math.log( + # (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 + # ) + # + self.config.pb_c_init + # ) + # pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) + # + # prior_score = pb_c * child.prior # prior 之前的p_value + # # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) + # # prior_score = pbc * prior + # + # if child.visit_count > 0: + # # Mean value Q + # value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 + # child.reward + # + self.config.discount # 衰减系数, 之后乘以子节点的值 + # * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 + # ) + # else: + # value_score = 0 + # + # return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 + + # 反向传播算法 + # 对路径上的所有访问次数+1,value值加reward + def backpropagate(self, search_path, value, to_play, min_max_stats): # MCTS反向传播,visit count加1 + """ + At the end of a simulation, we propagate the evaluation all the way up the tree + to the root. + """ + if len(self.config.players) == 1: + for node in reversed(search_path): + node.value_sum += value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * node.value()) + + value = node.reward + self.config.discount * value + + elif len(self.config.players) == 2: + for node in reversed(search_path): + node.value_sum += value if node.to_play == to_play else -value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * -node.value()) + + value = ( + -node.reward if node.to_play == to_play else node.reward + ) + self.config.discount * value + + else: + raise NotImplementedError("More than two player mode not implemented.") + + +class Node: + def __init__(self, prior): + self.visit_count = 0 #visit count默认是0,只有经过反向传播之后才能变成增加 + self.to_play = -1 + self.prior = prior + self.value_sum = 0 + self.children = {} + self.hidden_state = None + self.reward = 0 + + def expanded(self): + return len(self.children) > 0 + + def value(self): + if self.visit_count == 0: + return 0 + return self.value_sum / self.visit_count + + def expand(self, actions, to_play, reward, policy_logits, hidden_state): + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + """ + We expand a node using the value, reward and policy prediction obtained from the + neural network. 
+ """ + self.to_play = to_play + self.reward = reward + self.hidden_state = hidden_state + + policy_values = torch.softmax( + torch.tensor([policy_logits[0][a] for a in actions]), dim=0 + ).tolist() + policy = {a: policy_values[i] for i, a in enumerate(actions)} # 列出所有的合法动作及对于的value值 + for action, p in policy.items(): + self.children[action] = Node(p) + + def add_exploration_noise(self, dirichlet_alpha, exploration_fraction): + """ + At the start of each search, we add dirichlet noise to the prior of the root to + encourage the search to explore new actions. + """ + actions = list(self.children.keys()) + noise = numpy.random.dirichlet([dirichlet_alpha] * len(actions)) + frac = exploration_fraction + for a, n in zip(actions, noise): + self.children[a].prior = self.children[a].prior * (1 - frac) + n * frac + + +class GameHistory: + """ + Store only usefull information of a self-play game. + """ + + def __init__(self): + self.observation_history = [] + self.action_history = [] + self.reward_history = [] + self.to_play_history = [] + self.child_visits = [] + self.root_values = [] + self.reanalysed_predicted_root_values = None + # For PER + self.priorities = None + self.game_priority = None + + def store_search_statistics(self, root, action_space): + # Turn visit count from root into a policy + if root is not None: + sum_visits = sum(child.visit_count for child in root.children.values()) + self.child_visits.append( + [ + root.children[a].visit_count / sum_visits + if a in root.children + else 0 + for a in action_space + ] + ) + + self.root_values.append(root.value()) + else: + self.root_values.append(None) + + def get_stacked_observations( + self, index, num_stacked_observations, action_space_size + ): #根据索引index获取observation序列 + """ + Generate a new observation with the observation at the index position + and num_stacked_observations past observations and actions stacked. + """ + # Convert to positive index + index = index % len(self.observation_history) + + stacked_observations = self.observation_history[index].copy() #分为两部分,一部分是当前(current)观察值,一部分是之前的(previous)观察值 + for past_observation_index in reversed( + range(index - num_stacked_observations, index) + ): + if 0 <= past_observation_index: + previous_observation = numpy.concatenate( # np.concatenate将第一个参数的list组合起来,方法是依次拆开每个元素,拼接 + ( + self.observation_history[past_observation_index], + [ + numpy.ones_like(stacked_observations[0]) + * self.action_history[past_observation_index + 1] + / action_space_size + ], + ) + ) + else: + previous_observation = numpy.concatenate( + ( + numpy.zeros_like(self.observation_history[index]), + [numpy.zeros_like(stacked_observations[0])], + ) + ) + + stacked_observations = numpy.concatenate( # 向stoacked_observtions添加内容 + (stacked_observations, previous_observation) + ) + + return stacked_observations + + +class MinMaxStats: + """ + A class that holds the min-max values of the tree. + """ + + def __init__(self): + self.maximum = -float("inf") # 最大是-∞ + self.minimum = float("inf") # 最小是+∞ + # 跟类一定要update至少两次才能产生正确的范围。第一次更新掉max self.minimum: # 如果最大大于最小,说明至少更新了两次(第一次更新掉max0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config. 
+ """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action \ No newline at end of file diff --git a/simplifiedMuZero/without_rb/play_buffer.py b/simplifiedMuZero/without_rb/play_buffer.py new file mode 100644 index 00000000..ad13a67f --- /dev/null +++ b/simplifiedMuZero/without_rb/play_buffer.py @@ -0,0 +1,214 @@ +import numpy +import torch +import copy +class PlayBuffer: + """ + Class which run in a dedicated thread to store played games and generate batch. + """ + + def __init__(self, initial_checkpoint, initial_buffer, config): + self.config = config + self.buffer = copy.deepcopy(initial_buffer) # initial_buffer默认为{} + self.num_played_games = initial_checkpoint["num_played_games"] + self.num_played_steps = initial_checkpoint["num_played_steps"] + self.total_samples = sum( + [len(game_history.root_values) for game_history in self.buffer.values()] + ) + if self.total_samples != 0: + print( + f"Replay buffer initialized with {self.total_samples} samples ({self.num_played_games} games).\n" + ) + + # Fix random generator seed + numpy.random.seed(self.config.seed) + + def save_game(self, game_history): + self.buffer[self.num_played_games] = game_history + self.num_played_games += 1 + self.num_played_steps += len(game_history.root_values) + self.total_samples += len(game_history.root_values) + + if self.config.replay_buffer_size < len(self.buffer): + del_id = self.num_played_games - len(self.buffer) + self.total_samples -= len(self.buffer[del_id].root_values) + del self.buffer[del_id] + + def get_buffer(self): + return self.buffer + + def get_batch(self): + ( + index_batch, + observation_batch, + action_batch, + reward_batch, + value_batch, + policy_batch, + gradient_scale_batch, + ) = ([], [], [], [], [], [], []) + weight_batch = None + + for game_id, game_history, game_prob in self.sample_n_games( + self.config.batch_size + ): + game_pos, pos_prob = self.sample_position(game_history) + + values, rewards, policies, actions = self.make_target( + game_history, game_pos + ) + + index_batch.append([game_id, game_pos]) + observation_batch.append( + game_history.get_stacked_observations( + game_pos, + self.config.stacked_observations, + len(self.config.action_space), + ) + ) + action_batch.append(actions) + value_batch.append(values) + reward_batch.append(rewards) + policy_batch.append(policies) + gradient_scale_batch.append( + [ + min( + self.config.num_unroll_steps, + len(game_history.action_history) - game_pos, + ) + ] + * len(actions) + ) + + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1 + # value_batch: batch, num_unroll_steps+1 + # reward_batch: batch, num_unroll_steps+1 + # policy_batch: batch, num_unroll_steps+1, len(action_space) + # weight_batch: batch + # gradient_scale_batch: batch, num_unroll_steps+1 + return ( + index_batch, + ( + observation_batch, + action_batch, + value_batch, + reward_batch, + policy_batch, + weight_batch, + gradient_scale_batch, + ), + ) + + def sample_game(self, 
force_uniform=True): #将force_uniform 设置为True,强制安装平均分布选取 + """ + Sample game from buffer either uniformly or according to some priority. + See paper appendix Training. + """ + game_prob = None + + game_index = numpy.random.choice(len(self.buffer)) + game_id = self.num_played_games - len(self.buffer) + game_index + + return game_id, self.buffer[game_id], game_prob + + def sample_n_games(self, n_games): + selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) + game_prob_dict = {} + ret = [ + (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) + for game_id in selected_games + ] + return ret + + def sample_position(self, game_history): + """ + Sample position from game either uniformly or according to some priority. + See paper appendix Training. + """ + position_prob = None + + position_index = numpy.random.choice(len(game_history.root_values)) + + return position_index, position_prob + + def update_game_history(self, game_id, game_history): + # The element could have been removed since its selection and update + # if next(iter(self.buffer)) <= game_id: + # self.buffer[game_id] = game_history + + self.buffer[game_id] = game_history + + def compute_target_value(self, game_history, index): + # The value target is the discounted root value of the search tree td_steps into the + # future, plus the discounted sum of all rewards until then. + bootstrap_index = index + self.config.td_steps + if bootstrap_index < len(game_history.root_values): + root_values = ( + game_history.root_values + if game_history.reanalysed_predicted_root_values is None + else game_history.reanalysed_predicted_root_values + ) + last_step_value = ( + root_values[bootstrap_index] + if game_history.to_play_history[bootstrap_index] + == game_history.to_play_history[index] + else -root_values[bootstrap_index] + ) + + value = last_step_value * self.config.discount**self.config.td_steps + else: + value = 0 + + for i, reward in enumerate( + game_history.reward_history[index + 1 : bootstrap_index + 1] + ): + # The value is oriented from the perspective of the current player + value += ( + reward + if game_history.to_play_history[index] + == game_history.to_play_history[index + i] + else -reward + ) * self.config.discount**i + + return value + + def make_target(self, game_history, state_index): + """ + Generate targets for every unroll steps. 
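`compute_target_value` above forms the usual n-step return: discounted rewards up to `td_steps` ahead plus the discounted root value at the bootstrap position, with signs flipped whenever a reward or value belongs to the opponent. A single-player numeric sketch with a hypothetical history (so no sign flips):

discount, td_steps = 0.997, 3
reward_history = [0, 0.0, 1.0, 0.0, 1.0]  # index 0 is the dummy reward stored at reset
root_values = [0.5, 0.6, 0.7, 0.8]

index = 0
bootstrap_index = index + td_steps
value = root_values[bootstrap_index] * discount ** td_steps  # bootstrap term
for i, reward in enumerate(reward_history[index + 1: bootstrap_index + 1]):
    value += reward * discount ** i  # discounted intermediate rewards

print(value)  # 1.0 * 0.997 + 0.8 * 0.997 ** 3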
+ """ + target_values, target_rewards, target_policies, actions = [], [], [], [] + for current_index in range( + state_index, state_index + self.config.num_unroll_steps + 1 + ): + value = self.compute_target_value(game_history, current_index) + + if current_index < len(game_history.root_values): + target_values.append(value) + target_rewards.append(game_history.reward_history[current_index]) + target_policies.append(game_history.child_visits[current_index]) + actions.append(game_history.action_history[current_index]) + elif current_index == len(game_history.root_values): + target_values.append(0) + target_rewards.append(game_history.reward_history[current_index]) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(game_history.action_history[current_index]) + else: + # States past the end of games are treated as absorbing states + target_values.append(0) + target_rewards.append(0) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(numpy.random.choice(self.config.action_space)) + + return target_values, target_rewards, target_policies, actions diff --git a/simplifiedMuZero/without_rb/trainer.py b/simplifiedMuZero/without_rb/trainer.py new file mode 100644 index 00000000..265b13c5 --- /dev/null +++ b/simplifiedMuZero/without_rb/trainer.py @@ -0,0 +1,243 @@ +import numpy +import torch +import models + +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. + """ + + def __init__(self, model_cls, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = model_cls(self.config) + # self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." + ) + + # if initial_checkpoint["optimizer_state"] is not None: + # print("Loading optimizer...\n") + # self.optimizer.load_state_dict( + # copy.deepcopy(initial_checkpoint["optimizer_state"]) + # ) + + # # update weights 与 continuous update weights 的区别 + # # 1. update weights 是实际计算更新network的权重 + # # 2. 
continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + # def continuous_update_weights(self, play_buffer, terminate): # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + # next_batch = play_buffer.get_batch() + # # Training loop + # while self.training_step < self.config.training_steps and not terminate: + # index_batch, batch = next_batch + # next_batch = play_buffer.get_batch() + # self.update_lr() + # ( + # priorities, + # total_loss, + # value_loss, + # reward_loss, + # policy_loss, + # ) = self.update_weights(batch) + + def update_weights(self, batch): + """ + Perform one training step. + """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, 
len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum(1) + + return value_loss, reward_loss, policy_loss diff --git a/simplifiedMuZero/without_rb/trainer_no_PV.py b/simplifiedMuZero/without_rb/trainer_no_PV.py new file mode 100644 index 00000000..265b13c5 --- /dev/null +++ b/simplifiedMuZero/without_rb/trainer_no_PV.py @@ -0,0 +1,243 @@ +import numpy +import torch +import models + +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. 
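`loss_function` above treats the value, reward and policy heads uniformly as categorical cross-entropies over the support distributions produced by `scalar_to_support`; the core step in isolation, with random tensors standing in for the network outputs and targets:

import torch

batch_size, support_size = 4, 10
logits = torch.randn(batch_size, 2 * support_size + 1)  # raw head output
target = torch.softmax(torch.randn(batch_size, 2 * support_size + 1), dim=1)

loss = (-target * torch.nn.LogSoftmax(dim=1)(logits)).sum(1)
print(loss.shape)  # torch.Size([4]); update_weights later averages over the batch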
+ """ + + def __init__(self, model_cls, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = model_cls(self.config) + # self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." + ) + + # if initial_checkpoint["optimizer_state"] is not None: + # print("Loading optimizer...\n") + # self.optimizer.load_state_dict( + # copy.deepcopy(initial_checkpoint["optimizer_state"]) + # ) + + # # update weights 与 continuous update weights 的区别 + # # 1. update weights 是实际计算更新network的权重 + # # 2. continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + # def continuous_update_weights(self, play_buffer, terminate): # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + # next_batch = play_buffer.get_batch() + # # Training loop + # while self.training_step < self.config.training_steps and not terminate: + # index_batch, batch = next_batch + # next_batch = play_buffer.get_batch() + # self.update_lr() + # ( + # priorities, + # total_loss, + # value_loss, + # reward_loss, + # policy_loss, + # ) = self.update_weights(batch) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) 
+ + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum(1) + + return value_loss, reward_loss, policy_loss diff --git a/simplified_muzero.py b/simplified_muzero.py new file mode 100644 index 00000000..11cf7591 --- /dev/null +++ b/simplified_muzero.py @@ -0,0 +1,108 @@ +from simplifiedMuZero.net2.models2 import MuZeroNetwork_2net +from muzero_general import MuZeroGeneral +from muzero import load_model_menu, hyperparameter_search + +import json +import sys +import pathlib +import time +import nevergrad + +if __name__ == "__main__": + # muzero = MuZeroWithoutRB("",models.MuZeroNetwork, save_path_ex="muzero_without_rb") + # start_time = time.time() + # muzero.train() + # end_time = time.time() + # print("耗时: {:.2f}秒".format(end_time - start_time)) + model_cls = MuZeroNetwork_2net + if len(sys.argv) == 2: + # Train directly with: python muzero.py cartpole + muzero = MuZeroGeneral(sys.argv[1], model_cls=model_cls) + muzero.train() + elif len(sys.argv) == 3: + # Train directly with: python muzero.py cartpole '{"lr_init": 0.01}' + config = json.loads(sys.argv[2]) + muzero = MuZeroGeneral(sys.argv[1], config, model_cls=model_cls) + muzero.train() + else: + print("\nWelcome to MuZero! Here's a list of games:") + # Let user pick a game + games = [ + filename.stem + for filename in sorted(list((pathlib.Path.cwd() / "games").glob("*.py"))) + if filename.name != "abstract_game.py" + ] + for i in range(len(games)): + print(f"{i}. 
{games[i]}") + choice = input("Enter a number to choose the game: ") + valid_inputs = [str(i) for i in range(len(games))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + + # Initialize MuZero + choice = int(choice) + game_name = games[choice] + muzero = MuZeroGeneral(game_name, model_cls=model_cls) + + while True: + # Configure running options + options = [ + "Train", + "Load pretrained model", + "Diagnose model", + "Render some self play games", + "Play against MuZero", + "Test the game manually", + "Hyperparameter search", + "Exit", + ] + print() + for i in range(len(options)): + print(f"{i}. {options[i]}") + + choice = input("Enter a number to choose an action: ") + valid_inputs = [str(i) for i in range(len(options))] + while choice not in valid_inputs: + choice = input("Invalid input, enter a number listed above: ") + choice = int(choice) + if choice == 0: + start_time = time.time() + muzero.train() + end_time = time.time() + print("耗时: {:.2f}秒".format(end_time - start_time)) + elif choice == 1: + load_model_menu(muzero, game_name) + elif choice == 2: + muzero.diagnose_model(30) + elif choice == 3: + muzero.test(render=True, opponent="self", muzero_player=None) + elif choice == 4: + muzero.test(render=True, opponent="human", muzero_player=0) + elif choice == 5: + env = muzero.Game() + env.reset() + env.render() + + done = False + while not done: + action = env.human_to_action() + observation, reward, done = env.step(action) + print(f"\nAction: {env.action_to_string(action)}\nReward: {reward}") + env.render() + elif choice == 6: + # Define here the parameters to tune + # Parametrization documentation: https://facebookresearch.github.io/nevergrad/parametrization.html + muzero.terminate_workers() + del muzero + budget = 20 + parallel_experiments = 2 + lr_init = nevergrad.p.Log(lower=0.0001, upper=0.1) + discount = nevergrad.p.Log(lower=0.95, upper=0.9999) + parametrization = nevergrad.p.Dict(lr_init=lr_init, discount=discount) + best_hyperparameters = hyperparameter_search( + game_name, parametrization, budget, parallel_experiments, 20 + ) + muzero = MuZeroGeneral(game_name, best_hyperparameters, model_cls=model_cls) + else: + break + print("\nDone") \ No newline at end of file diff --git a/test/Simple_grid_test.py b/test/Simple_grid_test.py new file mode 100644 index 00000000..501ac2df --- /dev/null +++ b/test/Simple_grid_test.py @@ -0,0 +1,23 @@ +import numpy as np + +from games.simple_grid import Game +import random +import time + +g = Game() +observation = g.env.get_observation() + +# print(observer) +for i in range(1000): + actions = g.legal_actions() + observation, reward, done = g.step(random.choice(actions)) + # g.render() + print(np.array(observation).shape) + + if done: + break + + + # time.sleep(10) + +g.close() diff --git a/test/deap_test.py b/test/deap_test.py new file mode 100644 index 00000000..51b930c8 --- /dev/null +++ b/test/deap_test.py @@ -0,0 +1,120 @@ +import copy +import random + +import deap +from games.tictactoe import Game, MuZeroConfig +import numpy as np + +config = MuZeroConfig() + +from deap import base, creator, tools +import numpy as np +# 定义问题 +# creator创建的是类,第一个参数是类名,第二个参数是基类,后面的是其它参数 +creator.create('FitnessMax', base.Fitness, weights=(1.0,)) +creator.create('Individual', list, fitness = creator.FitnessMax) + +legal_actions = 9 + +toolbox = base.Toolbox() +# 注册生成基因的函数。第一个参数是函数名,因此下面的调用是toolbox.Actions。 +# 第二鸽参数是生成action的函数。 +# 后边的参数是生成函数的参数,如此为np.random.choice(range(n), N, replace=False) 
+toolbox.register("Actions", np.random.choice, range(legal_actions), config.max_moves, replace=False) +# tools.initIterate返回一个生成的动作序列 +toolbox.register("Individual", tools.initIterate, creator.Individual, toolbox.Actions) + +# ind1 = toolbox.Individual() +# print(ind1) + +# 重复生成动作序列 +toolbox.register("population", tools.initRepeat, list, toolbox.Individual) + +# pop = toolbox.population(n=36) +# print(len(pop)) + + + +game = Game(0) +game2 = copy.deepcopy(game) +game.reset() +game2.reset() + +actions = game.legal_actions() +np.random.shuffle(actions) + +# for i in range(config.max_moves): +# # game.render() +# print(game.legal_actions()) +# observation, reward, done = game.step(np.random.choice(game.legal_actions())) +# +# if done: +# break + +def evaluate(actions): + game = Game(1) + game.reset() + + for i in range(len(actions)): + player = game.to_play() + observation, reward, done = game.step(actions[i]) + if done: + break + + game.close() + reward = reward if 0 == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + + +def game_evaluate(actions, game=None, play_id=None): + game = copy.deepcopy(game) + game.reset() + + for i in range(len(actions)): + player = game.to_play() + observation, reward, done = game.step(actions[i]) + if done: + break + + game.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i+1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + # print(actions[i]) + # game.render() + +toolbox.register("evaluate", game_evaluate, game=game, play_id = 0) +# toolbox.register("evaluate", evaluate) +toolbox.register("mate", tools.cxTwoPoint) +toolbox.register("mutate", tools.mutFlipBit, indpb=0.05) +# toolbox.register("select", tools.selTournament, tournsize=2000) +# toolbox.register("select", tools.selBest) +toolbox.register("select", tools.selStochasticUniversalSampling) + +pop = toolbox.population(n=100) + +# from deap import algorithms +# pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False) +# # print(logbook) +# result = tools.selBest(pop, k=1) + +results = [[0, 6, 8, 7, 4, 5, 2, 1, 3]] +print(results) +print(evaluate(results[0])) +reward = game_evaluate(results[0],game,0) +print(reward) + +# reward = game_evaluate([0,1,3,4,6,7,2,5,9],game,0) +# print(reward) +# +# for i in range(20): +# print(game_evaluate(pop[i], game, 0)) + +# print(evaluate(actions, game, 0)) + +# print(actions[:i]) +# game.render() +# game2.render() diff --git a/test/deap_test2.py b/test/deap_test2.py new file mode 100644 index 00000000..ad6de6bc --- /dev/null +++ b/test/deap_test2.py @@ -0,0 +1,119 @@ +import copy +import random + +import deap +from games.tictactoe import Game, MuZeroConfig +import numpy as np +from functools import partial + +config = MuZeroConfig() + +from deap import base, creator, tools +import numpy as np +# 定义问题 +# creator创建的是类,第一个参数是类名,第二个参数是基类,后面的是其它参数 +creator.create('FitnessMax', base.Fitness, weights=(1.0,)) +creator.create('Individual', list, fitness = creator.FitnessMax) + +legal_actions = 9 + +toolbox = base.Toolbox() +# 注册生成基因的函数。第一个参数是函数名,因此下面的调用是toolbox.Actions。 +# 第二鸽参数是生成action的函数。 +# 后边的参数是生成函数的参数,如此为np.random.choice(range(n), N, replace=False) +# toolbox.register("Actions", np.random.choice, range(legal_actions), config.max_moves, replace=False) +# # tools.initIterate返回一个生成的动作序列 +# toolbox.register("Individual", tools.initIterate, creator.Individual, toolbox.Actions) + +def individual(actions, max_moves, 
replace=False): + max_moves = max_moves if replace else len(actions) + return tools.initIterate(creator.Individual, partial(np.random.choice, actions, max_moves, replace=replace)) + +# print(individual([0,1,2,3,4], 9, replace=False)) +# print(individual([0,1,2,3,4], 9, replace=True)) +# exit() + +def population(actions, max_moves, N, replace=False): + return tools.initRepeat(list, partial(individual, actions, max_moves, replace), N) + +pop = population(range(9),9, N=4, replace=False) +print(pop) + +# exit() +# +# # 重复生成动作序列 +# toolbox.register("population", tools.initRepeat, list, toolbox.Individual) + +game = Game(0) + +actions = game.legal_actions() +np.random.shuffle(actions) + +def evaluate(actions): + game = Game(1) + game.reset() + + for i in range(len(actions)): + player = game.to_play() + observation, reward, done = game.step(actions[i]) + if done: + break + + game.close() + reward = reward if 0 == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i + 1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + + +def game_evaluate(actions, game=None, play_id=None): + game = copy.deepcopy(game) + game.reset() + + for i in range(len(actions)): + player = game.to_play() + observation, reward, done = game.step(actions[i]) + if done: + break + + game.close() + reward = reward if play_id == player else -reward + # 因为i是从0开始的,如果第一个action就结束,会出现NAN异常 + reward /= i+1 # 路径越长,回报越低。以便寻找到最近的路径 + return reward, + # print(actions[i]) + # game.render() + +toolbox.register("evaluate", game_evaluate, game=game, play_id = 0) +# toolbox.register("evaluate", evaluate) +toolbox.register("mate", tools.cxTwoPoint) +toolbox.register("mutate", tools.mutFlipBit, indpb=0.05) +# toolbox.register("select", tools.selTournament, tournsize=2000) +# toolbox.register("select", tools.selBest) +toolbox.register("select", tools.selStochasticUniversalSampling) + +# pop = toolbox.population(n=100) +# pop = [[0, 6, 8, 7, 4, 5, 2, 1, 3], [0, 6, 3, 7, 4, 5, 2, 1, 8]] + +from deap import algorithms +pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, verbose=False) +# # print(logbook) +results = tools.selBest(pop, k=1) + +# results = [[0, 6, 8, 7, 4, 5, 2, 1, 3]] +print(results) +print(evaluate(results[0])) +reward = game_evaluate(results[0],game,0) +print(reward) + +# reward = game_evaluate([0,1,3,4,6,7,2,5,9],game,0) +# print(reward) +# +# for i in range(20): +# print(game_evaluate(pop[i], game, 0)) + +# print(evaluate(actions, game, 0)) + +# print(actions[:i]) +# game.render() +# game2.render() diff --git a/test/game_play_test.py b/test/game_play_test.py new file mode 100644 index 00000000..78fdc4a5 --- /dev/null +++ b/test/game_play_test.py @@ -0,0 +1,704 @@ +from self_play import MCTS, GameHistory +from games.simple_grid import MuZeroConfig, Game +# from games.tictactoe import MuZeroConfig, Game +import models + +import numpy +import torch + +import math +import time +import copy + +class MySelfPlay: + """ + Class which run in a dedicated thread to play games and save them to the replay-buffer. 
+ """ + + def __init__(self, model, initial_checkpoint, Game, config, seed): + self.config = config + self.game = Game(seed) + + # Fix random generator seed + numpy.random.seed(seed) + torch.manual_seed(seed) + + # Initialize the network + # self.model = models.MuZeroNetwork(self.config) + # self.model.set_weights(initial_checkpoint["weights"]) + self.model = model + self.model.to(torch.device("cuda" if self.config.selfplay_on_gpu else "cpu")) + self.model.eval() + self.trained_steps = initial_checkpoint["training_step"] + self.terminate = False + + #play game 运行 + # 合法的actions是固定的,由游戏文件提供(在本函数中,可以看到调用legal_actions函数没有使用env,这表面现游戏环境于的改变于动作无关)。 + # 运行步骤: + # 1. 创建GameHistory用来存储数据 + # 2. 检查游戏是否结束或者到底最大移动次数 + # 3. 获取stacked observation(因为有些游戏需要考虑之前的历史数据和移动轨迹) + # 4. 运行MCTS搜索下一步的action + # 5. 调用游戏函数step(action),获取下一步action之后的observation、reward和done + # 6. 持续运行2-5步直到结束 + # 7. 返回GameHistory + def play_game( + self, temperature, temperature_threshold, render, opponent, muzero_player + ): + """ + Play one game with actions based on the Monte Carlo tree search at each moves. + """ + game_history = GameHistory() + observation = self.game.reset() + game_history.action_history.append(0) + game_history.observation_history.append(observation) # 添加reset之后的observation + game_history.reward_history.append(0) + game_history.to_play_history.append(self.game.to_play()) + + done = False + game_id = None + + if render: + self.game.render() + + game_id = self.game.to_play() + + with torch.no_grad(): + while ( + not done and len(game_history.action_history) <= self.config.max_moves + ): # 游戏没有结束且运行步数小于最大移动步长 + assert ( + len(numpy.array(observation).shape) == 3 + ), f"Observation should be 3 dimensionnal instead of {len(numpy.array(observation).shape)} dimensionnal. Got observation of shape: {numpy.array(observation).shape}" + assert ( + numpy.array(observation).shape == self.config.observation_shape + ), f"Observation should match the observation_shape defined in MuZeroConfig. Expected {self.config.observation_shape} but got {numpy.array(observation).shape}." 
+ stacked_observations = game_history.get_stacked_observations( + -1, self.config.stacked_observations, len(self.config.action_space) + ) + # index是-1,game_history 会在创建时添加reset的observation,因此其长度为1.index取模(%)之后时1 + # config.stacked_observationis是存储之前的observation的数量,如果不要之前的信息,可以设为0,这样就不会存储之前的信息 + + # 一下的if-else部分主要是为了选择一个动作 + # Choose the action + if opponent == "self" or muzero_player == self.game.to_play(): + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), # to_play返回当期玩游戏的玩家ID,默认是0 + True, + ) + action = self.select_action( + root, + temperature + if not temperature_threshold + or len(game_history.action_history) < temperature_threshold + else 0, + ) # 根据temperature选择动作 + + if render: + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print( + f"Root value for player {self.game.to_play()}: {root.value():.2f}" + ) + else: + action, root = self.select_opponent_action( #选择对手动作,分为随机,human和expert三种 + opponent, stacked_observations + ) + + observation, reward, done = self.game.step(action) # 运行游戏 + + if render: + print(f"Played action: {self.game.action_to_string(action)}") + self.game.render() + + game_history.store_search_statistics(root, self.config.action_space) + + # Next batch + game_history.action_history.append(action) + game_history.observation_history.append(observation) #添加到observation的队列。取数据是使用stacked_observation函数,从后往前取 + game_history.reward_history.append(reward) + game_history.to_play_history.append(self.game.to_play()) + + return game_id, game_history + + def close_game(self): + self.game.close() + + def select_opponent_action(self, opponent, stacked_observations): + """ + Select opponent action for evaluating MuZero level. + """ + if opponent == "human": + root, mcts_info = MCTS(self.config).run( + self.model, + stacked_observations, + self.game.legal_actions(), + self.game.to_play(), + True, + ) + print(f'Tree depth: {mcts_info["max_tree_depth"]}') + print(f"Root value for player {self.game.to_play()}: {root.value():.2f}") + print( + f"Player {self.game.to_play()} turn. MuZero suggests {self.game.action_to_string(self.select_action(root, 0))}" + ) + return self.game.human_to_action(), root + elif opponent == "expert": + return self.game.expert_agent(), None + elif opponent == "random": + assert ( + self.game.legal_actions() + ), f"Legal actions should not be an empty array. Got {self.game.legal_actions()}." + assert set(self.game.legal_actions()).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space." + + return numpy.random.choice(self.game.legal_actions()), None + else: + raise NotImplementedError( + 'Wrong argument: "opponent" argument should be "self", "human", "expert" or "random"' + ) + + # 根据访问次数分布和温度选择操作。 温度通过配置中的visit_softmax_Temperature函数动态改变。 + # 公式为 c^(1/t)。可以看到: + # t越小,1/t于接近于无穷大,值大的c就越容易被选中。 + # t越大,1/t->0。c^0=1。则所有的访问次数变为相同的1,难以区分大小,因此就会相当于随机选择 + # 特殊地,当t=0时,使用random完全随机选择,当t=+∞,使用argmax选择最大的 + @staticmethod # 静态方法修饰符,类似于static关键字 + def select_action(node, temperature): + """ + Select action according to the visit count distribution and the temperature. + The temperature is changed dynamically with the visit_softmax_temperature function + in the config. 
+ """ + visit_counts = numpy.array( + [child.visit_count for child in node.children.values()], dtype="int32" + ) + actions = [action for action in node.children.keys()] + if temperature == 0: + action = actions[numpy.argmax(visit_counts)] + elif temperature == float("inf"): + action = numpy.random.choice(actions) + else: + # See paper appendix Data Generation + visit_count_distribution = visit_counts ** (1 / temperature) + visit_count_distribution = visit_count_distribution / sum( + visit_count_distribution + ) + action = numpy.random.choice(actions, p=visit_count_distribution) + + return action + +class PlayBuffer: + """ + Class which run in a dedicated thread to store played games and generate batch. + """ + + def __init__(self, initial_checkpoint, initial_buffer, config): + self.config = config + self.buffer = copy.deepcopy(initial_buffer) # initial_buffer默认为{} + self.num_played_games = initial_checkpoint["num_played_games"] + self.num_played_steps = initial_checkpoint["num_played_steps"] + self.total_samples = sum( + [len(game_history.root_values) for game_history in self.buffer.values()] + ) + if self.total_samples != 0: + print( + f"Replay buffer initialized with {self.total_samples} samples ({self.num_played_games} games).\n" + ) + + # Fix random generator seed + numpy.random.seed(self.config.seed) + + def save_game(self, game_history): + self.buffer[self.num_played_games] = game_history + self.num_played_games += 1 + self.num_played_steps += len(game_history.root_values) + self.total_samples += len(game_history.root_values) + + if self.config.replay_buffer_size < len(self.buffer): + del_id = self.num_played_games - len(self.buffer) + self.total_samples -= len(self.buffer[del_id].root_values) + del self.buffer[del_id] + + def get_buffer(self): + return self.buffer + + def get_batch(self): + ( + index_batch, + observation_batch, + action_batch, + reward_batch, + value_batch, + policy_batch, + gradient_scale_batch, + ) = ([], [], [], [], [], [], []) + weight_batch = None + + for game_id, game_history, game_prob in self.sample_n_games( + self.config.batch_size + ): + game_pos, pos_prob = self.sample_position(game_history) + + values, rewards, policies, actions = self.make_target( + game_history, game_pos + ) + + index_batch.append([game_id, game_pos]) + observation_batch.append( + game_history.get_stacked_observations( + game_pos, + self.config.stacked_observations, + len(self.config.action_space), + ) + ) + action_batch.append(actions) + value_batch.append(values) + reward_batch.append(rewards) + policy_batch.append(policies) + gradient_scale_batch.append( + [ + min( + self.config.num_unroll_steps, + len(game_history.action_history) - game_pos, + ) + ] + * len(actions) + ) + + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1 + # value_batch: batch, num_unroll_steps+1 + # reward_batch: batch, num_unroll_steps+1 + # policy_batch: batch, num_unroll_steps+1, len(action_space) + # weight_batch: batch + # gradient_scale_batch: batch, num_unroll_steps+1 + return ( + index_batch, + ( + observation_batch, + action_batch, + value_batch, + reward_batch, + policy_batch, + weight_batch, + gradient_scale_batch, + ), + ) + + def sample_game(self, force_uniform=True): #将force_uniform 设置为True,强制安装平均分布选取 + """ + Sample game from buffer either uniformly or according to some priority. + See paper appendix Training. 
+ """ + game_prob = None + + game_index = numpy.random.choice(len(self.buffer)) + game_id = self.num_played_games - len(self.buffer) + game_index + + return game_id, self.buffer[game_id], game_prob + + def sample_n_games(self, n_games): + selected_games = numpy.random.choice(list(self.buffer.keys()), n_games) + game_prob_dict = {} + ret = [ + (game_id, self.buffer[game_id], game_prob_dict.get(game_id)) + for game_id in selected_games + ] + return ret + + def sample_position(self, game_history): + """ + Sample position from game either uniformly or according to some priority. + See paper appendix Training. + """ + position_prob = None + + position_index = numpy.random.choice(len(game_history.root_values)) + + return position_index, position_prob + + def update_game_history(self, game_id, game_history): + # The element could have been removed since its selection and update + # if next(iter(self.buffer)) <= game_id: + # self.buffer[game_id] = game_history + + self.buffer[game_id] = game_history + + def compute_target_value(self, game_history, index): + # The value target is the discounted root value of the search tree td_steps into the + # future, plus the discounted sum of all rewards until then. + bootstrap_index = index + self.config.td_steps + if bootstrap_index < len(game_history.root_values): + root_values = ( + game_history.root_values + if game_history.reanalysed_predicted_root_values is None + else game_history.reanalysed_predicted_root_values + ) + last_step_value = ( + root_values[bootstrap_index] + if game_history.to_play_history[bootstrap_index] + == game_history.to_play_history[index] + else -root_values[bootstrap_index] + ) + + value = last_step_value * self.config.discount**self.config.td_steps + else: + value = 0 + + for i, reward in enumerate( + game_history.reward_history[index + 1 : bootstrap_index + 1] + ): + # The value is oriented from the perspective of the current player + value += ( + reward + if game_history.to_play_history[index] + == game_history.to_play_history[index + i] + else -reward + ) * self.config.discount**i + + return value + + def make_target(self, game_history, state_index): + """ + Generate targets for every unroll steps. 
+ """ + target_values, target_rewards, target_policies, actions = [], [], [], [] + for current_index in range( + state_index, state_index + self.config.num_unroll_steps + 1 + ): + value = self.compute_target_value(game_history, current_index) + + if current_index < len(game_history.root_values): + target_values.append(value) + target_rewards.append(game_history.reward_history[current_index]) + target_policies.append(game_history.child_visits[current_index]) + actions.append(game_history.action_history[current_index]) + elif current_index == len(game_history.root_values): + target_values.append(0) + target_rewards.append(game_history.reward_history[current_index]) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(game_history.action_history[current_index]) + else: + # States past the end of games are treated as absorbing states + target_values.append(0) + target_rewards.append(0) + # Uniform policy + target_policies.append( + [ + 1 / len(game_history.child_visits[0]) + for _ in range(len(game_history.child_visits[0])) + ] + ) + actions.append(numpy.random.choice(self.config.action_space)) + + return target_values, target_rewards, target_policies, actions + +class Trainer: + """ + Class which run in a dedicated thread to train a neural network and save it + in the shared storage. + """ + + def __init__(self, initial_checkpoint, config): + self.config = config + + # Fix random generator seed + numpy.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + + # Initialize the network + self.model = models.MuZeroNetwork(self.config) + # self.model.set_weights(copy.deepcopy(initial_checkpoint["weights"])) + self.model.to(torch.device("cuda" if self.config.train_on_gpu else "cpu")) + self.model.train() + + self.training_step = initial_checkpoint["training_step"] + + if "cuda" not in str(next(self.model.parameters()).device): + print("You are not training on GPU.\n") + + # Initialize the optimizer + if self.config.optimizer == "SGD": + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr_init, + momentum=self.config.momentum, + weight_decay=self.config.weight_decay, + ) + elif self.config.optimizer == "Adam": + self.optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self.config.lr_init, + weight_decay=self.config.weight_decay, + ) + else: + raise NotImplementedError( + f"{self.config.optimizer} is not implemented. You can change the optimizer manually in trainer.py." + ) + + # if initial_checkpoint["optimizer_state"] is not None: + # print("Loading optimizer...\n") + # self.optimizer.load_state_dict( + # copy.deepcopy(initial_checkpoint["optimizer_state"]) + # ) + + # # update weights 与 continuous update weights 的区别 + # # 1. update weights 是实际计算更新network的权重 + # # 2. continuous update weights 从replay buffer 里获取数据batch, 并将batch传递给update weights,使update weights完成参数更新 + # def continuous_update_weights(self, play_buffer, terminate): # terminate是用来记录replay buffer等其它程序是否终止的,跟game的状态无关 + # next_batch = play_buffer.get_batch() + # # Training loop + # while self.training_step < self.config.training_steps and not terminate: + # index_batch, batch = next_batch + # next_batch = play_buffer.get_batch() + # self.update_lr() + # ( + # priorities, + # total_loss, + # value_loss, + # reward_loss, + # policy_loss, + # ) = self.update_weights(batch) + + def update_weights(self, batch): + """ + Perform one training step. 
+ """ + + ( + observation_batch, + action_batch, + target_value, + target_reward, + target_policy, + weight_batch, + gradient_scale_batch, + ) = batch + + # Keep values as scalars for calculating the priorities for the prioritized replay + target_value_scalar = numpy.array(target_value, dtype="float32") + priorities = numpy.zeros_like(target_value_scalar) + + device = next(self.model.parameters()).device + observation_batch = ( + torch.tensor(numpy.array(observation_batch)).float().to(device) + ) + action_batch = torch.tensor(action_batch).long().to(device).unsqueeze(-1) + target_value = torch.tensor(target_value).float().to(device) + target_reward = torch.tensor(target_reward).float().to(device) + target_policy = torch.tensor(target_policy).float().to(device) + gradient_scale_batch = torch.tensor(gradient_scale_batch).float().to(device) + # observation_batch: batch, channels, height, width + # action_batch: batch, num_unroll_steps+1, 1 (unsqueeze) + # target_value: batch, num_unroll_steps+1 + # target_reward: batch, num_unroll_steps+1 + # target_policy: batch, num_unroll_steps+1, len(action_space) + # gradient_scale_batch: batch, num_unroll_steps+1 + + target_value = models.scalar_to_support(target_value, self.config.support_size) + target_reward = models.scalar_to_support( + target_reward, self.config.support_size + ) + # target_value: batch, num_unroll_steps+1, 2*support_size+1 + # target_reward: batch, num_unroll_steps+1, 2*support_size+1 + + ## Generate predictions + value, reward, policy_logits, hidden_state = self.model.initial_inference( + observation_batch + ) + predictions = [(value, reward, policy_logits)] + for i in range(1, action_batch.shape[1]): + value, reward, policy_logits, hidden_state = self.model.recurrent_inference( + hidden_state, action_batch[:, i] + ) + # Scale the gradient at the start of the dynamics function (See paper appendix Training) + hidden_state.register_hook(lambda grad: grad * 0.5) + predictions.append((value, reward, policy_logits)) + # predictions: num_unroll_steps+1, 3, batch, 2*support_size+1 | 2*support_size+1 | 9 (according to the 2nd dim) + + ## Compute losses + value_loss, reward_loss, policy_loss = (0, 0, 0) + value, reward, policy_logits = predictions[0] + # Ignore reward loss for the first batch step + current_value_loss, _, current_policy_loss = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, 0], + target_reward[:, 0], + target_policy[:, 0], + ) + value_loss += current_value_loss + policy_loss += current_policy_loss + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, 0] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, 0]) + ** self.config.PER_alpha + ) + + for i in range(1, len(predictions)): + value, reward, policy_logits = predictions[i] + ( + current_value_loss, + current_reward_loss, + current_policy_loss, + ) = self.loss_function( + value.squeeze(-1), + reward.squeeze(-1), + policy_logits, + target_value[:, i], + target_reward[:, i], + target_policy[:, i], + ) + + # Scale gradient by the number of unroll steps (See paper appendix Training) + current_value_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_reward_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) + current_policy_loss.register_hook( + lambda grad: grad / gradient_scale_batch[:, i] + ) 
+ + value_loss += current_value_loss + reward_loss += current_reward_loss + policy_loss += current_policy_loss + + # Compute priorities for the prioritized replay (See paper appendix Training) + pred_value_scalar = ( + models.support_to_scalar(value, self.config.support_size) + .detach() + .cpu() + .numpy() + .squeeze() + ) + priorities[:, i] = ( + numpy.abs(pred_value_scalar - target_value_scalar[:, i]) + ** self.config.PER_alpha + ) + + # Scale the value loss, paper recommends by 0.25 (See paper appendix Reanalyze) + loss = value_loss * self.config.value_loss_weight + reward_loss + policy_loss + + # Mean over batch dimension (pseudocode do a sum) + loss = loss.mean() + + # Optimize + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + self.training_step += 1 + + return ( + priorities, + # For log purpose + loss.item(), + value_loss.mean().item(), + reward_loss.mean().item(), + policy_loss.mean().item(), + ) + + def update_lr(self): + """ + Update learning rate + """ + lr = self.config.lr_init * self.config.lr_decay_rate ** ( + self.training_step / self.config.lr_decay_steps + ) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + @staticmethod + def loss_function( + value, + reward, + policy_logits, + target_value, + target_reward, + target_policy, + ): + # Cross-entropy seems to have a better convergence than MSE + value_loss = (-target_value * torch.nn.LogSoftmax(dim=1)(value)).sum(1) + reward_loss = (-target_reward * torch.nn.LogSoftmax(dim=1)(reward)).sum(1) + policy_loss = (-target_policy * torch.nn.LogSoftmax(dim=1)(policy_logits)).sum( + 1 + ) + return value_loss, reward_loss, policy_loss + +if __name__ == "__main__": + config = MuZeroConfig() + + checkpoint = { + "weights": None, + "optimizer_state": None, + "total_reward": 0, + "muzero_reward": 0, + "opponent_reward": 0, + "episode_length": 0, + "mean_value": 0, + "training_step": 0, + "lr": 0, + "total_loss": 0, + "value_loss": 0, + "reward_loss": 0, + "policy_loss": 0, + "num_played_games": 0, + "num_played_steps": 0, + "num_reanalysed_games": 0, + "terminate": False, + } + + trainer = Trainer(checkpoint, config) + selfplay = MySelfPlay(trainer.model, checkpoint, Game, config, config.seed) + buffer = {} + play_buffer = PlayBuffer(checkpoint, buffer, config) + for i in range(config.training_steps): + game_id, game_history = selfplay.play_game(selfplay.config.visit_softmax_temperature_fn(0), selfplay.config.temperature_threshold, False, "self",0) + + # print(game_id) + # print(game_history.action_history) + print(game_history.reward_history) + muzero_reward = sum( + reward + for i, reward in enumerate(game_history.reward_history) + if game_history.to_play_history[i - 1] + == config.muzero_player + ) + + print(muzero_reward) + # print(game_history.to_play_history) + # # print(game_history.observation_history) + # print("child visits", game_history.child_visits) + # print(game_history.root_values) # root value指的是root节点的UCB值 + + # buffer[game_id] = game_history + + play_buffer.update_game_history(game_id, game_history) + + for i in range(10): + index_batch, batch = play_buffer.get_batch() + # print(batch[1]) + trainer.update_lr() + trainer.update_weights(batch) + + selfplay.close_game() + + diff --git a/test/load_model.py b/test/load_model.py new file mode 100644 index 00000000..88e83520 --- /dev/null +++ b/test/load_model.py @@ -0,0 +1,12 @@ +import torch + +import simplifiedMuZero.net2.models2 as models +from games.tictactoe import Game, MuZeroConfig + +from game_tournament import 
load_model + +config = MuZeroConfig() + +muzero_2net_checkpoint_path = r"C:\Users\chunchang\workspace\muzero-general\results\tictactoe\2023-08-21--22-01-34\muzero_2net\model.checkpoint" +muzero_2net_model = load_model(models.MuZeroNetwork_2net, muzero_2net_checkpoint_path, config) + diff --git a/test/mcts_test.py b/test/mcts_test.py new file mode 100644 index 00000000..d3edc0f3 --- /dev/null +++ b/test/mcts_test.py @@ -0,0 +1,245 @@ +import models +from self_play import MCTS, GameHistory, Node, MinMaxStats +from games.tictactoe import MuZeroConfig, Game + +import torch +import numpy +import math + +class MCTS1: + """ + Core Monte Carlo Tree Search algorithm. + To decide on an action, we run N simulations, always starting at the root of + the search tree and traversing the tree according to the UCB formula until we + reach a leaf node. + """ + + def __init__(self, config): + self.config = config + + # How run() works: + # 1. Obtain the root node + # (1) if a node is passed in via override_root_with, use it as the root; + # (2) otherwise: + # i. create a new Node(0) + # ii. call initial_inference on the observation to obtain the predicted value, reward, policy logits and hidden state + # iii. expand the newly created root with the data obtained in ii + # Note: in case (1) initial_inference does not need to be called + # 2. Add exploration noise to the root if requested + # 3. Simulate games in a loop, num_simulations times: + # (1) set node to the root and append it to the search path + # (2) if the node is already expanded, select a child by its UCB score, make it the current node and append it to the search path + # (3) repeat (2) until a node that has not been expanded is reached + # (4) take search_path[-2] as the parent (the last entry is the node itself) + # (5) call recurrent_inference to obtain the value, reward, policy logits and next hidden state + # (6) expand the node, i.e. create its children + # (7) backpropagate: increment the visit count and accumulate the value along the search path + # Note: each simulation expands exactly one new node, so the tree grows layer by layer + # 4. Return the expanded root so that the caller can pick an action from it + def run( + self, + model, + observation, + legal_actions, + to_play, + add_exploration_noise, + override_root_with=None, + ): + """ + At the root of the search tree we use the representation function to obtain a + hidden state given the current observation. + We then run a Monte Carlo Tree Search using only action sequences and the model + learned by the network. + """ + print(override_root_with) + if override_root_with: # use the provided node as the root if one is given, otherwise create a new one + root = override_root_with + root_predicted_value = None + else: + root = Node(0) + observation = ( + torch.tensor(observation) + .float() + .unsqueeze(0) + .to(next(model.parameters()).device) + ) # convert the observation to a tensor and add a batch dimension; the number of stacked previous observations is set by config.stacked_observations (use 0 to keep only the current observation) + ( + root_predicted_value, + reward, + policy_logits, + hidden_state, + ) = model.initial_inference(observation) + root_predicted_value = models.support_to_scalar( + root_predicted_value, self.config.support_size + ).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + assert ( + legal_actions + ), f"Legal actions should not be an empty array. Got {legal_actions}." + assert set(legal_actions).issubset( + set(self.config.action_space) + ), "Legal actions should be a subset of the action space."
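+            # Descriptive note: expand() below creates one child per legal action, using the softmax of the
+            # predicted policy logits (restricted to the legal actions) as the children's priors; this is based
+            # on the Node.expand implementation in self_play.py.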
+ root.expand( + legal_actions, + to_play, + reward, + policy_logits, + hidden_state, + ) + + if add_exploration_noise: + root.add_exploration_noise( + dirichlet_alpha=self.config.root_dirichlet_alpha, + exploration_fraction=self.config.root_exploration_fraction, + ) + + min_max_stats = MinMaxStats() + + max_tree_depth = 0 + for _ in range(self.config.num_simulations): # 开始模拟游戏 + virtual_to_play = to_play + node = root + search_path = [node] + current_tree_depth = 0 + + # expanded根据node的子节点个数判断是否已经扩展了,如果没有子节点,说明没被扩展 + while node.expanded(): #这个循环一直在搜索没有expand的子节点。如果子节点已经expand了,则通过select_child选择下一个 + current_tree_depth += 1 + action, node = self.select_child(node, min_max_stats) #选取ucb最大的一个action,如果有多个action得分相同,随机选取一个 + search_path.append(node) #把节点添加到搜索队列 + + # Players play turn by turn + if virtual_to_play + 1 < len(self.config.players): + virtual_to_play = self.config.players[virtual_to_play + 1] + else: + virtual_to_play = self.config.players[0] + + # 在搜索树内部,我们使用动态函数来获取给定动作的下一个hidden_state和previous hidden state + # Inside the search tree we use the dynamics function to obtain the next hidden + # state given an action and the previous hidden state + parent = search_path[-2] # 选择倒数第二个节点,因为当前的node是-1,则-2是它的parent + value, reward, policy_logits, hidden_state = model.recurrent_inference( + parent.hidden_state, + torch.tensor([[action]]).to(parent.hidden_state.device), + ) + value = models.support_to_scalar(value, self.config.support_size).item() + reward = models.support_to_scalar(reward, self.config.support_size).item() + # expand一层节点,actions是动作列表,policy_logits是rewards列表 + # 通过该函数,在该节点扩展一层节点 + node.expand( + self.config.action_space, + virtual_to_play, + reward, + policy_logits, + hidden_state, + ) + + self.backpropagate(search_path, value, virtual_to_play, min_max_stats) + + max_tree_depth = max(max_tree_depth, current_tree_depth) + + extra_info = { + "max_tree_depth": max_tree_depth, + "root_predicted_value": root_predicted_value, + } + return root, extra_info + + # MCTS 的select child和之前SelfPlay的select action逻辑是不一样的 + # 1. select child是根据UCB选取的,select action是根据各个动作的visit count和temperature选取的 + # 2. select child 选择的对象是Node,Node是由当前的state执行action后生成的新Node形成的。select action单纯的是选action + def select_child(self, node, min_max_stats): + """ + Select the child with the highest UCB score. + """ + max_ucb = max( + self.ucb_score(node, child, min_max_stats) + for action, child in node.children.items() + ) + action = numpy.random.choice( # 随机选择ucb值等于最大ucb的动作(因为可能有多个动作的值都达到了最大的ucb,如果只有一个,那么就会选取这个) + [ + action + for action, child in node.children.items() + if self.ucb_score(node, child, min_max_stats) == max_ucb + ] + ) + return action, node.children[action] + + def ucb_score(self, parent, child, min_max_stats): #该函数只进行一步查询,不进行多步 + """ + The score for a node is based on its value, plus an exploration bonus based on the prior. 
+ """ + pb_c = ( + math.log( + (parent.visit_count + self.config.pb_c_base + 1) / self.config.pb_c_base # pc_c_base由配置文件决定 + ) + + self.config.pb_c_init + ) + pb_c *= math.sqrt(parent.visit_count) / (child.visit_count + 1) + + prior_score = pb_c * child.prior # prior 之前的p_value + # 公式 pb_c = (log((N+C+1)/C)+init ) * sqrt(N/(VC+1)) + # prior_score = pbc * prior + + if child.visit_count > 0: + # Mean value Q + value_score = min_max_stats.normalize( # 括号里的是Q值,Q=E[r+r*Q'。此处在对其进行正则化 + child.reward + + self.config.discount # 衰减系数, 之后乘以子节点的值 + * (child.value() if len(self.config.players) == 1 else -child.value()) # 根据players的个数,如果大于1,则子节点必定是对手,因此子节点的取负。 + ) + else: + value_score = 0 + + return prior_score + value_score # 先前的分数加上Q值就是新的UCB值 + + # 反向传播算法 + # 对路径上的所有访问次数+1,value值加reward + def backpropagate(self, search_path, value, to_play, min_max_stats): # MCTS反向传播,visit count加1 + """ + At the end of a simulation, we propagate the evaluation all the way up the tree + to the root. + """ + if len(self.config.players) == 1: + for node in reversed(search_path): + node.value_sum += value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * node.value()) + + value = node.reward + self.config.discount * value + + elif len(self.config.players) == 2: + for node in reversed(search_path): + node.value_sum += value if node.to_play == to_play else -value + node.visit_count += 1 + min_max_stats.update(node.reward + self.config.discount * -node.value()) + + value = ( + -node.reward if node.to_play == to_play else node.reward + ) + self.config.discount * value + + else: + raise NotImplementedError("More than two player mode not implemented.") + +config = MuZeroConfig() +game = Game(config.seed) + +game_history = GameHistory() + +observation = game.reset() + +game_history.action_history.append(0) +game_history.observation_history.append(observation) # 添加reset之后的observation +game_history.reward_history.append(0) +game_history.to_play_history.append(game.to_play()) + +stacked_observations = game_history.get_stacked_observations( -1, config.stacked_observations, len(config.action_space)) + +done = False + +model = models.MuZeroNetwork(config) + +root, mcts_info = MCTS1(config).run(model, stacked_observations, game.legal_actions(), game.to_play(), True) + +print(root) + +game.close() \ No newline at end of file diff --git a/test/muzero_config_test.py b/test/muzero_config_test.py new file mode 100644 index 00000000..1b5fc135 --- /dev/null +++ b/test/muzero_config_test.py @@ -0,0 +1,6 @@ +from games.simple_grid import MuZeroConfig + +if __name__ == "__main__": + config = MuZeroConfig() + config.results_path /= "config_test" + print(config.results_path) \ No newline at end of file diff --git a/test/ray_test.py b/test/ray_test.py new file mode 100644 index 00000000..7d7f0cf6 --- /dev/null +++ b/test/ray_test.py @@ -0,0 +1,20 @@ +import ray +import time + +ray.init() + +@ray.remote +def hello(): + return "Hello world!" 
+ + object_id = hello.remote() + + hello = ray.get(object_id) + + print(hello) + + # time.sleep(100) + results_ids = [ray.put(i) for i in range(10)] + print(ray.get(results_ids)) + + ray.shutdown() \ No newline at end of file diff --git a/trainer.py b/trainer.py index faa5f941..849beaa2 100644 --- a/trainer.py +++ b/trainer.py @@ -66,7 +66,7 @@ def continuous_update_weights(self, replay_buffer, shared_storage): next_batch = replay_buffer.get_batch.remote() # Training loop while self.training_step < self.config.training_steps and not ray.get( - shared_storage.get_info.remote("terminate") + shared_storage.get_info.remote("terminate") # terminate records whether the replay buffer and the other workers have stopped; it is independent of the game state ): index_batch, batch = ray.get(next_batch) next_batch = replay_buffer.get_batch.remote() @@ -117,7 +117,7 @@ def continuous_update_weights(self, replay_buffer, shared_storage): ) > self.config.ratio and self.training_step < self.config.training_steps - and not ray.get(shared_storage.get_info.remote("terminate")) + and not ray.get(shared_storage.get_info.remote("terminate")) # terminate records whether the replay buffer and the other workers have stopped; it is independent of the game state ): time.sleep(0.5) @@ -279,7 +279,7 @@ def update_lr(self): lr = self.config.lr_init * self.config.lr_decay_rate ** ( self.training_step / self.config.lr_decay_steps ) - for param_group in self.optimizer.param_groups: + for param_group in self.optimizer.param_groups: # apply the updated learning rate to the optimizer param_group["lr"] = lr @staticmethod