fix: fixed ray's error 'No module named aiohttp.signals' #218

Open · wants to merge 9 commits into base: master

4 changes: 3 additions & 1 deletion .gitignore
@@ -90,4 +90,6 @@ venv.bak/
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
dmypy.json

results/
392 changes: 392 additions & 0 deletions game_tournament.py

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions games/simple_grid.py
@@ -23,6 +23,8 @@ def __init__(self):
self.players = list(range(1)) # List of players. You should only edit the length
self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation

self.action_replace = True

# Evaluate
self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second)
self.opponent = None # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class
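The new `action_replace` flag is only declared in the game configs in this PR (True here, False in games/tictactoe.py below); it is presumably consumed by the new game_tournament.py, which is not rendered above. A minimal, hypothetical sketch of how such a flag could be used — the `candidate_actions`, `legal_actions`, and `played` names are illustrative and not taken from this PR:

```python
# Hypothetical helper (not part of this PR): restrict the candidate actions
# when the config says actions may not be repeated (action_replace = False).
def candidate_actions(legal_actions, played, action_replace):
    """Return the actions a player may still choose from this turn."""
    if action_replace:
        # Actions may be chosen again, e.g. moving around a grid world.
        return list(legal_actions)
    # Each action may only be taken once, e.g. claiming a square in tic-tac-toe.
    return [a for a in legal_actions if a not in played]


print(candidate_actions(range(9), played={0, 4}, action_replace=False))
# -> [1, 2, 3, 5, 6, 7, 8]
```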
24 changes: 17 additions & 7 deletions games/tictactoe.py
@@ -27,7 +27,8 @@ def __init__(self):
self.muzero_player = 0 # Turn Muzero begins to play (0: MuZero plays first, 1: MuZero plays second)
self.opponent = "expert" # Hard coded agent that MuZero faces to assess his progress in multiplayer games. It doesn't influence training. None, "random" or "expert" if implemented in the Game class


# Whether an action can be repeated
self.action_replace = False

### Self-Play
self.num_workers = 1 # Number of simultaneous threads/workers self-playing to feed the replay buffer
@@ -48,7 +49,8 @@ def __init__(self):


### Network
self.network = "resnet" # "resnet" / "fullyconnected"
# self.network = "resnet" # "resnet" / "fullyconnected"
self.network = "fullyconnected"
self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size. Choose it so that support_size <= sqrt(max(abs(discounted reward)))

# Residual Network
@@ -63,19 +65,27 @@ def __init__(self):
self.resnet_fc_policy_layers = [8] # Define the hidden layers in the policy head of the prediction network

# Fully Connected Network
# self.encoding_size = 32
# self.fc_representation_layers = [] # Define the hidden layers in the representation network
# self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network
# self.fc_reward_layers = [16] # Define the hidden layers in the reward network
# self.fc_value_layers = [] # Define the hidden layers in the value network
# self.fc_policy_layers = [] # Define the hidden layers in the policy network

self.encoding_size = 32
self.fc_representation_layers = [] # Define the hidden layers in the representation network
self.fc_representation_layers = [16] # Define the hidden layers in the representation network
self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network
self.fc_reward_layers = [16] # Define the hidden layers in the reward network
self.fc_value_layers = [] # Define the hidden layers in the value network
self.fc_policy_layers = [] # Define the hidden layers in the policy network

self.fc_value_layers = [16] # Define the hidden layers in the value network
self.fc_policy_layers = [16]


### Training
self.results_path = pathlib.Path(__file__).resolve().parents[1] / "results" / pathlib.Path(__file__).stem / datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S") # Path to store the model weights and TensorBoard logs
self.save_model = True # Save the checkpoint in results_path as model.checkpoint
self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch)
# self.training_steps = 1000000 # Total number of training steps (ie weights update according to a batch)
# self.training_steps = 50000
self.training_steps = 500000
self.batch_size = 64 # Number of parts of games to train on at each training step
self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing
self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
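For orientation, the fully connected settings above feed the `mlp` builder shown later in this diff (models.py). Below is a small sketch of the layer stacks that configuration implies, assuming the standard tic-tac-toe observation of 3x3x3 = 27 inputs with no stacked observations, 9 actions, and support_size = 10 (those sizes are assumptions from the usual game config, not changes made by this PR):

```python
import torch

def mlp(input_size, layer_sizes, output_size,
        activation=torch.nn.ELU, output_activation=torch.nn.Identity):
    # Same pattern as models.py: Linear layers, an activation after every layer,
    # and output_activation after the last one.
    sizes = [input_size] + layer_sizes + [output_size]
    layers = []
    for i in range(len(sizes) - 1):
        act = activation if i < len(sizes) - 2 else output_activation
        layers += [torch.nn.Linear(sizes[i], sizes[i + 1]), act()]
    return torch.nn.Sequential(*layers)

encoding_size, action_space, full_support = 32, 9, 2 * 10 + 1
representation = mlp(27, [16], encoding_size)                            # observation -> hidden state
dynamics_state = mlp(encoding_size + action_space, [16], encoding_size)  # (state, action) -> next state
reward_head = mlp(encoding_size, [16], full_support)                     # next state -> reward distribution
value_head = mlp(encoding_size, [16], full_support)                      # state -> value distribution
policy_head = mlp(encoding_size, [16], action_space)                     # state -> policy logits
print(representation)  # Linear(27, 16) -> ELU -> Linear(16, 32) -> Identity
```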
47 changes: 27 additions & 20 deletions models.py
@@ -94,6 +94,7 @@ def __init__(
super().__init__()
self.action_space_size = action_space_size
self.full_support_size = 2 * support_size + 1
# support_size defines the value range [-support_size, support_size]; the +1 is so the support covers all 2 * support_size + 1 integers in that range

self.representation_network = torch.nn.DataParallel(
mlp(
@@ -107,6 +108,7 @@ def __init__(
)
)

# The dynamics network's input is encoding_size + action_space_size (encoded state concatenated with the one-hot action)
self.dynamics_encoded_state_network = torch.nn.DataParallel(
mlp(
encoding_size + self.action_space_size,
@@ -115,14 +117,14 @@ def __init__(
)
)
self.dynamics_reward_network = torch.nn.DataParallel(
mlp(encoding_size, fc_reward_layers, self.full_support_size)
mlp(encoding_size, fc_reward_layers, self.full_support_size) # output size is full_support_size, since rewards are encoded over [-support_size, support_size]
)

self.prediction_policy_network = torch.nn.DataParallel(
mlp(encoding_size, fc_policy_layers, self.action_space_size)
mlp(encoding_size, fc_policy_layers, self.action_space_size) # outputs the policy logits over the actions
)
self.prediction_value_network = torch.nn.DataParallel(
mlp(encoding_size, fc_value_layers, self.full_support_size)
mlp(encoding_size, fc_value_layers, self.full_support_size) # output size is full_support_size, since values are encoded over [-support_size, support_size]
)

def prediction(self, encoded_state):
@@ -134,35 +136,39 @@ def representation(self, observation):
encoded_state = self.representation_network(
observation.view(observation.shape[0], -1)
)

# Min-max normalization
# Scale encoded state between [0, 1] (See appendix paper Training)
min_encoded_state = encoded_state.min(1, keepdim=True)[0]
max_encoded_state = encoded_state.max(1, keepdim=True)[0]
scale_encoded_state = max_encoded_state - min_encoded_state
scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5
scale_encoded_state[scale_encoded_state < 1e-5] += 1e-5 # avoid division by zero, which would produce NaN
encoded_state_normalized = (
encoded_state - min_encoded_state
) / scale_encoded_state
return encoded_state_normalized

# dynamics differs from representation only in that it takes the encoded state concatenated with the action as input; representation does not involve an action
def dynamics(self, encoded_state, action):
# Stack encoded_state with a game specific one hot encoded action (See paper appendix Network Architecture)
action_one_hot = (
torch.zeros((action.shape[0], self.action_space_size))
.to(action.device)
.float()
)
action_one_hot.scatter_(1, action.long(), 1.0)
action_one_hot.scatter_(1, action.long(), 1.0) # set the entry at the action index to 1
x = torch.cat((encoded_state, action_one_hot), dim=1)

next_encoded_state = self.dynamics_encoded_state_network(x)

reward = self.dynamics_reward_network(next_encoded_state)

# Min-max normalization
# Scale encoded state between [0, 1] (See paper appendix Training)
min_next_encoded_state = next_encoded_state.min(1, keepdim=True)[0]
max_next_encoded_state = next_encoded_state.max(1, keepdim=True)[0]
scale_next_encoded_state = max_next_encoded_state - min_next_encoded_state
scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5
scale_next_encoded_state[scale_next_encoded_state < 1e-5] += 1e-5 # avoid division by zero, which would produce NaN
next_encoded_state_normalized = (
next_encoded_state - min_next_encoded_state
) / scale_next_encoded_state
@@ -172,7 +178,7 @@ def dynamics(self, encoded_state, action):
def initial_inference(self, observation):
encoded_state = self.representation(observation)
policy_logits, value = self.prediction(encoded_state)
# reward equal to 0 for consistency
# reward equal to 0 for consistency
reward = torch.log(
(
torch.zeros(1, self.full_support_size)
Expand All @@ -181,6 +187,7 @@ def initial_inference(self, observation):
.to(observation.device)
)
)
# reward looks like [[0, 0, ..., 0, 1, 0, ..., 0, 0], ...]: a 1 in the middle bin, 0 elsewhere, repeated once per observation row

return (
value,
@@ -605,8 +612,8 @@ def initial_inference(self, observation):
reward = torch.log(
(
torch.zeros(1, self.full_support_size)
.scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0)
.repeat(len(observation), 1)
.scatter(1, torch.tensor([[self.full_support_size // 2]]).long(), 1.0) # put a 1 in the middle bin (index support_size)
.repeat(len(observation), 1) # repeat per observation so the reward has the same batch dimension as the observation
.to(observation.device)
)
)
@@ -637,29 +644,29 @@ def mlp(
sizes = [input_size] + layer_sizes + [output_size]
layers = []
for i in range(len(sizes) - 1):
act = activation if i < len(sizes) - 2 else output_activation
act = activation if i < len(sizes) - 2 else output_activation # every layer uses activation except the last, which uses output_activation
layers += [torch.nn.Linear(sizes[i], sizes[i + 1]), act()]
return torch.nn.Sequential(*layers)


def support_to_scalar(logits, support_size):
def support_to_scalar(logits, support_size): # logits: categorical value logits; support_size: half-width of the decoded range
"""
Transform a categorical representation to a scalar
See paper appendix Network Architecture
"""
# Decode to a scalar
probabilities = torch.softmax(logits, dim=1)
probabilities = torch.softmax(logits, dim=1) # softmax makes each row sum to 1; shape [stacked_size, full_support_size]
support = (
torch.tensor([x for x in range(-support_size, support_size + 1)])
torch.tensor([x for x in range(-support_size, support_size + 1)]) # support values from -support_size to support_size inclusive
.expand(probabilities.shape)
.float()
.to(device=probabilities.device)
)
x = torch.sum(support * probabilities, dim=1, keepdim=True)
) # shape [stacked_size, full_support_size]
x = torch.sum(support * probabilities, dim=1, keepdim=True) # expected value over the support; keepdim=True gives shape [stacked_size, 1] rather than [stacked_size]

# Invert the scaling (defined in https://arxiv.org/abs/1805.11593)
x = torch.sign(x) * (
((torch.sqrt(1 + 4 * 0.001 * (torch.abs(x) + 1 + 0.001)) - 1) / (2 * 0.001))
x = torch.sign(x) * ( # sign(x) is -1 for x < 0, 1 for x > 0, 0 for x == 0; it restores the sign of x
((torch.sqrt(1 + 4 * 0.001 * (torch.abs(x) + 1 + 0.001)) - 1) / (2 * 0.001)) # (sqrt(1 + 0.004 * (|x| + 1.001)) - 1) / 0.002
** 2
- 1
)
@@ -675,9 +682,9 @@ def scalar_to_support(x, support_size):
x = torch.sign(x) * (torch.sqrt(torch.abs(x) + 1) - 1) + 0.001 * x

# Encode on a vector
x = torch.clamp(x, -support_size, support_size)
floor = x.floor()
prob = x - floor
x = torch.clamp(x, -support_size, support_size) # clamp x into [-support_size, support_size]
floor = x.floor() # round down to the nearest integer (ceil would round up)
prob = x - floor # keep the fractional part, since support_to_scalar multiplies each bin index by its probability
logits = torch.zeros(x.shape[0], x.shape[1], 2 * support_size + 1).to(x.device)
logits.scatter_(
2, (floor + support_size).long().unsqueeze(-1), (1 - prob).unsqueeze(-1)
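Several of the new comments document support_to_scalar / scalar_to_support. The following standalone sketch re-implements that categorical value encoding and checks the round trip; the tail of scalar_to_support is not visible in the hunk above, so the version below is a reconstruction that may differ in detail from the repository's code:

```python
import torch

def support_to_scalar(logits, support_size):
    # Expected value over the integer support [-support_size, support_size],
    # then invert the scaling h(x) = sign(x) * (sqrt(|x| + 1) - 1) + 0.001 * x.
    probabilities = torch.softmax(logits, dim=1)
    support = (
        torch.arange(-support_size, support_size + 1)
        .expand(probabilities.shape)
        .float()
    )
    x = torch.sum(support * probabilities, dim=1, keepdim=True)
    x = torch.sign(x) * (
        ((torch.sqrt(1 + 4 * 0.001 * (torch.abs(x) + 1 + 0.001)) - 1) / (2 * 0.001)) ** 2
        - 1
    )
    return x

def scalar_to_support(x, support_size):
    # Scale, clamp into [-support_size, support_size], then spread each value
    # over its two neighbouring integer bins according to the fractional part.
    x = torch.sign(x) * (torch.sqrt(torch.abs(x) + 1) - 1) + 0.001 * x
    x = torch.clamp(x, -support_size, support_size)
    floor = x.floor()
    prob = x - floor
    logits = torch.zeros(x.shape[0], x.shape[1], 2 * support_size + 1)
    logits.scatter_(2, (floor + support_size).long().unsqueeze(-1), (1 - prob).unsqueeze(-1))
    indexes = floor + support_size + 1
    prob = prob.masked_fill_(2 * support_size < indexes, 0.0)        # upper bin out of range
    indexes = indexes.masked_fill_(2 * support_size < indexes, 0.0)  # redirect harmlessly to bin 0
    logits.scatter_(2, indexes.long().unsqueeze(-1), prob.unsqueeze(-1))
    return logits

# Round trip: encode a scalar, then decode it back (approximately).
value = torch.tensor([[3.7]])
encoded = scalar_to_support(value, support_size=10)                  # shape [1, 1, 21]
decoded = support_to_scalar(torch.log(encoded[0] + 1e-12), support_size=10)
print(value.item(), decoded.item())                                  # 3.7 vs. roughly 3.7
```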
4 changes: 4 additions & 0 deletions muzero.py
@@ -43,6 +43,7 @@ def __init__(self, game_name, config=None, split_resources_in=1):
# Load the game and the config from the module with the game name
try:
game_module = importlib.import_module("games." + game_name)
print("games." + game_name)
self.Game = game_module.Game
self.config = game_module.MuZeroConfig()
except ModuleNotFoundError as err:
@@ -671,7 +672,10 @@ def load_model_menu(muzero, game_name):
choice = input("Invalid input, enter a number listed above: ")
choice = int(choice)
if choice == 0:
start_time = time.time()
muzero.train()
end_time = time.time()
print("耗时: {:.2f}秒".format(end_time - start_time))
elif choice == 1:
load_model_menu(muzero, game_name)
elif choice == 2:
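A brief aside on the timing change above: time.time() measures wall-clock time and can jump if the system clock is adjusted, so a monotonic clock is usually preferable for pure elapsed-time measurement. A small sketch (the time module is assumed to already be imported in muzero.py; the sleep below is just a placeholder for the long-running train() call):

```python
import time

start = time.perf_counter()   # monotonic, unaffected by system clock adjustments
time.sleep(0.5)               # placeholder for muzero.train()
elapsed = time.perf_counter() - start
print("Elapsed time: {:.2f}s".format(elapsed))
```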