Question about Actor's outputs and selected actions #1216

Open · 4 of 5 tasks
flammmes opened this issue Sep 19, 2024 · 0 comments

flammmes commented Sep 19, 2024

  • I have marked all applicable categories:
    • RL algorithm bug
  • I have visited the source website
  • I have searched through the issue tracker for duplicates
  • I have mentioned version numbers, operating system and environment, where applicable:
    import tianshou, gymnasium as gym, torch, numpy, sys
    print(tianshou.__version__, gym.__version__, torch.__version__, numpy.__version__, sys.version, sys.platform)
0.5.1 1.0.0a2 2.2.2+cu121 1.26.2 3.10.12 (main, Mar 22 2024, 16:50:05) [GCC 11.4.0] linux
 

Hello,
I am trying to set up an actor network whose forward pass returns (mu, sigma), state:

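# Note: this snippet assumes `import torch`, `import torch.nn as nn`,
# my custom ThreeDCNN module, and a `device` variable defined earlier in the script.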
class Actor(nn.Module):
    def __init__(self, state_shape, action_shape, max_action,min_action, device=device):
        super().__init__()
        self.device = device
        self.cnn = ThreeDCNN().to(self.device)  # Our 3D CNN
        self.well_conv = nn.Conv1d(
            in_channels=state_shape['well_observations'][1],  # Number of features (15)
            out_channels=32,  # You can adjust this
            kernel_size=3, 
            padding=1
        ).to(self.device)        
        # Calculate the size of the flattened 3D CNN output
        with torch.no_grad():
            sample_input = torch.randn(1, 2, 4, 163, 120).to(self.device)  # Example input shape
            cnn_output_size = self.cnn(sample_input).view(1, -1).size(1)
            sample_well_obs = torch.randn(1, *state_shape['well_observations']).to(self.device)
            well_cnn_output_size = self.well_conv(sample_well_obs.permute(0, 2, 1)).view(1, -1).size(1)

        # Combine the CNN output size with the sizes of other observations
        combined_size = cnn_output_size + well_cnn_output_size

        self.fc1 = nn.Linear(combined_size, 128).to(self.device) 
        self.fc2 = nn.Linear(128, 128).to(self.device)
        self.fc_mu = nn.Linear(128, action_shape[0]).to(self.device)
        self.fc_std = nn.Linear(128, action_shape[0]).to(self.device)
        nn.init.xavier_uniform_(self.fc_mu.weight)
        nn.init.xavier_uniform_(self.fc_std.weight)
        self.max_action = max_action
        self.min_action = min_action
        
        #self.sigma_param = nn.Parameter(torch.ones(action_shape[0], 1))*0.1

    def forward(self, obs, state=None, info={}):

        image = torch.from_numpy(obs['res_state']).to(self.device)
        image_features = self.cnn(image)
        well_obs = torch.from_numpy(obs['well_observations']).to(self.device)
        well_obs = well_obs.permute(0, 2, 1).float()  # Change the shape to [batch_size, num_features, num_wells]
        well_features = torch.relu(self.well_conv(well_obs))
        well_features = well_features.view(well_features.size(0), -1)
        # Flatten the CNN output and combine with other observations
        #This should change
        combined_obs  = torch.cat([image_features,well_features], dim=1)

        x = torch.relu(self.fc1(combined_obs))
        logits = torch.relu(self.fc2(x))

        # Compute the action mean (mu) and standard deviation (sigma)
        mu = self.fc_mu(logits)
        sigma = self.fc_std(logits)
        max_action_tensor = torch.from_numpy(self.max_action).to(self.device).float()
        min_action_tensor = torch.from_numpy(self.min_action).to(self.device).float()
        action_range = (max_action_tensor - min_action_tensor) / 2
        action_midpoint = (max_action_tensor + min_action_tensor) / 2

        print(max_action_tensor, min_action_tensor)
        print(max_action_tensor.shape, min_action_tensor.shape)
        print(action_range, action_midpoint)

        # Bound the action mean with tanh and rescale it into [min_action, max_action]
        mu = torch.tanh(mu) * action_range + action_midpoint
        # Clamp the log-std and exponentiate so sigma is always positive
        sigma = torch.clamp(sigma, min=-3, max=1).exp()

        print(mu, sigma)
        # Return the action mean and standard deviation
        return (mu, sigma), state

My issue is that the actions actually received by the environment do not match the actor's outputs at all. For example:

mu = tensor([[ 30.,  30., 100.,   8.,   8.,   8.],
        [ 30.,  30., 100.,   8.,   8.,   8.],
        [ 30.,  30., 100.,   8.,   8.,   8.],
        [ 30.,  30., 100.,   8.,   8.,   8.]]) 
sigma = tensor([[0.0498, 2.7183, 2.7183, 2.7183, 0.0498, 0.0498],
        [0.0498, 2.7183, 2.7183, 2.7183, 0.0498, 0.0498],
        [0.0498, 2.7183, 2.7183, 2.7183, 0.0498, 0.0498],
        [0.0498, 2.7183, 2.7183, 2.7183, 0.0498, 0.0498]])

Received action: [100.       100.       100.         7.999999   7.999999   7.999999]
Received action: [100.       100.       100.         7.999999   7.999999   7.999999]
Received action: [100.       100.       100.         7.999999   7.999999   7.999999]
Received action: [100.       100.       100.         7.999999   7.999999   7.999999]

If those same values are passed directly to torch's Normal distribution, the sampled output looks fine:

dist = torch.distributions.normal.Normal(mu, sigma)
samples = dist.sample()
tensor([[ 30.0007,  32.4612, 107.4354,   9.7134,   7.9884,   8.0492],
        [ 29.9397,  30.2262, 103.6806,  10.6415,   8.0678,   7.8900],
        [ 30.0626,  29.4189,  96.5598,   6.5364,   7.9801,   8.1219],
        [ 30.0591,  33.8789,  94.8604,  10.9181,   7.9676,   8.0026]])
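
My current guess (I may be misreading the source) is that SACPolicy applies its own tanh squashing to the sampled action, and the Collector then rescales the result from [-1, 1] into the action-space bounds via map_action. If that is the case, a mu that is already in environment units (30, 100, ...) saturates the tanh, and the rescaled action collapses to the upper bound. A small standalone sketch of that post-processing, using placeholder low/high bounds, reproduces the received actions above:

import torch

# Placeholder bounds for illustration only; my real min_action/max_action differ per dim.
low = torch.tensor([1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
high = torch.tensor([100.0, 100.0, 100.0, 8.0, 8.0, 8.0])

mu = torch.tensor([30.0, 30.0, 100.0, 8.0, 8.0, 8.0])
sigma = torch.tensor([0.05, 2.72, 2.72, 2.72, 0.05, 0.05])

raw = torch.distributions.Normal(mu, sigma).sample()     # what the policy samples
squashed = torch.tanh(raw)                                # saturates to ~1 for values >> 1
env_action = low + (high - low) * (squashed + 1.0) / 2.0  # [-1, 1] -> [low, high] rescaling
print(env_action)  # pinned at (or very close to) `high`, just like the received actions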

These examples are from the very first stage, before any training, when the policy is evaluated on the test envs for the first time.

Here is the rest of the code, with the training parameters I selected:

net_a = Actor(state_shape, action_shape, max_action,min_action, device=device)
actor_optim = torch.optim.Adam(net_a.parameters(), lr=1e-3)

net_c1 = Critic(state_shape, action_shape, hidden_sizes, device=device)
critic1_optim = torch.optim.Adam(net_c1.parameters(), lr=1e-3)

net_c2 = Critic(state_shape, action_shape, hidden_sizes, device=device)
critic2_optim = torch.optim.Adam(net_c2.parameters(), lr=1e-3)

policy = SACPolicy(
    actor=net_a,
    actor_optim=actor_optim,
    critic1=net_c1,
    critic1_optim=critic1_optim,
    critic2=net_c2,
    critic2_optim=critic2_optim,
    tau=0.005,  # Target network update rate
    gamma=0.99,  # Discount factor
    alpha=0.2,   # Temperature parameter
    estimation_step=1,  # Number of TD steps for Q-value estimation
    action_space=env.action_space
)

buffer = VectorReplayBuffer(20000, len(train_envs))
train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
test_collector = Collector(policy, test_envs)

def stop_fn(mean_rewards: float) -> bool:
    return train_collector.total_episodes >= 20000  # Stop after 20,000 episodes
logdir = "log"
log_path = os.path.join(logdir, "sac")

def save_best_fn(policy: BasePolicy) -> None:
    torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth"))

result = OffpolicyTrainer(
    policy=policy, 
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=10,  # Adjust training parameters as needed
    step_per_epoch=1000,
    step_per_collect=10,
    episode_per_test=100,
    batch_size=64,
    stop_fn=stop_fn,
    logger=TensorboardLogger(SummaryWriter(log_path))
).run()
assert stop_fn(result.best_reward)
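
To narrow down where the mismatch is introduced, I can also query the policy directly and compare its raw output with the mapped action. This is only a sketch of what I believe the Collector does internally (map_action applied before env.step), and the reset call assumes a Gymnasium-style (obs, info) return:

from tianshou.data import Batch

obs, _ = test_envs.reset()                          # dict observation from my custom envs
with torch.no_grad():
    out = policy(Batch(obs=obs, info={}))           # SACPolicy.forward -> tanh-squashed action
mapped = policy.map_action(out.act.cpu().numpy())   # rescaled action, as sent to the env
print("raw act:", out.act)
print("mapped act:", mapped)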