Question about Actor's outputs and selected actions #1216

Open · 4 of 5 tasks
flammmes opened this issue Sep 19, 2024 · 0 comments

flammmes commented Sep 19, 2024

  • I have marked all applicable categories:
    • RL algorithm bug
  • I have visited the source website
  • I have searched through the issue tracker for duplicates
  • I have mentioned version numbers, operating system and environment, where applicable:
    import tianshou, gymnasium as gym, torch, numpy, sys
    print(tianshou.__version__, gym.__version__, torch.__version__, numpy.__version__, sys.version, sys.platform)
0.5.1 1.0.0a2 2.2.2+cu121 1.26.2 3.10.12 (main, Mar 22 2024, 16:50:05) [GCC 11.4.0] linux
 

Hello,
I am trying to set up an actor network whose forward pass returns (mu, sigma), state:

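# Note: this snippet assumes `import torch`, `import torch.nn as nn`,
# my custom ThreeDCNN module, and a `device` variable defined earlier in the script.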
class Actor(nn.Module):
    def __init__(self, state_shape, action_shape, max_action,min_action, device=device):
        super().__init__()
        self.device = device
        self.cnn = ThreeDCNN().to(self.device)  # Our 3D CNN
        self.well_conv = nn.Conv1d(
            in_channels=state_shape['well_observations'][1],  # Number of features (15)
            out_channels=32,  # You can adjust this
            kernel_size=3, 
            padding=1
        ).to(self.device)        
        # Calculate the size of the flattened 3D CNN output
        with torch.no_grad():
            sample_input = torch.randn(1, 2, 4, 163, 120).to(self.device)  # Example input shape
            cnn_output_size = self.cnn(sample_input).view(1, -1).size(1)
            sample_well_obs = torch.randn(1, *state_shape['well_observations']).to(self.device)
            well_cnn_output_size = self.well_conv(sample_well_obs.permute(0, 2, 1)).view(1, -1).size(1)

        # Combine the CNN output size with the sizes of other observations
        combined_size = cnn_output_size + well_cnn_output_size

        self.fc1 = nn.Linear(combined_size, 128).to(self.device) 
        self.fc2 = nn.Linear(128, 128).to(self.device)
        self.fc_mu = nn.Linear(128, action_shape[0]).to(self.device)
        self.fc_std = nn.Linear(128, action_shape[0]).to(self.device)
        nn.init.xavier_uniform_(self.fc_mu.weight)
        nn.init.xavier_uniform_(self.fc_std.weight)
        self.max_action = max_action
        self.min_action = min_action
        
        #self.sigma_param = nn.Parameter(torch.ones(action_shape[0], 1))*0.1

    def forward(self, obs, state=None, info={}):

        image = torch.from_numpy(obs['res_state']).to(self.device)
        image_features = self.cnn(image)
        well_obs = torch.from_numpy(obs['well_observations']).to(self.device)
        well_obs = well_obs.permute(0, 2, 1).float()  # Change the shape to [batch_size, num_features, num_wells]
        well_features = torch.relu(self.well_conv(well_obs))
        well_features = well_features.view(well_features.size(0), -1)
        # Flatten the CNN output and combine with other observations
        #This should change
        combined_obs  = torch.cat([image_features,well_features], dim=1)

        x = torch.relu(self.fc1(combined_obs))
        logits = torch.relu(self.fc2(x))

        # Compute the action mean (mu) and standard deviation (sigma)
        mu = self.fc_mu(logits)
        sigma = self.fc_std(logits)
        max_action_tensor = torch.from_numpy(self.max_action).to(self.device).float()
        min_action_tensor = torch.from_numpy(self.min_action).to(self.device).float()
        action_range = (max_action_tensor - min_action_tensor) / 2
        action_midpoint = (max_action_tensor + min_action_tensor) / 2

        print(max_action_tensor, min_action_tensor)
        print(max_action_tensor.shape, min_action_tensor.shape)
        print(action_range, action_midpoint)

        # Bound the action mean with tanh and rescale it into [min_action, max_action]
        mu = torch.tanh(mu) * action_range + action_midpoint
        # Clamp the log-std and exponentiate so sigma is always positive
        sigma = torch.clamp(sigma, min=-3, max=1).exp()

        print(mu, sigma)
        # Return the action mean and standard deviation
        return (mu, sigma), state

My issue is that the actions actually received by the environment do not match the actor's outputs at all. For example:

mu = tensor([[ 30.,  30., 100.,   8.,   8.,   8.],
        [ 30.,  30., 100.,   8.,   8.,   8.],
        [ 30.,  30., 100.,   8.,   8.,   8.],
        [ 30.,  30., 100.,   8.,   8.,   8.]]) 
sigma = tensor([[0.0498, 2.7183, 2.7183, 2.7183, 0.0498, 0.0498],
        [0.0498, 2.7183, 2.7183, 2.7183, 0.0498, 0.0498],
        [0.0498, 2.7183, 2.7183, 2.7183, 0.0498, 0.0498],
        [0.0498, 2.7183, 2.7183, 2.7183, 0.0498, 0.0498]])

Received action: [100.       100.       100.         7.999999   7.999999   7.999999]
Received action: [100.       100.       100.         7.999999   7.999999   7.999999]
Received action: [100.       100.       100.         7.999999   7.999999   7.999999]
Received action: [100.       100.       100.         7.999999   7.999999   7.999999]

If those same values are passed directly to torch's Normal distribution, the sampled output looks fine:

dist = torch.distributions.normal.Normal(mu, sigma)
samples = dist.sample()
tensor([[ 30.0007,  32.4612, 107.4354,   9.7134,   7.9884,   8.0492],
        [ 29.9397,  30.2262, 103.6806,  10.6415,   8.0678,   7.8900],
        [ 30.0626,  29.4189,  96.5598,   6.5364,   7.9801,   8.1219],
        [ 30.0591,  33.8789,  94.8604,  10.9181,   7.9676,   8.0026]])
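
My current guess (I may be misreading the source) is that SACPolicy applies its own tanh squashing to the sampled action, and the Collector then rescales the result from [-1, 1] into the action-space bounds via map_action. If that is the case, a mu that is already in environment units (30, 100, ...) saturates the tanh, and the rescaled action collapses to the upper bound. A small standalone sketch of that post-processing, using placeholder low/high bounds, reproduces the received actions above:

import torch

# Placeholder bounds for illustration only; my real min_action/max_action differ per dim.
low = torch.tensor([1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
high = torch.tensor([100.0, 100.0, 100.0, 8.0, 8.0, 8.0])

mu = torch.tensor([30.0, 30.0, 100.0, 8.0, 8.0, 8.0])
sigma = torch.tensor([0.05, 2.72, 2.72, 2.72, 0.05, 0.05])

raw = torch.distributions.Normal(mu, sigma).sample()     # what the policy samples
squashed = torch.tanh(raw)                                # saturates to ~1 for values >> 1
env_action = low + (high - low) * (squashed + 1.0) / 2.0  # [-1, 1] -> [low, high] rescaling
print(env_action)  # pinned at (or very close to) `high`, just like the received actions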

These examples are from the very first stage, before any training, when the policy is evaluated on the test envs for the first time.

Here is the rest of the code, with the training parameters I selected:

net_a = Actor(state_shape, action_shape, max_action,min_action, device=device)
actor_optim = torch.optim.Adam(net_a.parameters(), lr=1e-3)

net_c1 = Critic(state_shape, action_shape, hidden_sizes, device=device)
critic1_optim = torch.optim.Adam(net_c1.parameters(), lr=1e-3)

net_c2 = Critic(state_shape, action_shape, hidden_sizes, device=device)
critic2_optim = torch.optim.Adam(net_c2.parameters(), lr=1e-3)

policy = SACPolicy(
    actor=net_a,
    actor_optim=actor_optim,
    critic1=net_c1,
    critic1_optim=critic1_optim,
    critic2=net_c2,
    critic2_optim=critic2_optim,
    tau=0.005,  # Target network update rate
    gamma=0.99,  # Discount factor
    alpha=0.2,   # Temperature parameter
    estimation_step=1,  # Number of TD steps for Q-value estimation
    action_space=env.action_space
)

buffer = VectorReplayBuffer(20000, len(train_envs))
train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
test_collector = Collector(policy, test_envs)

def stop_fn(mean_rewards: float) -> bool:
    return train_collector.total_episodes >= 20000  # Stop after 20,000 episodes
logdir = "log"
log_path = os.path.join(logdir, "sac")

def save_best_fn(policy: BasePolicy) -> None:
    torch.save(policy.state_dict(), os.path.join(log_path, "policy.pth"))

result = OffpolicyTrainer(
    policy=policy, 
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=10,  # Adjust training parameters as needed
    step_per_epoch=1000,
    step_per_collect=10,
    episode_per_test=100,
    batch_size=64,
    stop_fn=stop_fn,
    logger=TensorboardLogger(SummaryWriter(log_path))
).run()
assert stop_fn(result.best_reward)
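
To narrow down where the mismatch is introduced, I can also query the policy directly and compare its raw output with the mapped action. This is only a sketch of what I believe the Collector does internally (map_action applied before env.step), and the reset call assumes a Gymnasium-style (obs, info) return:

from tianshou.data import Batch

obs, _ = test_envs.reset()                          # dict observation from my custom envs
with torch.no_grad():
    out = policy(Batch(obs=obs, info={}))           # SACPolicy.forward -> tanh-squashed action
mapped = policy.map_action(out.act.cpu().numpy())   # rescaled action, as sent to the env
print("raw act:", out.act)
print("mapped act:", mapped)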