From 797762381a66c45cdc7e05563ce1bc8750d4862c Mon Sep 17 00:00:00 2001
From: Antonin Raffin
Date: Sun, 28 Aug 2022 17:04:20 +0200
Subject: [PATCH 1/4] Add dropq version of TQC

---
 sb3_contrib/tqc/policies.py | 16 +++++++++++++++-
 sb3_contrib/tqc/tqc.py      | 24 ++++++++++++++----------
 tests/test_run.py           | 11 +++++++++++
 3 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/sb3_contrib/tqc/policies.py b/sb3_contrib/tqc/policies.py
index e8022f9c..141f994b 100644
--- a/sb3_contrib/tqc/policies.py
+++ b/sb3_contrib/tqc/policies.py
@@ -213,6 +213,8 @@ def __init__(
         n_quantiles: int = 25,
         n_critics: int = 2,
         share_features_extractor: bool = False,
+        dropout_rate: float = 0.0,
+        layer_norm: bool = False,
     ):
         super().__init__(
             observation_space,
@@ -230,7 +232,14 @@ def __init__(
         self.quantiles_total = n_quantiles * n_critics
 
         for i in range(n_critics):
-            qf_net = create_mlp(features_dim + action_dim, n_quantiles, net_arch, activation_fn)
+            qf_net = create_mlp(
+                features_dim + action_dim,
+                n_quantiles,
+                net_arch,
+                activation_fn,
+                dropout_rate=dropout_rate,
+                layer_norm=layer_norm,
+            )
             qf_net = nn.Sequential(*qf_net)
             self.add_module(f"qf{i}", qf_net)
             self.q_networks.append(qf_net)
@@ -298,6 +307,9 @@ def __init__(
         n_quantiles: int = 25,
         n_critics: int = 2,
         share_features_extractor: bool = False,
+        # For the critic only
+        dropout_rate: float = 0.0,
+        layer_norm: bool = False,
     ):
         super().__init__(
             observation_space,
@@ -341,6 +353,8 @@ def __init__(
             "n_critics": n_critics,
             "net_arch": critic_arch,
             "share_features_extractor": share_features_extractor,
+            "dropout_rate": dropout_rate,
+            "layer_norm": layer_norm,
         }
         self.critic_kwargs.update(tqc_kwargs)
         self.actor, self.actor_target = None, None
diff --git a/sb3_contrib/tqc/tqc.py b/sb3_contrib/tqc/tqc.py
index 8ec746bc..c47f506a 100644
--- a/sb3_contrib/tqc/tqc.py
+++ b/sb3_contrib/tqc/tqc.py
@@ -197,8 +197,10 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None:
 
         ent_coef_losses, ent_coefs = [], []
         actor_losses, critic_losses = [], []
+        policy_update_delay = gradient_steps
 
         for gradient_step in range(gradient_steps):
+            update_actor = ((gradient_step + 1) % policy_update_delay == 0) or gradient_step == gradient_steps - 1
             # Sample replay buffer
             replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)
 
@@ -216,8 +218,9 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None:
                 # so we don't change it with other losses
                 # see https://github.com/rail-berkeley/softlearning/issues/60
                 ent_coef = th.exp(self.log_ent_coef.detach())
-                ent_coef_loss = -(self.log_ent_coef * (log_prob + self.target_entropy).detach()).mean()
-                ent_coef_losses.append(ent_coef_loss.item())
+                if update_actor:
+                    ent_coef_loss = -(self.log_ent_coef * (log_prob + self.target_entropy).detach()).mean()
+                    ent_coef_losses.append(ent_coef_loss.item())
             else:
                 ent_coef = self.ent_coef_tensor
 
@@ -261,14 +264,15 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None:
             self.critic.optimizer.step()
 
             # Compute actor loss
-            qf_pi = self.critic(replay_data.observations, actions_pi).mean(dim=2).mean(dim=1, keepdim=True)
-            actor_loss = (ent_coef * log_prob - qf_pi).mean()
-            actor_losses.append(actor_loss.item())
-
-            # Optimize the actor
-            self.actor.optimizer.zero_grad()
-            actor_loss.backward()
-            self.actor.optimizer.step()
+            if update_actor:
+                qf_pi = self.critic(replay_data.observations, actions_pi).mean(dim=2).mean(dim=1, keepdim=True)
+                actor_loss = (ent_coef * log_prob - qf_pi).mean()
+                actor_losses.append(actor_loss.item())
+
+                # Optimize the actor
+                self.actor.optimizer.zero_grad()
+                actor_loss.backward()
+                self.actor.optimizer.step()
 
             # Update target networks
             if gradient_step % self.target_update_interval == 0:
diff --git a/tests/test_run.py b/tests/test_run.py
index c9c85847..710007df 100644
--- a/tests/test_run.py
+++ b/tests/test_run.py
@@ -8,6 +8,17 @@
 from sb3_contrib.common.vec_env import AsyncEval
 
 
+def test_dropq():
+    model = TQC(
+        "MlpPolicy",
+        "Pendulum-v1",
+        policy_kwargs=dict(net_arch=[64, 64], layer_norm=True, dropout_rate=0.005),
+        verbose=1,
+        buffer_size=250,
+    )
+    model.learn(total_timesteps=300)
+
+
 @pytest.mark.parametrize("ent_coef", ["auto", 0.01, "auto_0.01"])
 def test_tqc(ent_coef):
     model = TQC(

From 6c7593920dd2178870f8c46c69c1359fadd234cd Mon Sep 17 00:00:00 2001
From: Antonin Raffin
Date: Sun, 28 Aug 2022 17:05:07 +0200
Subject: [PATCH 2/4] Add comment

---
 sb3_contrib/tqc/tqc.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sb3_contrib/tqc/tqc.py b/sb3_contrib/tqc/tqc.py
index c47f506a..7bed6a79 100644
--- a/sb3_contrib/tqc/tqc.py
+++ b/sb3_contrib/tqc/tqc.py
@@ -239,6 +239,7 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None:
                 next_actions, next_log_prob = self.actor.action_log_prob(replay_data.next_observations)
                 # Compute and cut quantiles at the next state
                 # batch x nets x quantiles
+                # Note: in dropq dropout seems to be on for target net too
                 next_quantiles = self.critic_target(replay_data.next_observations, next_actions)
 
                 # Sort and drop top k quantiles to control overestimation.

From 5b627eeb89660a27efdd72e232f254fb59e27924 Mon Sep 17 00:00:00 2001
From: Antonin Raffin
Date: Sun, 28 Aug 2022 18:56:14 +0200
Subject: [PATCH 3/4] Add TODO

---
 sb3_contrib/tqc/tqc.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sb3_contrib/tqc/tqc.py b/sb3_contrib/tqc/tqc.py
index 7bed6a79..535952d2 100644
--- a/sb3_contrib/tqc/tqc.py
+++ b/sb3_contrib/tqc/tqc.py
@@ -197,6 +197,7 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None:
 
         ent_coef_losses, ent_coefs = [], []
         actor_losses, critic_losses = [], []
+        # TODO: properly handle it when train_freq > 1
         policy_update_delay = gradient_steps
 
         for gradient_step in range(gradient_steps):

From e1ca72a2aecedc948a3790567335640314983a9c Mon Sep 17 00:00:00 2001
From: Antonin Raffin
Date: Mon, 29 Aug 2022 10:51:56 +0200
Subject: [PATCH 4/4] Add policy delay

---
 sb3_contrib/tqc/tqc.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/sb3_contrib/tqc/tqc.py b/sb3_contrib/tqc/tqc.py
index 535952d2..65ebdbee 100644
--- a/sb3_contrib/tqc/tqc.py
+++ b/sb3_contrib/tqc/tqc.py
@@ -87,6 +87,7 @@ def __init__(
         replay_buffer_class: Optional[ReplayBuffer] = None,
         replay_buffer_kwargs: Optional[Dict[str, Any]] = None,
         optimize_memory_usage: bool = False,
+        policy_delay: int = 1,
         ent_coef: Union[str, float] = "auto",
         target_update_interval: int = 1,
         target_entropy: Union[str, float] = "auto",
@@ -139,6 +140,7 @@ def __init__(
         self.target_update_interval = target_update_interval
         self.ent_coef_optimizer = None
         self.top_quantiles_to_drop_per_net = top_quantiles_to_drop_per_net
+        self.policy_delay = policy_delay
 
         if _init_setup_model:
             self._setup_model()
@@ -197,11 +199,10 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None:
 
         ent_coef_losses, ent_coefs = [], []
         actor_losses, critic_losses = [], []
-        # TODO: properly handle it when train_freq > 1
-        policy_update_delay = gradient_steps
 
         for gradient_step in range(gradient_steps):
-            update_actor = ((gradient_step + 1) % policy_update_delay == 0) or gradient_step == gradient_steps - 1
+            self._n_updates += 1
+            update_actor = self._n_updates % self.policy_delay == 0
             # Sample replay buffer
             replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)
 
@@ -282,8 +283,6 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None:
                 # Copy running stats, see https://github.com/DLR-RM/stable-baselines3/issues/996
                 polyak_update(self.batch_norm_stats, self.batch_norm_stats_target, 1.0)
 
-        self._n_updates += gradient_steps
-
         self.logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
         self.logger.record("train/ent_coef", np.mean(ent_coefs))
         self.logger.record("train/actor_loss", np.mean(actor_losses))
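
Usage sketch: a minimal, hypothetical example of how the options introduced by these patches could be combined into a DroQ-style configuration, assuming all four patches are applied on top of sb3_contrib. The hyperparameter values below are illustrative assumptions, not values prescribed by the patches.

    from sb3_contrib import TQC

    # DroQ-style TQC: dropout and layer norm in the critic (patch 1),
    # many gradient steps per rollout with a matching policy delay (patch 4).
    model = TQC(
        "MlpPolicy",
        "Pendulum-v1",
        policy_kwargs=dict(net_arch=[256, 256], dropout_rate=0.01, layer_norm=True),
        gradient_steps=20,  # high update-to-data ratio
        policy_delay=20,  # one actor update per 20 critic updates
        learning_starts=100,
        verbose=1,
    )
    model.learn(total_timesteps=5_000)

With the default train_freq of one environment step, this performs 20 critic updates per environment step while the actor (and the entropy coefficient, which patch 1 gates behind the same update_actor flag) is updated roughly once per environment step.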