Recurrent PPO #53

Merged: 60 commits merged into master from feat/ppo-lstm on May 30, 2022
Commits (60)
85c9a50
Running (not working yet) version of recurrent PPO
araffin Nov 22, 2021
b92da74
Fixes for multi envs
araffin Nov 23, 2021
d9f9c4e
Save WIP, rework the sampling
araffin Nov 23, 2021
97ec8ec
Add Box support
araffin Nov 23, 2021
a890976
Fix sample order
araffin Nov 23, 2021
7fecd9f
Begin cleanup, code is broken (again)
araffin Nov 25, 2021
0ddc3f6
First working version (no shared lstm)
araffin Nov 25, 2021
5ef313b
Start cleanup
araffin Nov 25, 2021
c803ac9
Try rnn with value function
araffin Nov 25, 2021
0c8ab15
Re-enable batch size
araffin Nov 25, 2021
eb1e6c1
Deactivate vf rnn
araffin Nov 25, 2021
f013346
Allow any batch size
araffin Nov 25, 2021
a14f2ce
Add support for evaluation
araffin Nov 26, 2021
362dec4
Add CNN support
araffin Nov 26, 2021
5b162db
Fix start of sequence
araffin Nov 26, 2021
954e6dd
Allow shared LSTM
araffin Nov 26, 2021
832093d
Rename mask to episode_start
araffin Nov 28, 2021
2a9c956
Fix type hint
araffin Nov 28, 2021
15c080a
Enable LSTM for critic
araffin Nov 28, 2021
0d304aa
Clean code
araffin Nov 28, 2021
1dc78b4
Fix for CNN LSTM
araffin Nov 28, 2021
deaa7b4
Fix sampling with n_layers > 1
araffin Nov 28, 2021
ced6aee
Add std logger
araffin Nov 29, 2021
b81fdff
Update wording
araffin Nov 30, 2021
a2a201f
Merge branch 'master' into feat/ppo-lstm
araffin Dec 1, 2021
754e0a3
Merge branch 'master' into feat/ppo-lstm
araffin Dec 10, 2021
c9c0b4e
Rename and add dict obs support
araffin Dec 27, 2021
a4b769f
Fixes for dict obs support
araffin Dec 27, 2021
5cadc14
Do not run slow tests
araffin Dec 27, 2021
617d76f
Merge branch 'master' into feat/ppo-lstm
araffin Dec 29, 2021
c1f8812
Fix doc
araffin Dec 29, 2021
579e7d0
Update recurrent PPO example
araffin Dec 29, 2021
bd2d5e2
Update README
araffin Dec 29, 2021
072622b
Merge branch 'master' into feat/ppo-lstm
araffin Jan 3, 2022
c113324
Merge branch 'master' into feat/ppo-lstm
araffin Jan 19, 2022
4adb3ea
Merge branch 'master' into feat/ppo-lstm
araffin Feb 22, 2022
32ec1b2
Merge branch 'master' into feat/ppo-lstm
araffin Feb 23, 2022
0f0ce0b
Use Pendulum-v1 for tests
araffin Feb 23, 2022
116d0a6
Fix image env
araffin Feb 23, 2022
c32bb74
Speedup LSTM forward pass (#63)
Walon1998 Mar 8, 2022
638dfb2
Merge branch 'master' into feat/ppo-lstm
araffin Apr 12, 2022
86e0f6f
Fixes
araffin Apr 12, 2022
3fc6e51
Remove OpenAI sampling and improve coverage
araffin Apr 12, 2022
88f9504
Sync with SB3 PPO
araffin Apr 12, 2022
662f218
Pass state shape and allow lstm kwargs
araffin Apr 12, 2022
fd06850
Update tests
araffin Apr 12, 2022
f5e9b34
Add masking for padded sequences
araffin Apr 12, 2022
1cd27da
Update default in perf test
araffin Apr 12, 2022
c52959b
Remove TODO, mask is now working
araffin Apr 15, 2022
18e6230
Merge branch 'master' into feat/ppo-lstm
araffin Apr 25, 2022
673d23a
Add helper to remove duplicated code, remove hack for padding
araffin May 1, 2022
e271d03
Enable LSTM critic and raise threshold for cartpole with no vel
araffin May 8, 2022
73bb89c
Fix tests
araffin May 8, 2022
efa6181
Update doc and tests
araffin May 18, 2022
564d428
Doc fix
araffin May 18, 2022
408ed24
Fix for new Sphinx version
araffin May 29, 2022
d917487
Merge branch 'master' into feat/ppo-lstm
araffin May 29, 2022
6acb64a
Fix doc note
araffin May 29, 2022
5fd8be7
Switch to batch first, no more additional swap
araffin May 30, 2022
7a1d3e8
Add comments and mask entropy loss
araffin May 30, 2022
5 changes: 3 additions & 2 deletions README.md
@@ -25,11 +25,12 @@ We hope this allows us to provide reliable implementations following stable-base
See documentation for the full list of included features.

**RL Algorithms**:
- [Truncated Quantile Critics (TQC)](https://arxiv.org/abs/2005.04269)
- [Augmented Random Search (ARS)](https://arxiv.org/abs/1803.07055)
- [Quantile Regression DQN (QR-DQN)](https://arxiv.org/abs/1710.10044)
- [PPO with invalid action masking (MaskablePPO)](https://arxiv.org/abs/2006.14171)
- [PPO with recurrent policy (RecurrentPPO)](https://arxiv.org/abs/1707.06347)
- [Truncated Quantile Critics (TQC)](https://arxiv.org/abs/2005.04269)
- [Trust Region Policy Optimization (TRPO)](https://arxiv.org/abs/1502.05477)
- [Augmented Random Search (ARS)](https://arxiv.org/abs/1803.07055)

**Gym Wrappers**:
- [Time Feature Wrapper](https://arxiv.org/abs/1712.00378)
2 changes: 2 additions & 0 deletions docs/guide/algos.rst
@@ -9,7 +9,9 @@ along with some useful characteristics: support for discrete/continuous actions,
Name ``Box`` ``Discrete`` ``MultiDiscrete`` ``MultiBinary`` Multi Processing
============ =========== ============ ================= =============== ================
ARS ✔️ ❌️ ❌ ❌ ✔️
MaskablePPO ❌ ✔️ ✔️ ✔️ ✔️
QR-DQN ️❌ ️✔️ ❌ ❌ ✔️
RecurrentPPO ✔️ ✔️ ✔️ ✔️ ✔️
TQC ✔️ ❌ ❌ ❌ ✔️
TRPO ✔️ ✔️ ✔️ ✔️ ✔️
============ =========== ============ ================= =============== ================
25 changes: 25 additions & 0 deletions docs/guide/examples.rst
@@ -71,3 +71,28 @@ Train an agent using Augmented Random Search (ARS) agent on the Pendulum environ
model = ARS("LinearPolicy", "Pendulum-v0", verbose=1)
model.learn(total_timesteps=10000, log_interval=4)
model.save("ars_pendulum")

RecurrentPPO
------------

Train a PPO agent with a recurrent policy on the CartPole environment.

.. code-block:: python

    import numpy as np

    from sb3_contrib import RecurrentPPO

    model = RecurrentPPO("MlpLstmPolicy", "CartPole-v1", verbose=1)
    model.learn(5000)

    env = model.get_env()
    obs = env.reset()
    # Cell and hidden states of the LSTM
    lstm_states = None
    num_envs = 1
    # Episode start signals are used to reset the LSTM states at episode boundaries
    episode_starts = np.ones((num_envs,), dtype=bool)
    while True:
        action, lstm_states = model.predict(obs, state=lstm_states, episode_start=episode_starts, deterministic=True)
        obs, rewards, dones, infos = env.step(action)
        episode_starts = dones
        env.render()
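
The LSTM architecture itself can be tuned through ``policy_kwargs``. A minimal sketch, assuming the ``lstm_hidden_size`` and ``n_lstm_layers`` keyword arguments exposed by the recurrent policies added in this PR:

.. code-block:: python

    from sb3_contrib import RecurrentPPO

    # Assumed policy kwargs: a wider, two-layer LSTM; other settings left at their defaults.
    model = RecurrentPPO(
        "MlpLstmPolicy",
        "CartPole-v1",
        policy_kwargs=dict(lstm_hidden_size=128, n_lstm_layers=2),
        verbose=1,
    )
    model.learn(5000)
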
1 change: 1 addition & 0 deletions docs/index.rst
@@ -33,6 +33,7 @@ RL Baselines3 Zoo also offers a simple interface to train, evaluate agents and d

modules/ars
modules/ppo_mask
modules/ppo_recurrent
modules/qrdqn
modules/tqc
modules/trpo
23 changes: 23 additions & 0 deletions docs/misc/changelog.rst
@@ -3,6 +3,29 @@
Changelog
==========

Release 1.4.1a0 (WIP)
-------------------------------
**Add Recurrent PPO**

Breaking Changes:
^^^^^^^^^^^^^^^^^

New Features:
^^^^^^^^^^^^^
- Added ``RecurrentPPO``

Bug Fixes:
^^^^^^^^^^

Deprecations:
^^^^^^^^^^^^^

Others:
^^^^^^^

Documentation:
^^^^^^^^^^^^^^


Release 1.4.0 (2022-01-19)
-------------------------------
2 changes: 1 addition & 1 deletion docs/modules/ppo_mask.rst
@@ -5,7 +5,7 @@
Maskable PPO
============

Implementation of `invalid action masking <https://arxiv.org/abs/2006.14171>`_ for the Proximal Policy Optimization(PPO)
Implementation of `invalid action masking <https://arxiv.org/abs/2006.14171>`_ for the Proximal Policy Optimization (PPO)
algorithm. Other than adding support for action masking, the behavior is the same as in SB3's core PPO algorithm.


127 changes: 127 additions & 0 deletions docs/modules/ppo_recurrent.rst
@@ -0,0 +1,127 @@
.. _ppo_recurrent:

.. automodule:: sb3_contrib.ppo_recurrent

Recurrent PPO
=============

Implementation of recurrent policies for the Proximal Policy Optimization (PPO)
algorithm. Other than adding support for recurrent policies (LSTM here), the behavior is the same as in SB3's core PPO algorithm.


.. rubric:: Available Policies

.. autosummary::
    :nosignatures:

    MlpLstmPolicy
    CnnLstmPolicy
    MultiInputLstmPolicy


Notes
-----

- Paper: PPO paper (https://arxiv.org/abs/1707.06347)


Can I use?
----------

- Recurrent policies: ✔️
- Multi processing: ✔️
- Gym spaces:


============= ====== ===========
Space Action Observation
============= ====== ===========
Discrete ✔️ ✔️
Box ✔️ ✔️
MultiDiscrete ✔️ ✔️
MultiBinary ✔️ ✔️
Dict ❌ ✔️
============= ====== ===========
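
``Dict`` observation spaces are handled by ``MultiInputLstmPolicy``. A minimal sketch, assuming Stable-Baselines3's ``SimpleMultiObsEnv`` toy environment (which exposes a ``Dict`` observation with an image key and a vector key):

.. code-block:: python

    from sb3_contrib import RecurrentPPO
    from stable_baselines3.common.envs import SimpleMultiObsEnv

    # SimpleMultiObsEnv returns a Dict observation; each key gets its own feature
    # extractor and the concatenated features are then fed to the LSTM.
    env = SimpleMultiObsEnv(random_start=False)

    model = RecurrentPPO("MultiInputLstmPolicy", env, verbose=1)
    model.learn(5000)
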


Example
-------


.. code-block:: python

    import numpy as np

    from sb3_contrib import RecurrentPPO
    from stable_baselines3.common.evaluation import evaluate_policy

    model = RecurrentPPO("MlpLstmPolicy", "CartPole-v1", verbose=1)
    model.learn(5000)

    env = model.get_env()
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20, warn=False)
    print(mean_reward)

    model.save("ppo_recurrent")
    del model  # remove to demonstrate saving and loading

    model = RecurrentPPO.load("ppo_recurrent")

    obs = env.reset()
    # Cell and hidden states of the LSTM
    lstm_states = None
    num_envs = 1
    # Episode start signals are used to reset the LSTM states at episode boundaries
    episode_starts = np.ones((num_envs,), dtype=bool)
    while True:
        action, lstm_states = model.predict(obs, state=lstm_states, episode_start=episode_starts, deterministic=True)
        obs, rewards, dones, infos = env.step(action)
        episode_starts = dones
        env.render()
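
Recurrent policies pay off mainly in partially observable settings. A minimal sketch of such a task, using a hypothetical wrapper that hides the velocity entries of ``CartPole-v1`` (similar in spirit to the masked-velocity CartPole used in the tests for this PR):

.. code-block:: python

    import gym
    import numpy as np

    from sb3_contrib import RecurrentPPO


    class MaskVelocityWrapper(gym.ObservationWrapper):
        """Hypothetical wrapper: zero out cart and pole velocities so that only
        positions are observed, making CartPole partially observable."""

        def __init__(self, env: gym.Env):
            super().__init__(env)
            # CartPole observation: [x, x_dot, theta, theta_dot]
            self.mask = np.array([1.0, 0.0, 1.0, 0.0], dtype=np.float32)

        def observation(self, observation: np.ndarray) -> np.ndarray:
            return observation * self.mask


    env = MaskVelocityWrapper(gym.make("CartPole-v1"))
    # Without velocities, the LSTM has to infer them from the observation history.
    model = RecurrentPPO("MlpLstmPolicy", env, verbose=1)
    model.learn(5000)
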



Results
-------

How to replicate the results?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Clone the repo for the experiment:

.. code-block:: bash

    git clone https://github.com/DLR-RM/rl-baselines3-zoo
    cd rl-baselines3-zoo
    git checkout feat/recurrent-ppo

Parameters
----------

.. autoclass:: RecurrentPPO
    :members:
    :inherited-members:


RecurrentPPO Policies
---------------------

.. autoclass:: MlpLstmPolicy
    :members:
    :inherited-members:

.. autoclass:: sb3_contrib.common.recurrent.policies.RecurrentActorCriticPolicy
    :members:
    :noindex:

.. autoclass:: CnnLstmPolicy
    :members:

.. autoclass:: sb3_contrib.common.recurrent.policies.RecurrentActorCriticCnnPolicy
    :members:
    :noindex:

.. autoclass:: MultiInputLstmPolicy
    :members:

.. autoclass:: sb3_contrib.common.recurrent.policies.RecurrentMultiInputActorCriticPolicy
    :members:
    :noindex:
1 change: 1 addition & 0 deletions sb3_contrib/__init__.py
@@ -2,6 +2,7 @@

from sb3_contrib.ars import ARS
from sb3_contrib.ppo_mask import MaskablePPO
from sb3_contrib.ppo_recurrent import RecurrentPPO
from sb3_contrib.qrdqn import QRDQN
from sb3_contrib.tqc import TQC
from sb3_contrib.trpo import TRPO
10 changes: 5 additions & 5 deletions sb3_contrib/common/maskable/policies.py
@@ -215,12 +215,12 @@ def predict(
action_masks: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, Optional[Tuple[np.ndarray, ...]]]:
"""
Get the policy action and state from an observation (and optional state).
Get the policy action from an observation (and optional hidden state).
Includes sugar-coating to handle different observations (e.g. normalizing images).

:param observation: the input observation
:param state: The last states (can be None, used in recurrent policies)
:param mask: The last masks (can be None, used in recurrent policies)
:param episode_start: The last masks (can be None, used in recurrent policies)
:param deterministic: Whether or not to return deterministic actions.
:param action_masks: Action masks to apply to the action distribution
:return: the model's action and the next state
@@ -229,8 +229,8 @@
# TODO (GH/1): add support for RNN policies
# if state is None:
# state = self.initial_state
# if mask is None:
# mask = [False for _ in range(self.n_envs)]
# if episode_start is None:
# episode_start = [False for _ in range(self.n_envs)]

# Switch to eval mode (this affects batch norm / dropout)
self.set_training_mode(False)
@@ -256,7 +256,7 @@
raise ValueError("Error: The environment must be vectorized when using recurrent policies.")
actions = actions[0]

return actions, state
return actions, None

def evaluate_actions(
self,
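
For reference, ``MaskablePPO`` itself keeps no recurrent hidden state, which is why ``predict`` now returns ``None`` as the state. A minimal sketch of calling ``predict`` with action masks, assuming the ``InvalidActionEnvDiscrete`` toy environment and the ``get_action_masks`` helper shipped with sb3_contrib:

.. code-block:: python

    from sb3_contrib import MaskablePPO
    from sb3_contrib.common.envs import InvalidActionEnvDiscrete
    from sb3_contrib.common.maskable.utils import get_action_masks

    env = InvalidActionEnvDiscrete(dim=80, n_invalid_actions=60)
    model = MaskablePPO("MlpPolicy", env, verbose=1)
    model.learn(5000)

    obs = env.reset()
    # Query the environment for the current action mask and pass it to predict;
    # the returned state is always None since the policy is not recurrent.
    action_masks = get_action_masks(env)
    action, _state = model.predict(obs, action_masks=action_masks, deterministic=True)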