Upgrade Stable-Baselines3 (#68)
* Update train freq

* Upgrade Stable-Baselines3

* Additional fixes

* Bump version

* Upgrade docker

* Pre-process train freq
araffin authored Feb 27, 2021
1 parent 0923702 commit f71d490
Showing 11 changed files with 42 additions and 51 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -1,11 +1,12 @@
## Pre-Release 0.11.0a7 (WIP)
## Pre-Release 0.11.1 (2021-02-27)

### Breaking Changes
- Removed `LinearNormalActionNoise`
- Evaluation is now deterministic by default, except for Atari games
- `sb3_contrib` is now required
- `TimeFeatureWrapper` was moved to the contrib repo
- Replaced old `plot_train.py` script with updated `plot_training_success.py`
- Renamed `n_episodes_rollout` to a `train_freq` tuple to match the latest version of SB3

### New Features
- Added option to choose which `VecEnv` class to use for multiprocessing
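
For scripts that set these options directly, the renamed `train_freq` setting from the breaking changes above maps onto the SB3 API as follows. This is a minimal sketch (not part of the commit), assuming SB3 >= 0.11 and the classic Pendulum-v0 environment:

    from stable_baselines3 import SAC

    # SB3 < 0.11 (old): collect one episode per rollout
    #   model = SAC("MlpPolicy", "Pendulum-v0", n_episodes_rollout=1, train_freq=-1, gradient_steps=-1)
    # SB3 >= 0.11 (new): the same schedule expressed as a (frequency, unit) tuple
    model = SAC("MlpPolicy", "Pendulum-v0", train_freq=(1, "episode"), gradient_steps=-1)
    model.learn(total_timesteps=10_000)
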
23 changes: 11 additions & 12 deletions hyperparams/ddpg.yml
@@ -14,7 +14,7 @@ Pendulum-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -27,7 +27,7 @@ LunarLanderContinuous-v2:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -40,7 +40,7 @@ BipedalWalker-v3:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -54,7 +54,7 @@ BipedalWalkerHardcore-v3:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -69,7 +69,7 @@ HalfCheetahBulletEnv-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -84,7 +84,7 @@ AntBulletEnv-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 7e-4
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -100,7 +100,6 @@ HopperBulletEnv-v0:
noise_std: 0.1
train_freq: 64
gradient_steps: 64
n_episodes_rollout: -1
batch_size: 256
learning_rate: !!float 7e-4
policy_kwargs: "dict(net_arch=[400, 300])"
@@ -116,7 +115,7 @@ Walker2DBulletEnv-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
batch_size: 256
learning_rate: !!float 7e-4
policy_kwargs: "dict(net_arch=[400, 300])"
@@ -133,7 +132,7 @@ HumanoidBulletEnv-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -148,7 +147,7 @@ ReacherBulletEnv-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -163,7 +162,7 @@ InvertedDoublePendulumBulletEnv-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -178,6 +177,6 @@ InvertedPendulumSwingupBulletEnv-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"
7 changes: 3 additions & 4 deletions hyperparams/her.yml
@@ -13,9 +13,9 @@ NeckGoalEnvRelativeSparse-v2:
ent_coef: 'auto'
gamma: 0.99
tau: 0.02
n_episodes_rollout: 1
train_freq: [1, "episode"]
gradient_steps: -1
train_freq: -1

# 10 episodes of warm-up
learning_starts: 1500
use_sde_at_warmup: True
@@ -40,9 +40,8 @@ NeckGoalEnvRelativeDense-v2:
ent_coef: 'auto'
gamma: 0.99
tau: 0.02
n_episodes_rollout: 1
train_freq: [1, "episode"]
gradient_steps: -1
train_freq: -1
# 10 episodes of warm-up
learning_starts: 1500
use_sde_at_warmup: True
11 changes: 4 additions & 7 deletions hyperparams/sac.yml
@@ -16,9 +16,8 @@ NeckEnvRelative-v2:
ent_coef: 'auto'
gamma: 0.99
tau: 0.02
n_episodes_rollout: 1
train_freq: [1, "episode"]
gradient_steps: -1
train_freq: -1
# 10 episodes of warm-up
learning_starts: 3000
use_sde_at_warmup: True
@@ -69,9 +68,8 @@ Pendulum-v0:
policy: 'MlpPolicy'
learning_rate: !!float 1e-3
use_sde: True
n_episodes_rollout: 1
train_freq: [1, "episode"]
gradient_steps: -1
train_freq: -1
policy_kwargs: "dict(log_std_init=-2, net_arch=[64, 64])"

LunarLanderContinuous-v2:
@@ -297,7 +295,7 @@ CarRacing-v0:
gamma: 0.98
tau: 0.02
train_freq: 64
# n_episodes_rollout: 1
# train_freq: [1, "episode"]
gradient_steps: 64
# sde_sample_freq: 64
learning_starts: 1000
@@ -323,8 +321,7 @@ donkey-generated-track-v0:
gamma: 0.99
tau: 0.02
# train_freq: 64
train_freq: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
# gradient_steps: -1
gradient_steps: 64
learning_starts: 500
24 changes: 12 additions & 12 deletions hyperparams/td3.yml
@@ -14,7 +14,7 @@ Pendulum-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -27,7 +27,7 @@ LunarLanderContinuous-v2:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -40,7 +40,7 @@ BipedalWalker-v3:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -54,7 +54,7 @@ BipedalWalkerHardcore-v3:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -69,7 +69,7 @@ HalfCheetahBulletEnv-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -83,7 +83,7 @@ AntBulletEnv-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -97,7 +97,7 @@ HopperBulletEnv-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -111,7 +111,7 @@ Walker2DBulletEnv-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -127,7 +127,7 @@ HumanoidBulletEnv-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -142,7 +142,7 @@ ReacherBulletEnv-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -157,7 +157,7 @@ InvertedDoublePendulumBulletEnv-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

@@ -172,7 +172,7 @@ InvertedPendulumSwingupBulletEnv-v0:
noise_type: 'normal'
noise_std: 0.1
gradient_steps: -1
n_episodes_rollout: 1
train_freq: [1, "episode"]
learning_rate: !!float 1e-3
policy_kwargs: "dict(net_arch=[400, 300])"

1 change: 0 additions & 1 deletion hyperparams/tqc.yml
@@ -19,7 +19,6 @@ Pendulum-v0:
policy: 'MlpPolicy'
learning_rate: !!float 1e-3
use_sde: True
n_episodes_rollout: -1
gradient_steps: 64
train_freq: 64
policy_kwargs: "dict(log_std_init=-2, net_arch=[64, 64])"
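
The tqc.yml entry above keeps the step-based form, where `train_freq` is a plain integer rather than a tuple. As a hedged illustration (not taken from the repo), assuming `sb3-contrib` >= 0.11 is installed as required below, the same Pendulum-v0 settings translate to:

    from sb3_contrib import TQC

    # Step-based schedule from tqc.yml: update every 64 environment steps,
    # doing 64 gradient steps each time; the training budget here is arbitrary.
    model = TQC("MlpPolicy", "Pendulum-v0", train_freq=64, gradient_steps=64,
                learning_rate=1e-3, use_sde=True,
                policy_kwargs=dict(log_std_init=-2, net_arch=[64, 64]))
    model.learn(total_timesteps=20_000)
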
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,4 +1,4 @@
stable-baselines3[extra,tests,docs]>=0.11.0a4
stable-baselines3[extra,tests,docs]>=0.11.1
box2d-py==2.3.8
pybullet
gym-minigrid
@@ -7,4 +7,4 @@ optuna
pytablewriter
seaborn
pyyaml>=5.1
sb3-contrib>=0.11.0a4
sb3-contrib>=0.11.1
2 changes: 1 addition & 1 deletion scripts/build_docker.sh
@@ -3,7 +3,7 @@
PARENT=stablebaselines/stable-baselines3

TAG=stablebaselines/rl-baselines3-zoo
VERSION=0.11.0a4
VERSION=0.11.1

if [[ ${USE_GPU} == "True" ]]; then
PARENT="${PARENT}:${VERSION}"
4 changes: 4 additions & 0 deletions utils/exp_manager.py
@@ -309,6 +309,10 @@ def _preprocess_hyperparams(
hyperparams = self._preprocess_her_model_class(hyperparams)
hyperparams = self._preprocess_schedules(hyperparams)

# Pre-process train_freq
if "train_freq" in hyperparams and isinstance(hyperparams["train_freq"], list):
hyperparams["train_freq"] = tuple(hyperparams["train_freq"])

# Should we overwrite the number of timesteps?
if self.n_timesteps > 0:
if self.verbose:
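
This pre-processing step is needed because YAML has no tuple literal: `train_freq: [1, "episode"]` is parsed as a Python list, while SB3 expects an int or a tuple. A standalone sketch of the same conversion (an assumption-laden example, not the zoo's actual loading code, using PyYAML and the sac.yml file from this repo):

    import yaml

    # Load one environment entry from the zoo's hyperparameter file; safe_load
    # handles the standard !!float tags used in these files.
    with open("hyperparams/sac.yml") as f:
        hyperparams = yaml.safe_load(f)["Pendulum-v0"]

    # YAML lists become Python lists; convert to the tuple form SB3 expects.
    if isinstance(hyperparams.get("train_freq"), list):
        hyperparams["train_freq"] = tuple(hyperparams["train_freq"])

    print(hyperparams["train_freq"])  # (1, "episode") as of this commit
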
12 changes: 2 additions & 10 deletions utils/hyperparams_opt.py
@@ -212,12 +212,10 @@ def sample_td3_params(trial: optuna.Trial) -> Dict[str, Any]:
episodic = trial.suggest_categorical("episodic", [True, False])

if episodic:
n_episodes_rollout = 1
train_freq, gradient_steps = -1, -1
train_freq, gradient_steps = (1, "episode"), -1
else:
train_freq = trial.suggest_categorical("train_freq", [1, 16, 128, 256, 1000, 2000])
gradient_steps = train_freq
n_episodes_rollout = -1

noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None])
noise_std = trial.suggest_uniform("noise_std", 0, 1)
@@ -241,7 +239,6 @@ def sample_td3_params(trial: optuna.Trial) -> Dict[str, Any]:
"buffer_size": buffer_size,
"train_freq": train_freq,
"gradient_steps": gradient_steps,
"n_episodes_rollout": n_episodes_rollout,
"policy_kwargs": dict(net_arch=net_arch),
}

@@ -274,12 +271,10 @@ def sample_ddpg_params(trial: optuna.Trial) -> Dict[str, Any]:
episodic = trial.suggest_categorical("episodic", [True, False])

if episodic:
n_episodes_rollout = 1
train_freq, gradient_steps = -1, -1
train_freq, gradient_steps = (1, "episode"), -1
else:
train_freq = trial.suggest_categorical("train_freq", [1, 16, 128, 256, 1000, 2000])
gradient_steps = train_freq
n_episodes_rollout = -1

noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None])
noise_std = trial.suggest_uniform("noise_std", 0, 1)
@@ -302,7 +297,6 @@ def sample_ddpg_params(trial: optuna.Trial) -> Dict[str, Any]:
"buffer_size": buffer_size,
"train_freq": train_freq,
"gradient_steps": gradient_steps,
"n_episodes_rollout": n_episodes_rollout,
"policy_kwargs": dict(net_arch=net_arch),
}

@@ -337,7 +331,6 @@ def sample_dqn_params(trial: optuna.Trial) -> Dict[str, Any]:
train_freq = trial.suggest_categorical("train_freq", [1, 4, 8, 16, 128, 256, 1000])
subsample_steps = trial.suggest_categorical("subsample_steps", [1, 2, 4, 8])
gradient_steps = max(train_freq // subsample_steps, 1)
n_episodes_rollout = -1

net_arch = trial.suggest_categorical("net_arch", ["tiny", "small", "medium"])

@@ -350,7 +343,6 @@ def sample_dqn_params(trial: optuna.Trial) -> Dict[str, Any]:
"buffer_size": buffer_size,
"train_freq": train_freq,
"gradient_steps": gradient_steps,
"n_episodes_rollout": n_episodes_rollout,
"exploration_fraction": exploration_fraction,
"exploration_final_eps": exploration_final_eps,
"target_update_interval": target_update_interval,
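
To see how the sampled values are consumed end to end, here is a self-contained sketch (not taken from the repo) of an Optuna objective mirroring the episodic/step-based split above; the environment, training budget, and evaluation settings are placeholder choices:

    import optuna
    from stable_baselines3 import TD3
    from stable_baselines3.common.evaluation import evaluate_policy

    def objective(trial: optuna.Trial) -> float:
        # Same split as sample_td3_params/sample_ddpg_params above.
        if trial.suggest_categorical("episodic", [True, False]):
            train_freq, gradient_steps = (1, "episode"), -1
        else:
            train_freq = trial.suggest_categorical("train_freq", [1, 16, 128, 256])
            gradient_steps = train_freq

        model = TD3("MlpPolicy", "Pendulum-v0",
                    train_freq=train_freq, gradient_steps=gradient_steps, verbose=0)
        model.learn(total_timesteps=5_000)
        mean_reward, _ = evaluate_policy(model, model.get_env(), n_eval_episodes=5)
        return mean_reward

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=3)
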
2 changes: 1 addition & 1 deletion version.txt
@@ -1 +1 @@
0.11.0a7
0.11.1
