diff --git a/CHANGELOG.md b/CHANGELOG.md index 511e7cfad..846e3e1dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## Pre-Release 0.11.0a7 (WIP) +## Pre-Release 0.11.1 (2021-02-27) ### Breaking Changes - Removed `LinearNormalActionNoise` @@ -6,6 +6,7 @@ - `sb3_contrib` is now required - `TimeFeatureWrapper` was moved to the contrib repo - Replaced old `plot_train.py` script with updated `plot_training_success.py` +- Replaced `n_episodes_rollout` with a `train_freq` tuple (e.g. `[1, "episode"]`) to match the latest version of SB3 ### New Features - Added option to choose which `VecEnv` class to use for multiprocessing diff --git a/hyperparams/ddpg.yml b/hyperparams/ddpg.yml index 706804859..54ffec311 100644 --- a/hyperparams/ddpg.yml +++ b/hyperparams/ddpg.yml @@ -14,7 +14,7 @@ Pendulum-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -27,7 +27,7 @@ LunarLanderContinuous-v2: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -40,7 +40,7 @@ BipedalWalker-v3: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -54,7 +54,7 @@ BipedalWalkerHardcore-v3: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -69,7 +69,7 @@ HalfCheetahBulletEnv-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -84,7 +84,7 @@ AntBulletEnv-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] 
learning_rate: !!float 7e-4 policy_kwargs: "dict(net_arch=[400, 300])" @@ -100,7 +100,6 @@ HopperBulletEnv-v0: noise_std: 0.1 train_freq: 64 gradient_steps: 64 - n_episodes_rollout: -1 batch_size: 256 learning_rate: !!float 7e-4 policy_kwargs: "dict(net_arch=[400, 300])" @@ -116,7 +115,7 @@ Walker2DBulletEnv-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] batch_size: 256 learning_rate: !!float 7e-4 policy_kwargs: "dict(net_arch=[400, 300])" @@ -133,7 +132,7 @@ HumanoidBulletEnv-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -148,7 +147,7 @@ ReacherBulletEnv-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -163,7 +162,7 @@ InvertedDoublePendulumBulletEnv-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -178,6 +177,6 @@ InvertedPendulumSwingupBulletEnv-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" diff --git a/hyperparams/her.yml b/hyperparams/her.yml index dac73aacd..5d9e2afd5 100644 --- a/hyperparams/her.yml +++ b/hyperparams/her.yml @@ -13,9 +13,9 @@ NeckGoalEnvRelativeSparse-v2: ent_coef: 'auto' gamma: 0.99 tau: 0.02 - n_episodes_rollout: 1 + train_freq: [1, "episode"] gradient_steps: -1 - train_freq: -1 + # 10 episodes of warm-up learning_starts: 1500 use_sde_at_warmup: True @@ -40,9 +40,8 @@ NeckGoalEnvRelativeDense-v2: ent_coef: 'auto' gamma: 0.99 tau: 0.02 - n_episodes_rollout: 1 + train_freq: [1, "episode"] gradient_steps: -1 - train_freq: -1 # 10 episodes 
of warm-up learning_starts: 1500 use_sde_at_warmup: True diff --git a/hyperparams/sac.yml b/hyperparams/sac.yml index 15b86c803..dc2d6021c 100644 --- a/hyperparams/sac.yml +++ b/hyperparams/sac.yml @@ -16,9 +16,8 @@ NeckEnvRelative-v2: ent_coef: 'auto' gamma: 0.99 tau: 0.02 - n_episodes_rollout: 1 + train_freq: [1, "episode"] gradient_steps: -1 - train_freq: -1 # 10 episodes of warm-up learning_starts: 3000 use_sde_at_warmup: True @@ -69,9 +68,8 @@ Pendulum-v0: policy: 'MlpPolicy' learning_rate: !!float 1e-3 use_sde: True - n_episodes_rollout: 1 + train_freq: [1, "episode"] gradient_steps: -1 - train_freq: -1 policy_kwargs: "dict(log_std_init=-2, net_arch=[64, 64])" LunarLanderContinuous-v2: @@ -297,7 +295,7 @@ CarRacing-v0: gamma: 0.98 tau: 0.02 train_freq: 64 - # n_episodes_rollout: 1 + # train_freq: [1, "episode"] gradient_steps: 64 # sde_sample_freq: 64 learning_starts: 1000 @@ -323,8 +321,7 @@ donkey-generated-track-v0: gamma: 0.99 tau: 0.02 # train_freq: 64 - train_freq: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] # gradient_steps: -1 gradient_steps: 64 learning_starts: 500 diff --git a/hyperparams/td3.yml b/hyperparams/td3.yml index 9b5931668..7d88b9a54 100644 --- a/hyperparams/td3.yml +++ b/hyperparams/td3.yml @@ -14,7 +14,7 @@ Pendulum-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -27,7 +27,7 @@ LunarLanderContinuous-v2: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -40,7 +40,7 @@ BipedalWalker-v3: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -54,7 +54,7 @@ BipedalWalkerHardcore-v3: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - 
n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -69,7 +69,7 @@ HalfCheetahBulletEnv-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -83,7 +83,7 @@ AntBulletEnv-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -97,7 +97,7 @@ HopperBulletEnv-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -111,7 +111,7 @@ Walker2DBulletEnv-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -127,7 +127,7 @@ HumanoidBulletEnv-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -142,7 +142,7 @@ ReacherBulletEnv-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -157,7 +157,7 @@ InvertedDoublePendulumBulletEnv-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" @@ -172,7 +172,7 @@ InvertedPendulumSwingupBulletEnv-v0: noise_type: 'normal' noise_std: 0.1 gradient_steps: -1 - n_episodes_rollout: 1 + train_freq: [1, "episode"] learning_rate: !!float 1e-3 policy_kwargs: "dict(net_arch=[400, 300])" diff --git a/hyperparams/tqc.yml b/hyperparams/tqc.yml index fe64b2bc6..de7969067 100644 
--- a/hyperparams/tqc.yml +++ b/hyperparams/tqc.yml @@ -19,7 +19,6 @@ Pendulum-v0: policy: 'MlpPolicy' learning_rate: !!float 1e-3 use_sde: True - n_episodes_rollout: -1 gradient_steps: 64 train_freq: 64 policy_kwargs: "dict(log_std_init=-2, net_arch=[64, 64])" diff --git a/requirements.txt b/requirements.txt index 6607e74e8..92fde19ac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -stable-baselines3[extra,tests,docs]>=0.11.0a4 +stable-baselines3[extra,tests,docs]>=0.11.1 box2d-py==2.3.8 pybullet gym-minigrid @@ -7,4 +7,4 @@ optuna pytablewriter seaborn pyyaml>=5.1 -sb3-contrib>=0.11.0a4 +sb3-contrib>=0.11.1 diff --git a/scripts/build_docker.sh b/scripts/build_docker.sh index 908587907..48484f5f1 100755 --- a/scripts/build_docker.sh +++ b/scripts/build_docker.sh @@ -3,7 +3,7 @@ PARENT=stablebaselines/stable-baselines3 TAG=stablebaselines/rl-baselines3-zoo -VERSION=0.11.0a4 +VERSION=0.11.1 if [[ ${USE_GPU} == "True" ]]; then PARENT="${PARENT}:${VERSION}" diff --git a/utils/exp_manager.py b/utils/exp_manager.py index 09a2c8717..0304cf176 100644 --- a/utils/exp_manager.py +++ b/utils/exp_manager.py @@ -309,6 +309,10 @@ def _preprocess_hyperparams( hyperparams = self._preprocess_her_model_class(hyperparams) hyperparams = self._preprocess_schedules(hyperparams) + # Pre-process train_freq + if "train_freq" in hyperparams and isinstance(hyperparams["train_freq"], list): + hyperparams["train_freq"] = tuple(hyperparams["train_freq"]) + # Should we overwrite the number of timesteps? 
if self.n_timesteps > 0: if self.verbose: diff --git a/utils/hyperparams_opt.py b/utils/hyperparams_opt.py index 327196763..bfcec6b13 100644 --- a/utils/hyperparams_opt.py +++ b/utils/hyperparams_opt.py @@ -212,12 +212,10 @@ def sample_td3_params(trial: optuna.Trial) -> Dict[str, Any]: episodic = trial.suggest_categorical("episodic", [True, False]) if episodic: - n_episodes_rollout = 1 - train_freq, gradient_steps = -1, -1 + train_freq, gradient_steps = (1, "episode"), -1 else: train_freq = trial.suggest_categorical("train_freq", [1, 16, 128, 256, 1000, 2000]) gradient_steps = train_freq - n_episodes_rollout = -1 noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None]) noise_std = trial.suggest_uniform("noise_std", 0, 1) @@ -241,7 +239,6 @@ def sample_td3_params(trial: optuna.Trial) -> Dict[str, Any]: "buffer_size": buffer_size, "train_freq": train_freq, "gradient_steps": gradient_steps, - "n_episodes_rollout": n_episodes_rollout, "policy_kwargs": dict(net_arch=net_arch), } @@ -274,12 +271,10 @@ def sample_ddpg_params(trial: optuna.Trial) -> Dict[str, Any]: episodic = trial.suggest_categorical("episodic", [True, False]) if episodic: - n_episodes_rollout = 1 - train_freq, gradient_steps = -1, -1 + train_freq, gradient_steps = (1, "episode"), -1 else: train_freq = trial.suggest_categorical("train_freq", [1, 16, 128, 256, 1000, 2000]) gradient_steps = train_freq - n_episodes_rollout = -1 noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None]) noise_std = trial.suggest_uniform("noise_std", 0, 1) @@ -302,7 +297,6 @@ def sample_ddpg_params(trial: optuna.Trial) -> Dict[str, Any]: "buffer_size": buffer_size, "train_freq": train_freq, "gradient_steps": gradient_steps, - "n_episodes_rollout": n_episodes_rollout, "policy_kwargs": dict(net_arch=net_arch), } @@ -337,7 +331,6 @@ def sample_dqn_params(trial: optuna.Trial) -> Dict[str, Any]: train_freq = trial.suggest_categorical("train_freq", [1, 4, 8, 16, 
128, 256, 1000]) subsample_steps = trial.suggest_categorical("subsample_steps", [1, 2, 4, 8]) gradient_steps = max(train_freq // subsample_steps, 1) - n_episodes_rollout = -1 net_arch = trial.suggest_categorical("net_arch", ["tiny", "small", "medium"]) @@ -350,7 +343,6 @@ def sample_dqn_params(trial: optuna.Trial) -> Dict[str, Any]: "buffer_size": buffer_size, "train_freq": train_freq, "gradient_steps": gradient_steps, - "n_episodes_rollout": n_episodes_rollout, "exploration_fraction": exploration_fraction, "exploration_final_eps": exploration_final_eps, "target_update_interval": target_update_interval, diff --git a/version.txt b/version.txt index 9522e5b3c..af88ba824 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.11.0a7 +0.11.1