diff --git a/pyproject.toml b/pyproject.toml
index 512f59a58c..8a16fe4638 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ extra_standard_library = [
     "bpy",
     "matplotlib",
     "gymnasium",
+    "gym",
     "scipy",
     "hid",
     "yaml",
diff --git a/source/extensions/omni.isaac.orbit_tasks/config/extension.toml b/source/extensions/omni.isaac.orbit_tasks/config/extension.toml
index 8e7469b613..f6f8ec7734 100644
--- a/source/extensions/omni.isaac.orbit_tasks/config/extension.toml
+++ b/source/extensions/omni.isaac.orbit_tasks/config/extension.toml
@@ -1,7 +1,7 @@
 [package]
 
 # Note: Semantic Versioning is used: https://semver.org/
-version = "0.5.1"
+version = "0.5.2"
 
 # Description
 title = "ORBIT Environments"
diff --git a/source/extensions/omni.isaac.orbit_tasks/docs/CHANGELOG.rst b/source/extensions/omni.isaac.orbit_tasks/docs/CHANGELOG.rst
index f95fbdf676..4b6b98e798 100644
--- a/source/extensions/omni.isaac.orbit_tasks/docs/CHANGELOG.rst
+++ b/source/extensions/omni.isaac.orbit_tasks/docs/CHANGELOG.rst
@@ -1,6 +1,17 @@
 Changelog
 ---------
 
+0.5.2 (2023-11-08)
+~~~~~~~~~~~~~~~~~~
+
+Fixed
+^^^^^
+
+* Fixed the RL wrappers for Stable-Baselines3 and RL-Games. They now work with the most recent versions of these libraries.
+* Fixed the :meth:`get_checkpoint_path` function to allow in-between sub-folders between the run directory and the
+  checkpoint file.
+
 0.5.1 (2023-11-04)
 ~~~~~~~~~~~~~~~~~~
 
diff --git a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/utils/parse_cfg.py b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/utils/parse_cfg.py
index e3e6322fd6..80641ee13a 100644
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/utils/parse_cfg.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/utils/parse_cfg.py
@@ -140,21 +140,26 @@ def parse_env_cfg(task_name: str, use_gpu: bool | None = None, num_envs: int | N
 
 
 def get_checkpoint_path(
-    log_path: str, run_dir: str = "*", checkpoint: str = "*", sort_alphabetical: bool = True
+    log_path: str, run_dir: str = ".*", checkpoint: str = ".*", other_dirs: list[str] | None = None, sort_alpha: bool = True
 ) -> str:
     """Get path to the model checkpoint in input directory.
 
-    The checkpoint file is resolved as: <log_path>/<run_dir>/<checkpoint>.
-    If run_dir and checkpoint are regex expressions then the most recent (highest alphabetical order) run and checkpoint are selected.
+    The checkpoint file is resolved as: <log_path>/<run_dir>/<*other_dirs>/<checkpoint>, where the
+    :attr:`other_dirs` are intermediate folder names to concatenate. These cannot be regex expressions.
+
+    If :attr:`run_dir` and :attr:`checkpoint` are regex expressions, then the most recent (highest alphabetical order)
+    run and checkpoint are selected. To disable this behavior, set the flag :attr:`sort_alpha` to False.
 
     Args:
         log_path: The log directory path to find models in.
-        run_dir: Regex expression for the name of the directory containing the run. Defaults to the most
+        run_dir: The regex expression for the name of the directory containing the run. Defaults to the most
             recent directory created inside :obj:`log_dir`.
-        checkpoint: The model checkpoint file or directory name. Defaults to the most recent
+        other_dirs: The intermediate directories between the run directory and the checkpoint file. Defaults to
+            None, which implies that the checkpoint file is directly under the run directory.
+        checkpoint: The regex expression for the model checkpoint file. Defaults to the most recent
             torch-model saved in the :obj:`run_dir` directory.
-        sort_alphabetical: Whether to sort the runs and checkpoints by alphabetical order. Defaults to True.
-            If False, the checkpoints are sorted by the last modified time.
+        sort_alpha: Whether to sort the runs by alphabetical order. Defaults to True.
+            If False, the matched runs are sorted by their last modified time.
 
     Raises:
         ValueError: When no runs are found in the input directory.
@@ -173,12 +178,15 @@ def get_checkpoint_path(
             os.path.join(log_path, run) for run in os.scandir(log_path) if run.is_dir() and re.match(run_dir, run.name)
         ]
         # sort matched runs by alphabetical order (latest run should be last)
-        if sort_alphabetical:
+        if sort_alpha:
            runs.sort()
         else:
            runs = sorted(runs, key=os.path.getmtime)
         # create last run file path
-        run_path = runs[-1]
+        if other_dirs is not None:
+            run_path = os.path.join(runs[-1], *other_dirs)
+        else:
+            run_path = runs[-1]
     except IndexError:
         raise ValueError(f"No runs present in the directory: '{log_path}' match: '{run_dir}'.")
 
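For reference, a minimal usage sketch of the extended function; the "logs/rl_games/cartpole" layout and the file names below are hypothetical and not part of this change:

from omni.isaac.orbit_tasks.utils.parse_cfg import get_checkpoint_path

# latest run (highest alphabetical order) and latest *.pth file inside its "nn" sub-folder
resume_path = get_checkpoint_path("logs/rl_games/cartpole", run_dir=".*", checkpoint=r".*\.pth", other_dirs=["nn"])
# same lookup, but pick the run folder by last modified time instead of by name
resume_path = get_checkpoint_path("logs/rl_games/cartpole", checkpoint=r".*\.pth", other_dirs=["nn"], sort_alpha=False)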
diff --git a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/utils/wrappers/rl_games.py b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/utils/wrappers/rl_games.py
index 9e0f7560e0..a0f25b6015 100644
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/utils/wrappers/rl_games.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/utils/wrappers/rl_games.py
@@ -33,7 +33,8 @@
 
 from __future__ import annotations
 
-import gymnasium as gym
+import gym.spaces  # needed due to rl-games' incompatibility with gymnasium: https://github.com/Denys88/rl_games/issues/261
+import gymnasium
 import torch
 
 from rl_games.common import env_configurations
@@ -61,13 +62,12 @@ class RlGamesVecEnvWrapper(IVecEnv):
     observations. This dictionary contains "obs" and "states" which typically correspond
     to the actor and critic observations respectively.
 
-    To use asymmetric actor-critic, the environment instance must have the attributes
+    To use asymmetric actor-critic, the environment observations from :class:`RLTaskEnv`
+    must have the key or group name "critic". The observation group is used to set the
     :attr:`num_states` (int) and :attr:`state_space` (:obj:`gym.spaces.Box`). These are
-    used by the learning agent to allocate buffers in the trajectory memory. Additionally,
-    the method :meth:`_get_observations()` should have the key "critic" which corresponds
-    to the privileged observations. Since this is optional for some environments, the wrapper
-    checks if these attributes exist. If they don't then the wrapper defaults to zero as number
-    of privileged observations.
+    used by the learning agent in RL-Games to allocate buffers in the trajectory memory.
+    Since this is optional for some environments, the wrapper checks if these attributes exist.
+    If they don't, then the wrapper defaults to zero as the number of privileged observations.
 
     .. caution::
 
@@ -104,19 +104,11 @@ def __init__(self, env: RLTaskEnv, rl_device: str, clip_obs: float, clip_actions
         self._clip_obs = clip_obs
         self._clip_actions = clip_actions
         self._sim_device = env.unwrapped.device
-
-        # information about spaces for the wrapper
-        # note: rl-games only wants single observation and action spaces
-        self.rlg_observation_space = self.unwrapped.single_observation_space["policy"]
-        self.rlg_action_space = self.unwrapped.single_action_space
         # information for privileged observations
-        self.rlg_state_space = self.unwrapped.single_observation_space.get("critic")
-        if self.rlg_state_space is not None:
-            if not isinstance(self.rlg_state_space, gym.spaces.Box):
-                raise ValueError(f"Privileged observations must be of type Box. Type: {type(self.rlg_state_space)}")
-            self.rlg_num_states = self.rlg_state_space.shape[0]
-        else:
+        if self.state_space is None:
             self.rlg_num_states = 0
+        else:
+            self.rlg_num_states = self.state_space.shape[0]
 
     def __str__(self):
         """Returns the wrapper name and the :attr:`env` representation string."""
@@ -142,14 +134,35 @@ def render_mode(self) -> str | None:
         return self.env.render_mode
 
     @property
-    def observation_space(self) -> gym.Space:
+    def observation_space(self) -> gym.spaces.Box:
         """Returns the :attr:`Env` :attr:`observation_space`."""
-        return self.env.observation_space
+        # note: rl-games only wants single observation space
+        policy_obs_space = self.unwrapped.single_observation_space["policy"]
+        if not isinstance(policy_obs_space, gymnasium.spaces.Box):
+            raise NotImplementedError(
+                f"The RL-Games wrapper does not currently support observation space: '{type(policy_obs_space)}'."
+                f" If you need to support this, please modify the wrapper: {self.__class__.__name__},"
+                " and if you are nice, please send a merge-request."
+            )
+        # note: maybe should check if we are a sub-set of the actual space. don't do it right now since
+        # in RLTaskEnv we are setting the observation space as (-inf, inf).
+        return gym.spaces.Box(-self._clip_obs, self._clip_obs, policy_obs_space.shape)
 
     @property
     def action_space(self) -> gym.Space:
         """Returns the :attr:`Env` :attr:`action_space`."""
-        return self.env.action_space
+        # note: rl-games only wants single action space
+        action_space = self.unwrapped.single_action_space
+        if not isinstance(action_space, gymnasium.spaces.Box):
+            raise NotImplementedError(
+                f"The RL-Games wrapper does not currently support action space: '{type(action_space)}'."
+                f" If you need to support this, please modify the wrapper: {self.__class__.__name__},"
+                " and if you are nice, please send a merge-request."
+            )
+        # return casted space in gym.spaces.Box (OpenAI Gym)
+        # note: maybe should check if we are a sub-set of the actual space. don't do it right now since
+        # in RLTaskEnv we are setting action space as (-inf, inf).
+        return gym.spaces.Box(-self._clip_actions, self._clip_actions, action_space.shape)
 
     @classmethod
     def class_name(cls) -> str:
@@ -168,6 +181,35 @@ def unwrapped(self) -> RLTaskEnv:
     Properties
     """
 
+    @property
+    def num_envs(self) -> int:
+        """Returns the number of sub-environment instances."""
+        return self.unwrapped.num_envs
+
+    @property
+    def device(self) -> str:
+        """Returns the base environment simulation device."""
+        return self.unwrapped.device
+
+    @property
+    def state_space(self) -> gym.spaces.Box | None:
+        """Returns the privileged (critic) observation space, or None if it does not exist."""
+        # note: rl-games only wants single observation space
+        critic_obs_space = self.unwrapped.single_observation_space.get("critic")
+        # check if we even have a critic obs
+        if critic_obs_space is None:
+            return None
+        elif not isinstance(critic_obs_space, gymnasium.spaces.Box):
+            raise NotImplementedError(
+                f"The RL-Games wrapper does not currently support state space: '{type(critic_obs_space)}'."
+                f" If you need to support this, please modify the wrapper: {self.__class__.__name__},"
+                " and if you are nice, please send a merge-request."
+            )
+        # return casted space in gym.spaces.Box (OpenAI Gym)
+        # note: maybe should check if we are a sub-set of the actual space. don't do it right now since
+        # in RLTaskEnv we are setting the observation space as (-inf, inf).
+        return gym.spaces.Box(-self._clip_obs, self._clip_obs, critic_obs_space.shape)
+
     def get_number_of_agents(self) -> int:
         """Returns number of actors in the environment."""
         return getattr(self, "num_agents", 1)
@@ -175,9 +217,9 @@ def get_number_of_agents(self) -> int:
     def get_env_info(self) -> dict:
         """Returns the Gym spaces for the environment."""
         return {
-            "observation_space": self.rlg_observation_space,
-            "action_space": self.rlg_action_space,
-            "state_space": self.rlg_state_space,
+            "observation_space": self.observation_space,
+            "action_space": self.action_space,
+            "state_space": self.state_space,
        }
 
    """
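As a rough sketch of how the reworked RL-Games wrapper is meant to behave; the task name, clip values, and number of environments below are placeholders, and this assumes a running Orbit/Isaac Sim application:

import gymnasium
from omni.isaac.orbit_tasks.utils.parse_cfg import parse_env_cfg
from omni.isaac.orbit_tasks.utils.wrappers.rl_games import RlGamesVecEnvWrapper

# create a registered RLTaskEnv and wrap it for rl-games
env_cfg = parse_env_cfg("Isaac-Cartpole-v0", use_gpu=True, num_envs=16)
env = gymnasium.make("Isaac-Cartpole-v0", cfg=env_cfg)
env = RlGamesVecEnvWrapper(env, rl_device="cuda:0", clip_obs=10.0, clip_actions=1.0)

# the spaces are now derived on the fly from the underlying single-env spaces:
# observation_space/action_space are gym.spaces.Box objects clipped to the given limits,
# and state_space is the "critic" observation group (None if the task defines no privileged observations)
print(env.num_envs, env.observation_space, env.action_space, env.state_space)
print(env.get_env_info())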
diff --git a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/utils/wrappers/sb3.py b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/utils/wrappers/sb3.py
index 96ddf79bec..634d44c670 100644
--- a/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/utils/wrappers/sb3.py
+++ b/source/extensions/omni.isaac.orbit_tasks/omni/isaac/orbit_tasks/utils/wrappers/sb3.py
@@ -17,10 +17,13 @@
 
 from __future__ import annotations
 
+import gymnasium as gym
 import numpy as np
 import torch
+import torch.nn as nn  # noqa: F401
 from typing import Any
 
+from stable_baselines3.common.utils import constant_fn
 from stable_baselines3.common.vec_env.base_vec_env import VecEnv, VecEnvObs, VecEnvStepReturn
 
 from omni.isaac.orbit.envs import RLTaskEnv
@@ -44,16 +47,28 @@ def process_sb3_cfg(cfg: dict) -> dict:
     Reference:
         https://github.com/DLR-RM/rl-baselines3-zoo/blob/0e5eb145faefa33e7d79c7f8c179788574b20da5/utils/exp_manager.py#L358
     """
-    _direct_eval = ["policy_kwargs", "replay_buffer_class", "replay_buffer_kwargs"]
 
-    def update_dict(d):
-        for key, value in d.items():
+    def update_dict(hyperparams: dict[str, Any]) -> dict[str, Any]:
+        for key, value in hyperparams.items():
             if isinstance(value, dict):
                 update_dict(value)
             else:
-                if key in _direct_eval:
-                    d[key] = eval(value)
-        return d
+                if key in ["policy_kwargs", "replay_buffer_class", "replay_buffer_kwargs"]:
+                    hyperparams[key] = eval(value)
+                elif key in ["learning_rate", "clip_range", "clip_range_vf", "delta_std"]:
+                    if isinstance(value, str):
+                        _, initial_value = value.split("_")
+                        initial_value = float(initial_value)
+                        hyperparams[key] = lambda progress_remaining, initial_value=initial_value: progress_remaining * initial_value
+                    elif isinstance(value, (float, int)):
+                        # Negative value: ignore (ex: for clipping)
+                        if value < 0:
+                            continue
+                        hyperparams[key] = constant_fn(float(value))
+                    else:
+                        raise ValueError(f"Invalid value for {key}: {hyperparams[key]}")
+
+        return hyperparams
 
     # parse agent configuration and convert to classes
     return update_dict(cfg)
@@ -127,9 +142,14 @@ def __init__(self, env: RLTaskEnv):
         self.num_envs = self.unwrapped.num_envs
         self.sim_device = self.unwrapped.device
         self.render_mode = self.unwrapped.render_mode
-        # initialize vec-env
+        # obtain gym spaces
+        # note: stable-baselines3 does not like it when we have an unbounded action space, so
+        # we set it to some high value here. Maybe this is not general but something to think about.
         observation_space = self.unwrapped.single_observation_space["policy"]
         action_space = self.unwrapped.single_action_space
+        if isinstance(action_space, gym.spaces.Box) and not action_space.is_bounded("both"):
+            action_space = gym.spaces.Box(low=-100, high=100, shape=action_space.shape)
+        # initialize vec-env
         VecEnv.__init__(self, self.num_envs, observation_space, action_space)
         # add buffer for logging episodic information
         self._ep_rew_buf = torch.zeros(self.num_envs, device=self.sim_device)
diff --git a/source/extensions/omni.isaac.orbit_tasks/setup.py b/source/extensions/omni.isaac.orbit_tasks/setup.py
index eb083b06d8..db05a5262e 100644
--- a/source/extensions/omni.isaac.orbit_tasks/setup.py
+++ b/source/extensions/omni.isaac.orbit_tasks/setup.py
@@ -34,8 +34,8 @@
 # Extra dependencies for RL agents
 EXTRAS_REQUIRE = {
     "sb3": ["stable-baselines3>=2.0"],
-    "skrl": ["skrl>=0.10.0"],
-    "rl_games": ["rl-games==1.6.1"],
+    "skrl": ["skrl==0.10.0"],
+    "rl_games": ["rl-games==1.6.1", "gym"],  # rl-games still needs gym :(
     "rsl_rl": ["rsl_rl@git+https://github.com/leggedrobotics/rsl_rl.git"],
     "robomimic": ["robomimic@git+https://github.com/ARISE-Initiative/robomimic.git"],
 }
diff --git a/source/extensions/omni.isaac.orbit_tasks/test/wrappers/test_rl_games_wrapper.py b/source/extensions/omni.isaac.orbit_tasks/test/wrappers/test_rl_games_wrapper.py
index 163a4700cc..69bbd6cdf6 100644
--- a/source/extensions/omni.isaac.orbit_tasks/test/wrappers/test_rl_games_wrapper.py
+++ b/source/extensions/omni.isaac.orbit_tasks/test/wrappers/test_rl_games_wrapper.py
@@ -82,7 +82,7 @@ def test_random_actions(self):
         with torch.inference_mode():
             for _ in range(100):
                 # sample actions from -1 to 1
-                actions = 2 * torch.rand(env.action_space.shape, device=env.device) - 1
+                actions = 2 * torch.rand(env.num_envs, *env.action_space.shape, device=env.device) - 1
                 # apply actions
                 transition = env.step(actions)
                 # check signals
diff --git a/source/extensions/omni.isaac.orbit_tasks/test/wrappers/test_sb3_wrapper.py b/source/extensions/omni.isaac.orbit_tasks/test/wrappers/test_sb3_wrapper.py
index 17d506423e..693b2e4998 100644
--- a/source/extensions/omni.isaac.orbit_tasks/test/wrappers/test_sb3_wrapper.py
+++ b/source/extensions/omni.isaac.orbit_tasks/test/wrappers/test_sb3_wrapper.py
@@ -83,7 +83,7 @@ def test_random_actions(self):
         with torch.inference_mode():
             for _ in range(1000):
                 # sample actions from -1 to 1
-                actions = 2 * np.random.rand(env.num_envs, env.action_space.shape) - 1
+                actions = 2 * np.random.rand(env.num_envs, *env.action_space.shape) - 1
                 # apply actions
                 transition = env.step(actions)
                 # check signals
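To illustrate what the reworked process_sb3_cfg above does to a zoo-style configuration, here is a small sketch; the configuration values are made up, and the "lin_<value>" syntax follows the rl-baselines3-zoo convention referenced in the code (the eval of policy_kwargs relies on the torch.nn import added above):

from omni.isaac.orbit_tasks.utils.wrappers.sb3 import process_sb3_cfg

agent_cfg = {
    "policy": "MlpPolicy",
    "n_timesteps": 1000000,
    "learning_rate": "lin_3e-4",  # string with a prefix -> linear schedule from 3e-4 down to 0
    "clip_range": 0.2,            # plain float -> wrapped with constant_fn
    "policy_kwargs": "dict(activation_fn=nn.ELU, net_arch=[64, 64])",  # string -> eval'd into python objects
}
agent_cfg = process_sb3_cfg(agent_cfg)

lr_schedule = agent_cfg["learning_rate"]
print(lr_schedule(1.0), lr_schedule(0.5))  # 0.0003 at the start of training, 0.00015 halfway through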
diff --git a/source/standalone/workflows/rl_games/play.py b/source/standalone/workflows/rl_games/play.py
index ef178d35af..8e986770d1 100644
--- a/source/standalone/workflows/rl_games/play.py
+++ b/source/standalone/workflows/rl_games/play.py
@@ -84,17 +84,15 @@ def main():
     # find checkpoint
     if args_cli.checkpoint is None:
         # specify directory for logging runs
-        if "full_experiment_name" not in agent_cfg["params"]["config"]:
-            run_dir = os.path.join("*", "nn")
-        else:
-            run_dir = os.path.join(agent_cfg["params"]["config"]["full_experiment_name"], "nn")
+        run_dir = agent_cfg["params"]["config"].get("full_experiment_name", ".*")
         # specify name of checkpoint
         if args_cli.use_last_checkpoint:
-            checkpoint_file = None
+            checkpoint_file = ".*"
         else:
+            # this loads the best checkpoint
             checkpoint_file = f"{agent_cfg['params']['config']['name']}.pth"
         # get path to previous checkpoint
-        resume_path = get_checkpoint_path(log_root_path, run_dir, checkpoint_file)
+        resume_path = get_checkpoint_path(log_root_path, run_dir, checkpoint_file, other_dirs=["nn"])
     else:
         resume_path = os.path.abspath(args_cli.checkpoint)
     # load previously trained model
diff --git a/source/standalone/workflows/rsl_rl/play.py b/source/standalone/workflows/rsl_rl/play.py
index 29ac2e83f7..e6f69226e2 100644
--- a/source/standalone/workflows/rsl_rl/play.py
+++ b/source/standalone/workflows/rsl_rl/play.py
@@ -15,7 +15,7 @@
 from omni.isaac.orbit.app import AppLauncher
 
 # local imports
-import source.standalone.workflows.rsl_rl.cli_args as cli_args  # isort: skip
+import cli_args  # isort: skip
 
 # add argparse arguments
 parser = argparse.ArgumentParser(description="Train an RL agent with RSL-RL.")
diff --git a/source/standalone/workflows/rsl_rl/train.py b/source/standalone/workflows/rsl_rl/train.py
index c069e03b3a..0db7a65630 100644
--- a/source/standalone/workflows/rsl_rl/train.py
+++ b/source/standalone/workflows/rsl_rl/train.py
@@ -16,7 +16,7 @@
 from omni.isaac.orbit.app import AppLauncher
 
 # local imports
-import source.standalone.workflows.rsl_rl.cli_args as cli_args  # isort: skip
+import cli_args  # isort: skip
 
 
 # add argparse arguments
diff --git a/source/standalone/workflows/sb3/play.py b/source/standalone/workflows/sb3/play.py
index 05c3cabfe9..e5af98a10d 100644
--- a/source/standalone/workflows/sb3/play.py
+++ b/source/standalone/workflows/sb3/play.py
@@ -21,6 +21,11 @@
 parser.add_argument("--num_envs", type=int, default=None, help="Number of environments to simulate.")
 parser.add_argument("--task", type=str, default=None, help="Name of the task.")
 parser.add_argument("--checkpoint", type=str, default=None, help="Path to model checkpoint.")
+parser.add_argument(
+    "--use_last_checkpoint",
+    action="store_true",
+    help="When no checkpoint is provided, use the last saved model. Otherwise use the best saved model.",
+)
 # append AppLauncher cli args
 AppLauncher.add_app_launcher_args(parser)
 # parse the arguments
@@ -34,6 +39,7 @@
 
 import gymnasium as gym
+import os
 import torch
 import traceback
@@ -43,7 +49,7 @@
 
 import omni.isaac.contrib_tasks  # noqa: F401
 import omni.isaac.orbit_tasks  # noqa: F401
-from omni.isaac.orbit_tasks.utils.parse_cfg import load_cfg_from_registry, parse_env_cfg
+from omni.isaac.orbit_tasks.utils.parse_cfg import get_checkpoint_path, load_cfg_from_registry, parse_env_cfg
 from omni.isaac.orbit_tasks.utils.wrappers.sb3 import Sb3VecEnvWrapper, process_sb3_cfg
 
@@ -72,12 +78,21 @@ def main():
         clip_reward=np.inf,
     )
 
+    # directory for logging into
+    log_root_path = os.path.join("logs", "sb3", args_cli.task)
+    log_root_path = os.path.abspath(log_root_path)
     # check checkpoint is valid
     if args_cli.checkpoint is None:
-        raise ValueError("Checkpoint path is not valid.")
+        if args_cli.use_last_checkpoint:
+            checkpoint = "model_.*.zip"
+        else:
+            checkpoint = "model.zip"
+        checkpoint_path = get_checkpoint_path(log_root_path, ".*", checkpoint)
+    else:
+        checkpoint_path = args_cli.checkpoint
     # create agent from stable baselines
-    print(f"Loading checkpoint from: {args_cli.checkpoint}")
-    agent = PPO.load(args_cli.checkpoint, env, print_system_info=True)
+    print(f"Loading checkpoint from: {checkpoint_path}")
+    agent = PPO.load(checkpoint_path, env, print_system_info=True)
 
     # reset environment
     obs = env.reset()
diff --git a/source/standalone/workflows/sb3/train.py b/source/standalone/workflows/sb3/train.py
index afc5f87fea..3b6b4a4f51 100644
--- a/source/standalone/workflows/sb3/train.py
+++ b/source/standalone/workflows/sb3/train.py
@@ -3,7 +3,12 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
-"""Script to train RL agent with Stable Baselines3."""
+"""Script to train RL agent with Stable Baselines3.
+
+Since Stable-Baselines3 does not support buffers living on GPU directly,
+we recommend using a smaller number of environments. Otherwise,
+there will be significant overhead in GPU->CPU transfer.
+"""
 
 from __future__ import annotations
@@ -68,8 +73,6 @@ def main():
 
     # parse configuration
     env_cfg = parse_env_cfg(args_cli.task, use_gpu=not args_cli.cpu, num_envs=args_cli.num_envs)
     agent_cfg = load_cfg_from_registry(args_cli.task, "sb3_cfg_entry_point")
-    # post-process agent configuration
-    agent_cfg = process_sb3_cfg(agent_cfg)
     # override configuration with command line arguments
     if args_cli.seed is not None:
@@ -83,6 +86,8 @@
     dump_pickle(os.path.join(log_dir, "params", "env.pkl"), env_cfg)
     dump_pickle(os.path.join(log_dir, "params", "agent.pkl"), agent_cfg)
 
+    # post-process agent configuration
+    agent_cfg = process_sb3_cfg(agent_cfg)
     # read configurations about the agent-training
     policy_arch = agent_cfg.pop("policy")
     n_timesteps = agent_cfg.pop("n_timesteps")
diff --git a/source/standalone/workflows/skrl/play.py b/source/standalone/workflows/skrl/play.py
index 98bab3164e..364f51544d 100644
--- a/source/standalone/workflows/skrl/play.py
+++ b/source/standalone/workflows/skrl/play.py
@@ -123,7 +123,7 @@ def main():
     if args_cli.checkpoint:
         resume_path = os.path.abspath(args_cli.checkpoint)
     else:
-        resume_path = get_checkpoint_path(log_root_path, os.path.join("*", "checkpoints"), None)
+        resume_path = get_checkpoint_path(log_root_path, other_dirs=["checkpoints"])
     print(f"[INFO] Loading model checkpoint from: {resume_path}")
 
     # initialize agent
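Taken together, the play scripts now resolve checkpoints purely through get_checkpoint_path. A small illustrative sketch for the SB3 case; the run-folder and file names below are hypothetical and only meant to show how the alphabetical sort interacts with timestamped runs:

import os
from omni.isaac.orbit_tasks.utils.parse_cfg import get_checkpoint_path

# hypothetical contents of logs/sb3/Isaac-Cartpole-v0/:
#   2023-11-07_18-00-05/  model.zip  model_1000_steps.zip ...
#   2023-11-08_09-12-31/  model.zip  model_1000_steps.zip ...
log_root_path = os.path.abspath(os.path.join("logs", "sb3", "Isaac-Cartpole-v0"))
# default: the best model from the newest run (alphabetically last, i.e. the latest timestamp)
checkpoint_path = get_checkpoint_path(log_root_path, ".*", "model.zip")
# with --use_last_checkpoint: the last intermediate save from that run instead
checkpoint_path = get_checkpoint_path(log_root_path, ".*", "model_.*.zip")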