diff --git a/README.md b/README.md
index 743a54a2e6..da6fe42a68 100644
--- a/README.md
+++ b/README.md
@@ -213,6 +213,7 @@ Download the Habitat related Gibson dataset following the instructions [here](ht
 | [Point goal navigation](https://arxiv.org/abs/1807.06757) | MatterPort3D | [pointnav_mp3d_v1.zip](https://dl.fbaipublicfiles.com/habitat/data/datasets/pointnav/mp3d/v1/pointnav_mp3d_v1.zip) | `data/datasets/pointnav/mp3d/v1/` | [`datasets/pointnav/mp3d.yaml`](configs/datasets/pointnav/mp3d.yaml) | 400 MB |
 | 🆕[Point goal navigation](https://arxiv.org/abs/1807.06757) | HM3D | [pointnav_hm3d_v1.zip](https://dl.fbaipublicfiles.com/habitat/data/datasets/pointnav/hm3d/v1/pointnav_hm3d_v1.zip) | `data/datasets/pointnav/hm3d/v1/` | [`datasets/pointnav/hm3d.yaml`](configs/datasets/pointnav/hm3d.yaml) | 992 MB |
 | [Object goal navigation](https://arxiv.org/abs/2006.13171) | MatterPort3D | [objectnav_mp3d_v1.zip](https://dl.fbaipublicfiles.com/habitat/data/datasets/objectnav/m3d/v1/objectnav_mp3d_v1.zip) | `data/datasets/objectnav/mp3d/v1/` | [`datasets/objectnav/mp3d.yaml`](configs/datasets/objectnav/mp3d.yaml) | 170 MB |
+| 🆕[Object goal navigation](https://arxiv.org/abs/2006.13171) | HM3D | [objectnav_hm3d_v1.zip](https://dl.fbaipublicfiles.com/habitat/data/datasets/objectnav/hm3d/v1/objectnav_hm3d_v1.zip) | `data/datasets/objectnav/hm3d/v1/` | [`datasets/objectnav/hm3d.yaml`](configs/datasets/objectnav/hm3d.yaml) | 154 MB |
 | [Embodied Question Answering](https://embodiedqa.org/) | MatterPort3D | [eqa_mp3d_v1.zip](https://dl.fbaipublicfiles.com/habitat/data/datasets/eqa/mp3d/v1/eqa_mp3d_v1.zip) | `data/datasets/eqa/mp3d/v1/` | [`datasets/eqa/mp3d.yaml`](configs/datasets/eqa/mp3d.yaml) | 44 MB |
 | [Visual Language Navigation](https://bringmeaspoon.org/) | MatterPort3D | [vln_r2r_mp3d_v1.zip](https://dl.fbaipublicfiles.com/habitat/data/datasets/vln/mp3d/r2r/v1/vln_r2r_mp3d_v1.zip) | `data/datasets/vln/mp3d/r2r/v1` | [`datasets/vln/mp3d_r2r.yaml`](configs/datasets/vln/mp3d_r2r.yaml) | 2.7 MB |
 | [Image goal navigation](https://github.com/facebookresearch/habitat-lab/pull/333) | Gibson | [pointnav_gibson_v1.zip](https://dl.fbaipublicfiles.com/habitat/data/datasets/pointnav/gibson/v1/pointnav_gibson_v1.zip) | `data/datasets/pointnav/gibson/v1/` | [`datasets/imagenav/gibson.yaml`](configs/datasets/imagenav/gibson.yaml) | 385 MB |
diff --git a/configs/datasets/objectnav/hm3d.yaml b/configs/datasets/objectnav/hm3d.yaml
new file mode 100644
index 0000000000..ed8caa5d01
--- /dev/null
+++ b/configs/datasets/objectnav/hm3d.yaml
@@ -0,0 +1,4 @@
+DATASET:
+  TYPE: ObjectNav-v1
+  SPLIT: train
+  DATA_PATH: data/datasets/objectnav/hm3d/v1/{split}/{split}.json.gz
diff --git a/configs/tasks/objectnav_hm3d.yaml b/configs/tasks/objectnav_hm3d.yaml
new file mode 100644
index 0000000000..840a385a6d
--- /dev/null
+++ b/configs/tasks/objectnav_hm3d.yaml
@@ -0,0 +1,56 @@
+ENVIRONMENT:
+  MAX_EPISODE_STEPS: 500
+
+SIMULATOR:
+  TURN_ANGLE: 30
+  TILT_ANGLE: 30
+  ACTION_SPACE_CONFIG: "v1"
+  AGENT_0:
+    SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR']
+    HEIGHT: 0.88
+    RADIUS: 0.18
+  HABITAT_SIM_V0:
+    GPU_DEVICE_ID: 0
+    ALLOW_SLIDING: False
+  SEMANTIC_SENSOR:
+    WIDTH: 640
+    HEIGHT: 480
+    HFOV: 79
+    POSITION: [0, 0.88, 0]
+  RGB_SENSOR:
+    WIDTH: 640
+    HEIGHT: 480
+    HFOV: 79
+    POSITION: [0, 0.88, 0]
+  DEPTH_SENSOR:
+    WIDTH: 640
+    HEIGHT: 480
+    HFOV: 79
+    MIN_DEPTH: 0.5
+    MAX_DEPTH: 5.0
+    POSITION: [0, 0.88, 0]
+TASK:
+  TYPE: ObjectNav-v1
+  POSSIBLE_ACTIONS: ["STOP", "MOVE_FORWARD", "TURN_LEFT", "TURN_RIGHT", "LOOK_UP", "LOOK_DOWN"]
+
+  SENSORS: ['OBJECTGOAL_SENSOR', 'COMPASS_SENSOR', 'GPS_SENSOR']
+  GOAL_SENSOR_UUID: objectgoal
+  SEMANTIC_CATEGORY_SENSOR:
+    WIDTH: 640
+    HEIGHT: 480
+    DATASET: "hm3d"
+    CONVERT_TO_RGB: True
+    RAW_NAME_TO_CATEGORY_MAPPING: "data/matterport_semantics/matterport_category_mappings.tsv"
+
+  MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL']
+
+  DISTANCE_TO_GOAL:
+    DISTANCE_TO: VIEW_POINTS
+  SUCCESS:
+    SUCCESS_DISTANCE: 0.1
+
+DATASET:
+  TYPE: ObjectNav-v1
+  SPLIT: train
+  DATA_PATH: "data/datasets/objectnav/hm3d/v1/{split}/{split}.json.gz"
+  SCENES_DIR: "data/scene_datasets/"
diff --git a/configs/tasks/pointnav_hm3d.yaml b/configs/tasks/pointnav_hm3d.yaml
new file mode 100644
index 0000000000..4ebe7a8f8e
--- /dev/null
+++ b/configs/tasks/pointnav_hm3d.yaml
@@ -0,0 +1,30 @@
+ENVIRONMENT:
+  MAX_EPISODE_STEPS: 500
+SIMULATOR:
+  AGENT_0:
+    SENSORS: ['RGB_SENSOR']
+  HABITAT_SIM_V0:
+    GPU_DEVICE_ID: 0
+  RGB_SENSOR:
+    WIDTH: 256
+    HEIGHT: 256
+  DEPTH_SENSOR:
+    WIDTH: 256
+    HEIGHT: 256
+TASK:
+  TYPE: Nav-v0
+  SUCCESS_DISTANCE: 0.2
+
+  SENSORS: ['POINTGOAL_WITH_GPS_COMPASS_SENSOR']
+  POINTGOAL_WITH_GPS_COMPASS_SENSOR:
+    GOAL_FORMAT: "POLAR"
+    DIMENSIONALITY: 2
+  GOAL_SENSOR_UUID: pointgoal_with_gps_compass
+
+  MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL']
+  SUCCESS:
+    SUCCESS_DISTANCE: 0.2
+DATASET:
+  TYPE: PointNav-v1
+  SPLIT: train
+  DATA_PATH: data/datasets/pointnav/hm3d/v1/{split}/{split}.json.gz
diff --git a/habitat/config/default.py b/habitat/config/default.py
index 72f40cd4a2..582f29dbb4 100644
--- a/habitat/config/default.py
+++ b/habitat/config/default.py
@@ -133,6 +133,16 @@ def __init__(self, *args, **kwargs):
 _C.TASK.PROXIMITY_SENSOR.TYPE = "ProximitySensor"
 _C.TASK.PROXIMITY_SENSOR.MAX_DETECTION_RADIUS = 2.0
 # -----------------------------------------------------------------------------
+# SEMANTIC CATEGORY SENSOR
+# -----------------------------------------------------------------------------
+_C.TASK.SEMANTIC_CATEGORY_SENSOR = CN()
+_C.TASK.SEMANTIC_CATEGORY_SENSOR.HEIGHT = 480
+_C.TASK.SEMANTIC_CATEGORY_SENSOR.WIDTH = 640
+_C.TASK.SEMANTIC_CATEGORY_SENSOR.TYPE = "SemanticCategorySensor"
+_C.TASK.SEMANTIC_CATEGORY_SENSOR.CONVERT_TO_RGB = True
+_C.TASK.SEMANTIC_CATEGORY_SENSOR.DATASET = "mp3d"
+_C.TASK.SEMANTIC_CATEGORY_SENSOR.RAW_NAME_TO_CATEGORY_MAPPING = ""
+# -----------------------------------------------------------------------------
 # SUCCESS MEASUREMENT
 # -----------------------------------------------------------------------------
 _C.TASK.SUCCESS = CN()
diff --git a/habitat/tasks/nav/nav.py b/habitat/tasks/nav/nav.py
index 06cadfce89..f42f6e526d 100644
--- a/habitat/tasks/nav/nav.py
+++ b/habitat/tasks/nav/nav.py
@@ -34,11 +34,18 @@
 from habitat.core.utils import not_none_validator, try_cv2_import
 from habitat.sims.habitat_simulator.actions import HabitatSimActions
 from habitat.tasks.utils import cartesian_to_polar
+from habitat.tasks.nav.semantic_constants import (
+    GIBSON_CATEGORY_TO_TASK_CATEGORY_ID,
+    MP3D_CATEGORY_TO_TASK_CATEGORY_ID,
+    HM3D_CATEGORY_TO_TASK_CATEGORY_ID,
+)
 from habitat.utils.geometry_utils import (
     quaternion_from_coeff,
     quaternion_rotate_vector,
 )
 from habitat.utils.visualizations import fog_of_war, maps
+from habitat_sim.utils.common import d3_40_colors_rgb
+from PIL import Image
 
 try:
     from habitat.sims.habitat_simulator.habitat_simulator import HabitatSim
@@ -514,6 +521,118 @@ def get_observation(
         )
 
 
+@registry.register_sensor(name="SemanticCategorySensor")
+class SemanticCategorySensor(Sensor):
+    r"""Lists the object categories for each pixel location.
+    Args:
+        sim: reference to the simulator for calculating task observations.
+    """
+    cls_uuid: str = "semantic_category"
+
+    def __init__(
+        self, sim: Simulator, config: Config, *args: Any, **kwargs: Any
+    ):
+        self._sim = sim
+        self._current_episode_id = None
+        self.mapping = None
+        self.category_to_task_category_id = None
+        self.instance_id_to_task_id = None
+        self._initialize_category_mappings(config)
+
+        super().__init__(config=config)
+
+    def _get_uuid(self, *args: Any, **kwargs: Any):
+        return self.cls_uuid
+
+    def _initialize_category_mappings(self, config):
+        assert config.DATASET in ["gibson", "mp3d", "hm3d"]
+        if config.DATASET == "gibson":
+            cat_mapping = GIBSON_CATEGORY_TO_TASK_CATEGORY_ID
+        elif config.DATASET == "mp3d":
+            cat_mapping = MP3D_CATEGORY_TO_TASK_CATEGORY_ID
+        else:
+            cat_mapping = HM3D_CATEGORY_TO_TASK_CATEGORY_ID
+        self.category_to_task_category_id = cat_mapping
+        if config.RAW_NAME_TO_CATEGORY_MAPPING != "":
+            with open(config.RAW_NAME_TO_CATEGORY_MAPPING, "r") as fp:
+                lines = fp.readlines()
+            # Skip the header row, then map raw scene names to task categories
+            lines = lines[1:]
+            lines = [l.strip().split(" ") for l in lines]
+            self.raw_to_cat_mapping = {}
+            for l in lines:
+                raw_name = l[1]
+                cat_name = l[-1]
+                if cat_name in cat_mapping:
+                    self.raw_to_cat_mapping[raw_name] = cat_name
+        else:
+            self.raw_to_cat_mapping = {k: k for k in cat_mapping.keys()}
+
+    def _get_sensor_type(self, *args: Any, **kwargs: Any):
+        return SensorTypes.COLOR
+
+    def _get_observation_space(self, *args: Any, **kwargs: Any):
+        if self.config.CONVERT_TO_RGB:
+            observation_space = spaces.Box(
+                low=0,
+                high=255,
+                shape=(self.config.HEIGHT, self.config.WIDTH, 3),
+                dtype=np.uint8,
+            )
+        else:
+            observation_space = spaces.Box(
+                low=np.iinfo(np.int32).min,
+                high=np.iinfo(np.int32).max,
+                shape=(self.config.HEIGHT, self.config.WIDTH),
+                dtype=np.int32,
+            )
+        return observation_space
+
+    def get_observation(
+        self, *args: Any, observations, episode, **kwargs: Any
+    ):
+        episode_uniq_id = f"{episode.scene_id} {episode.episode_id}"
+        if self._current_episode_id != episode_uniq_id:
+            self._current_episode_id = episode_uniq_id
+            # Get mapping from instance id to task id
+            scene = self._sim.semantic_annotations()
+            self.instance_id_to_task_id = np.ones(
+                (len(scene.objects), ), dtype=np.int64
+            ) * -1  # Non-task objects are set to -1
+            for obj in scene.objects:
+                if obj is None:
+                    continue
+                obj_inst_id = int(obj.id.split("_")[-1])
+                obj_name = obj.category.name()
+                if obj_name in self.raw_to_cat_mapping:
+                    obj_name = self.raw_to_cat_mapping[obj_name]
+                    obj_task_id = self.category_to_task_category_id[obj_name]
+                    self.instance_id_to_task_id[obj_inst_id] = obj_task_id
+        # Set invalid instance IDs to unknown object 0
+        semantic = np.copy(observations["semantic"])
+        semantic[semantic >= self.instance_id_to_task_id.shape[0]] = 0
+        # Map from instance id to task id
+        semantic_category = np.take(self.instance_id_to_task_id, semantic)
+        if self.config.CONVERT_TO_RGB:
+            semantic_category = self.convert_semantic_to_rgb(semantic_category)
+
+        return semantic_category
+
+    def convert_semantic_to_rgb(self, x):
+        max_valid_id = max(self.category_to_task_category_id.values())
+        assert max_valid_id < 39
+        # Map invalid values (-1) to max_valid_id + 1
+        invalid_locs = x == -1
+        x[x == -1] = max_valid_id + 1
+        # Get RGB image
+        semantic_img = Image.new("P", (x.shape[1], x.shape[0]))
+        semantic_img.putpalette(d3_40_colors_rgb.flatten())
+        semantic_img.putdata((x.flatten() % 40).astype(np.uint8))
+        semantic_img = np.array(semantic_img.convert("RGB"))
+        # Set pixels for invalid objects to (0, 0, 0)
+        semantic_img[invalid_locs, :] = np.array([0, 0, 0])
+        return semantic_img
+
+
 @registry.register_measure
 class Success(Measure):
     r"""Whether or not the agent succeeded at its task
diff --git a/habitat/tasks/nav/semantic_constants.py b/habitat/tasks/nav/semantic_constants.py
new file mode 100644
index 0000000000..593482f48e
--- /dev/null
+++ b/habitat/tasks/nav/semantic_constants.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+GIBSON_CATEGORY_TO_TASK_CATEGORY_ID = {
+    'chair': 0,
+    'dining table': 1,
+    'book': 2,
+    'vase': 3,
+    'bottle': 4,
+    'couch': 5,
+    'bed': 6,
+    'refrigerator': 7,
+    'potted plant': 8,
+    'sink': 9,
+    'toilet': 10,
+    'clock': 11,
+    'towel': 12,
+    'tv': 13,
+    'oven': 14,
+    'cup': 15,
+    'umbrella': 16,
+    'bowl': 17,
+    'gym_equipment': 18,
+    'bench': 19,
+    'clothes': 20
+}
+
+
+MP3D_CATEGORY_TO_TASK_CATEGORY_ID = {
+    'chair': 0,
+    'table': 1,
+    'picture': 2,
+    'cabinet': 3,
+    'cushion': 4,
+    'sofa': 5,
+    'bed': 6,
+    'chest_of_drawers': 7,
+    'plant': 8,
+    'sink': 9,
+    'toilet': 10,
+    'stool': 11,
+    'towel': 12,
+    'tv_monitor': 13,
+    'shower': 14,
+    'bathtub': 15,
+    'counter': 16,
+    'fireplace': 17,
+    'gym_equipment': 18,
+    'seating': 19,
+    'clothes': 20
+}
+
+
+HM3D_CATEGORY_TO_TASK_CATEGORY_ID = {
+    'chair': 0,
+    'bed': 1,
+    'plant': 2,
+    'toilet': 3,
+    'tv_monitor': 4,
+    'sofa': 5,
+}
diff --git a/habitat/utils/visualizations/utils.py b/habitat/utils/visualizations/utils.py
index 413cebd743..b793773342 100644
--- a/habitat/utils/visualizations/utils.py
+++ b/habitat/utils/visualizations/utils.py
@@ -230,6 +230,11 @@ def observations_to_image(observation: Dict, info: Dict) -> np.ndarray:
             depth_map = depth_map.astype(np.uint8)
             depth_map = np.stack([depth_map for _ in range(3)], axis=2)
             render_obs_images.append(depth_map)
+        elif "semantic_category" in sensor_name:
+            semcat = observation[sensor_name]
+            if not isinstance(semcat, np.ndarray):
+                semcat = semcat.cpu().numpy()
+            render_obs_images.append(semcat)
 
     # add image goal if observation has image_goal info
     if "imagegoal" in observation:
diff --git a/habitat_baselines/common/obs_transformers.py b/habitat_baselines/common/obs_transformers.py
index e8390039ed..3e98c06f29 100644
--- a/habitat_baselines/common/obs_transformers.py
+++ b/habitat_baselines/common/obs_transformers.py
@@ -74,7 +74,7 @@ def __init__(
         self,
         size: int,
         channels_last: bool = True,
-        trans_keys: Tuple[str, ...] = ("rgb", "depth", "semantic"),
+        trans_keys: Tuple[str, ...] = ("rgb", "depth", "semantic_category"),
     ):
         """Args:
         size: The size you want to resize the shortest edge to
@@ -145,7 +145,7 @@ def __init__(
         self,
         size: Union[numbers.Integral, Tuple[int, int]],
         channels_last: bool = True,
-        trans_keys: Tuple[str, ...] = ("rgb", "depth", "semantic"),
+        trans_keys: Tuple[str, ...] = ("rgb", "depth", "semantic_category"),
     ):
         """Args:
         size: A sequence (h, w) or int of the size you wish to resize/center_crop.
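A quick standalone sketch of the per-pixel lookup that `SemanticCategorySensor.get_observation` performs above: instance ids from the simulator's semantic sensor are mapped to task category ids with a single `np.take`. The toy `instance_id_to_task_id` table and the 2x3 `semantic` frame below are made up for illustration; in the sensor they come from `semantic_annotations()` and the `SEMANTIC_SENSOR` observation.

import numpy as np

# Hypothetical per-instance task ids built once per episode by the sensor:
# instance 0 -> chair (0), instance 1 -> unmapped (-1), instance 2 -> bed (1).
instance_id_to_task_id = np.array([0, -1, 1], dtype=np.int64)

# A toy 2x3 frame of instance ids, including one out-of-range id (7).
semantic = np.array([[0, 1, 2],
                     [2, 7, 0]], dtype=np.int64)

# Out-of-range instance ids are clamped to instance 0, mirroring the sensor.
semantic[semantic >= instance_id_to_task_id.shape[0]] = 0

# np.take maps every pixel's instance id to its task category id.
semantic_category = np.take(instance_id_to_task_id, semantic)
print(semantic_category)
# [[ 0 -1  1]
#  [ 1  0  0]]

With CONVERT_TO_RGB enabled, the sensor then palettizes these ids with `d3_40_colors_rgb` and blacks out the -1 pixels, which is what `observations_to_image` renders.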
diff --git a/habitat_baselines/config/objectnav/ddppo_objectnav_hm3d.yaml b/habitat_baselines/config/objectnav/ddppo_objectnav_hm3d.yaml
new file mode 100644
index 0000000000..3d62ae1811
--- /dev/null
+++ b/habitat_baselines/config/objectnav/ddppo_objectnav_hm3d.yaml
@@ -0,0 +1,74 @@
+BASE_TASK_CONFIG_PATH: "configs/tasks/objectnav_hm3d.yaml"
+CMD_TRAILING_OPTS: ["TASK_CONFIG.ENVIRONMENT.ITERATOR_OPTIONS.MAX_SCENE_REPEAT_STEPS", "50000"]
+ENV_NAME: "NavRLEnv"
+SIMULATOR_GPU_ID: 0
+TORCH_GPU_ID: 0
+VIDEO_OPTION: []
+TENSORBOARD_DIR: "tb"
+VIDEO_DIR: "video_dir"
+TEST_EPISODE_COUNT: -1
+EVAL_CKPT_PATH_DIR: "data/new_checkpoints"
+NUM_ENVIRONMENTS: 4
+CHECKPOINT_FOLDER: "data/new_checkpoints"
+TRAINER_NAME: "ddppo"
+SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"]
+NUM_UPDATES: 270000
+LOG_INTERVAL: 10
+NUM_CHECKPOINTS: 100
+# Force PyTorch to be single threaded as
+# this improves performance considerably
+FORCE_TORCH_SINGLE_THREADED: True
+
+EVAL:
+  SPLIT: "val"
+
+RL:
+  SUCCESS_REWARD: 2.5
+  SLACK_REWARD: -1e-3
+
+  POLICY:
+    name: "PointNavResNetPolicy"
+    OBS_TRANSFORMS:
+      ENABLED_TRANSFORMS: ("ResizeShortestEdge", "CenterCropper")
+
+  PPO:
+    # ppo params
+    clip_param: 0.2
+    ppo_epoch: 4
+    num_mini_batch: 2
+    value_loss_coef: 0.5
+    entropy_coef: 0.01
+    lr: 2.5e-4
+    eps: 1e-5
+    max_grad_norm: 0.2
+    num_steps: 64
+    use_gae: True
+    gamma: 0.99
+    tau: 0.95
+    use_linear_clip_decay: False
+    use_linear_lr_decay: False
+    reward_window_size: 50
+
+    use_normalized_advantage: False
+
+    hidden_size: 512
+
+  DDPPO:
+    sync_frac: 0.6
+    # The PyTorch distributed backend to use
+    distrib_backend: NCCL
+    # Visual encoder backbone
+    pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth
+    # Initialize with pretrained weights
+    pretrained: False
+    # Initialize just the visual encoder backbone with pretrained weights
+    pretrained_encoder: False
+    # Whether or not the visual encoder backbone will be trained.
+    train_encoder: True
+    # Whether or not to reset the critic linear layer
+    reset_critic: True
+
+    # Model parameters
+    backbone: resnet50
+    rnn_type: LSTM
+    num_recurrent_layers: 2
diff --git a/habitat_baselines/rl/ddppo/policy/resnet_policy.py b/habitat_baselines/rl/ddppo/policy/resnet_policy.py
index 1354161a1e..a9a5d11cf2 100644
--- a/habitat_baselines/rl/ddppo/policy/resnet_policy.py
+++ b/habitat_baselines/rl/ddppo/policy/resnet_policy.py
@@ -119,15 +119,22 @@ def __init__(
         else:
             self._n_input_depth = 0
 
+        if "semantic_category" in observation_space.spaces:
+            self._n_input_semantic_category = observation_space.spaces["semantic_category"].shape[2]
+            spatial_size = observation_space.spaces["semantic_category"].shape[0] // 2
+            assert self._n_input_semantic_category == 3, "ResNetEncoder only supports RGB values from SemanticCategory sensor!"
+        else:
+            self._n_input_semantic_category = 0
+
         if normalize_visual_inputs:
             self.running_mean_and_var: nn.Module = RunningMeanAndVar(
-                self._n_input_depth + self._n_input_rgb
+                self._n_input_depth + self._n_input_rgb + self._n_input_semantic_category
             )
         else:
             self.running_mean_and_var = nn.Sequential()
 
         if not self.is_blind:
-            input_channels = self._n_input_depth + self._n_input_rgb
+            input_channels = self._n_input_depth + self._n_input_rgb + self._n_input_semantic_category
             self.backbone = make_backbone(input_channels, baseplanes, ngroups)
 
             final_spatial = int(
@@ -190,6 +197,17 @@ def forward(self, observations: Dict[str, torch.Tensor]) -> torch.Tensor: # typ
 
             cnn_input.append(depth_observations)
 
+        if self._n_input_semantic_category > 0:
+            semcat_observations = observations["semantic_category"]
+
+            # permute tensor to dimension [BATCH x CHANNEL x HEIGHT X WIDTH]
+            semcat_observations = semcat_observations.permute(0, 3, 1, 2)
+            semcat_observations = (
+                semcat_observations.float() / 255.0
+            )  # normalize RGB values
+
+            cnn_input.append(semcat_observations)
+
         x = torch.cat(cnn_input, dim=1)
 
         x = F.avg_pool2d(x, 2)
diff --git a/habitat_baselines/rl/ppo/ppo_trainer.py b/habitat_baselines/rl/ppo/ppo_trainer.py
index b5b8a0f61e..2a4f122018 100644
--- a/habitat_baselines/rl/ppo/ppo_trainer.py
+++ b/habitat_baselines/rl/ppo/ppo_trainer.py
@@ -66,6 +66,7 @@ class PPOTrainer(BaseRLTrainer):
     supported_tasks = ["Nav-v0"]
 
     SHORT_ROLLOUT_THRESHOLD: float = 0.25
+    SENSORS_BLACKLIST = ["semantic"]
     _is_distributed: bool
     _obs_batching_cache: ObservationBatchingCache
     envs: VectorEnv
@@ -95,7 +96,10 @@ def __init__(self, config=None):
     @property
    def obs_space(self):
         if self._obs_space is None and self.envs is not None:
-            self._obs_space = self.envs.observation_spaces[0]
+            self._obs_space = spaces.Dict({
+                k: v for k, v in self.envs.observation_spaces[0].spaces.items()
+                if k not in self.SENSORS_BLACKLIST
+            })
 
         return self._obs_space
 
@@ -311,6 +315,8 @@ def _init_train(self):
         self.rollouts.to(self.device)
 
         observations = self.envs.reset()
+        observations = self._clean_observations(observations)
+
         batch = batch_obs(
             observations, device=self.device, cache=self._obs_batching_cache
         )
@@ -400,6 +406,15 @@ def _extract_scalars_from_info(
 
         return result
 
+    def _clean_observations(self, observations):
+        clean_observations = []
+        for obs in observations:
+            obs = {
+                k: v for k, v in obs.items() if k not in self.SENSORS_BLACKLIST
+            }
+            clean_observations.append(obs)
+        return clean_observations
+
     @classmethod
     def _extract_scalars_from_infos(
         cls, infos: List[Dict[str, Any]]
@@ -488,6 +503,7 @@ def _collect_environment_result(self, buffer_index: int = 0):
         observations, rewards_l, dones, infos = [
             list(x) for x in zip(*outputs)
         ]
+        observations = self._clean_observations(observations)
 
         self.env_time += time.time() - t_step_env
 
@@ -924,6 +940,7 @@ def _eval_checkpoint(
             self.actor_critic = self.agent.actor_critic
 
         observations = self.envs.reset()
+        observations = self._clean_observations(observations)
         batch = batch_obs(
             observations, device=self.device, cache=self._obs_batching_cache
         )
@@ -1015,6 +1032,7 @@ def _eval_checkpoint(
             observations, rewards_l, dones, infos = [
                 list(x) for x in zip(*outputs)
             ]
+            observations = self._clean_observations(observations)
             batch = batch_obs(  # type: ignore
                 observations,
                 device=self.device,
@@ -1058,16 +1076,21 @@ def _eval_checkpoint(
                         current_episodes[i].episode_id,
                     )
                 ] = episode_stats
 
+                goal_name = None
+                if hasattr(current_episodes[i], "object_category"):
+                    goal_name = current_episodes[i].object_category
                 if len(self.config.VIDEO_OPTION) > 0:
                     generate_video(
                         video_option=self.config.VIDEO_OPTION,
                         video_dir=self.config.VIDEO_DIR,
                         images=rgb_frames[i],
+                        scene_id=current_episodes[i].scene_id,
                         episode_id=current_episodes[i].episode_id,
                         checkpoint_idx=checkpoint_index,
                         metrics=self._extract_scalars_from_info(infos[i]),
                         tb_writer=writer,
+                        goal_name=goal_name,
                     )
 
                     rgb_frames[i] = []
diff --git a/habitat_baselines/utils/common.py b/habitat_baselines/utils/common.py
index 8325a3a798..0c2388dc4f 100644
--- a/habitat_baselines/utils/common.py
+++ b/habitat_baselines/utils/common.py
@@ -326,10 +326,12 @@ def generate_video(
     video_option: List[str],
     video_dir: Optional[str],
     images: List[np.ndarray],
+    scene_id: str,
     episode_id: Union[int, str],
     checkpoint_idx: int,
     metrics: Dict[str, float],
     tb_writer: TensorboardWriter,
+    goal_name: Optional[str] = None,
     fps: int = 10,
     verbose: bool = True,
 ) -> None:
@@ -339,11 +341,13 @@ def generate_video(
         video_option: string list of "tensorboard" or "disk" or both.
         video_dir: path to target video directory.
         images: list of images to be converted to video.
+        scene_id: scene id for video naming.
         episode_id: episode id for video naming.
         checkpoint_idx: checkpoint index for video naming.
         metric_name: name of the performance metric, e.g. "spl".
         metric_value: value of metric.
         tb_writer: tensorboard writer object for uploading video.
+        goal_name: name of the goal for the current episode
         fps: fps for generated video.
 
     Returns:
         None
@@ -355,9 +359,12 @@ def generate_video(
     for k, v in metrics.items():
         metric_strs.append(f"{k}={v:.2f}")
 
-    video_name = f"episode={episode_id}-ckpt={checkpoint_idx}-" + "-".join(
+    scene_name = scene_id.split("/")[-1].split(".")[0]
+    video_name = f"scene={scene_name}-episode={episode_id}-ckpt={checkpoint_idx}-" + "-".join(
         metric_strs
     )
+    if goal_name is not None:
+        video_name += "_" + goal_name
     if "disk" in video_option:
         assert video_dir is not None
         images_to_video(images, video_dir, video_name, verbose=verbose)
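For reference, a small sketch of how the new `scene_id`/`goal_name` arguments shape the file name produced by `generate_video`; the scene path, ids, and metric values below are made up, and the metrics dict stands in for what `_extract_scalars_from_info` returns.

scene_id = "data/scene_datasets/hm3d/val/00800-TEEsavR23oF/TEEsavR23oF.basis.glb"
episode_id, checkpoint_idx, goal_name = 12, 7, "chair"
metrics = {"spl": 0.42, "success": 1.00}

# Reproduce the naming logic from generate_video
metric_strs = [f"{k}={v:.2f}" for k, v in metrics.items()]
scene_name = scene_id.split("/")[-1].split(".")[0]
video_name = f"scene={scene_name}-episode={episode_id}-ckpt={checkpoint_idx}-" + "-".join(
    metric_strs
)
if goal_name is not None:
    video_name += "_" + goal_name
print(video_name)
# scene=TEEsavR23oF-episode=12-ckpt=7-spl=0.42-success=1.00_chair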