I guess the reason is that runtime memory has been exhausted. You could consider reducing the number of best checkpoints kept from 20 to 4, or even lower.
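If that is the cause, the knob involved appears to be `checkpoint.topk.k` in the config dumped below; a minimal sketch of the change, with the key path copied from that dump and the value 4 taken from the suggestion above:

```python
# From the dumped training config: the run retains the k best checkpoints
# ranked by train_loss. Lowering k is the change suggested above.
checkpoint = {'topk': {'monitor_key': 'train_loss', 'mode': 'min',
                       'k': 4,  # was 20
                       'format_str': 'epoch={epoch:04d}-train_loss={train_loss:.3f}.ckpt'},
              'save_last_ckpt': True, 'save_last_snapshot': False}
```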
Hi, thanks for your repo.
I've followed your tutorial to create my dataset and converted it into zarr.
The training stage went well, but when I load the data for prediction, an OOM error appears:
```
(umi2) fusion@fusion-1013:~/PycharmProjects/universal_manipulation_interface$ ~/miniforge3/envs/umi2/bin/python load_checkpoint_eval.py -i data/outputs/2024.07.15/10.33.31_train_diffusion_unet_timm_umi/checkpoints/latest.ckpt -o wipe/
model_name: vit_base_patch16_clip_224.openai
dataset_path: /home/fusion/PycharmProjects/universal_manipulation_interface/wipe/dataset.zarr.zip
steps_per_inference: 6
Waiting for camera
match dataset :wipe/videos/20240708140122
match_video_path:wipe/videos/20240708140122/raw_videos.mp4
Loaded initial frame for 1 episodes
{'name': 'train_diffusion_unet_timm',
 '_target_': 'diffusion_policy.workspace.train_diffusion_unet_image_workspace.TrainDiffusionUnetImageWorkspace',
 'task_name': 'umi',
 'shape_meta': {'obs': {'camera0_rgb': {'shape': [3, 224, 224], 'horizon': 2, 'latency_steps': 0, 'down_sample_steps': 3, 'type': 'rgb', 'ignore_by_policy': False}, 'robot0_eef_pos': {'shape': [3], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'rotation_rep': 'rotation_6d', 'ignore_by_policy': False}, 'robot0_gripper_width': {'shape': [1], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle_wrt_start': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}}, 'action': {'shape': [10], 'horizon': 16, 'latency_steps': 0, 'down_sample_steps': 3, 'rotation_rep': 'rotation_6d'}},
 'exp_name': 'default',
 'n_action_steps': 8,
 'policy': {'_target_': 'diffusion_policy.policy.diffusion_unet_timm_policy.DiffusionUnetTimmPolicy', 'shape_meta': {'obs': {'camera0_rgb': {'shape': [3, 224, 224], 'horizon': 2, 'latency_steps': 0, 'down_sample_steps': 3, 'type': 'rgb', 'ignore_by_policy': False}, 'robot0_eef_pos': {'shape': [3], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'rotation_rep': 'rotation_6d', 'ignore_by_policy': False}, 'robot0_gripper_width': {'shape': [1], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle_wrt_start': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}}, 'action': {'shape': [10], 'horizon': 16, 'latency_steps': 0, 'down_sample_steps': 3, 'rotation_rep': 'rotation_6d'}}, 'noise_scheduler': {'_target_': 'diffusers.DDIMScheduler', 'num_train_timesteps': 50, 'beta_start': 0.0001, 'beta_end': 0.02, 'beta_schedule': 'squaredcos_cap_v2', 'clip_sample': True, 'set_alpha_to_one': True, 'steps_offset': 0, 'prediction_type': 'epsilon'}, 'obs_encoder': {'_target_': 'diffusion_policy.model.vision.timm_obs_encoder.TimmObsEncoder', 'shape_meta': {'obs': {'camera0_rgb': {'shape': [3, 224, 224], 'horizon': 2, 'latency_steps': 0, 'down_sample_steps': 3, 'type': 'rgb', 'ignore_by_policy': False}, 'robot0_eef_pos': {'shape': [3], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'rotation_rep': 'rotation_6d', 'ignore_by_policy': False}, 'robot0_gripper_width': {'shape': [1], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle_wrt_start': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}}, 'action': {'shape': [10], 'horizon': 16, 'latency_steps': 0, 'down_sample_steps': 3, 'rotation_rep': 'rotation_6d'}}, 'model_name': 'vit_base_patch16_clip_224.openai', 'pretrained': True, 'frozen': False, 'global_pool': '', 'feature_aggregation': 'attention_pool_2d', 'position_encording': 'sinusoidal', 'downsample_ratio': 32, 'transforms': [{'type': 'RandomCrop', 'ratio': 0.95}, {'_target_': 'torchvision.transforms.ColorJitter', 'brightness': 0.3, 'contrast': 0.4, 'saturation': 0.5, 'hue': 0.08}], 'use_group_norm': True, 'share_rgb_model': False, 'imagenet_norm': True}, 'num_inference_steps': 16, 'obs_as_global_cond': True, 'diffusion_step_embed_dim': 128, 'down_dims': [256, 512, 1024], 'kernel_size': 5, 'n_groups': 8, 'cond_predict_scale': True, 'input_pertub': 0.1, 'train_diffusion_n_samples': 1},
 'ema': {'_target_': 'diffusion_policy.model.diffusion.ema_model.EMAModel', 'update_after_step': 0, 'inv_gamma': 1.0, 'power': 0.75, 'min_value': 0.0, 'max_value': 0.9999},
 'dataloader': {'batch_size': 8, 'num_workers': 8, 'shuffle': True, 'pin_memory': True, 'persistent_workers': True},
 'val_dataloader': {'batch_size': 8, 'num_workers': 8, 'shuffle': False, 'pin_memory': True, 'persistent_workers': True},
 'optimizer': {'_target_': 'torch.optim.AdamW', 'lr': 0.0003, 'betas': [0.95, 0.999], 'eps': 1e-08, 'weight_decay': 1e-06},
 'training': {'device': 'cuda:0', 'seed': 42, 'debug': False, 'resume': False, 'lr_scheduler': 'cosine', 'lr_warmup_steps': 2000, 'num_epochs': 120, 'gradient_accumulate_every': 1, 'use_ema': True, 'freeze_encoder': False, 'rollout_every': 10, 'checkpoint_every': 10, 'val_every': 1, 'sample_every': 5, 'max_train_steps': None, 'max_val_steps': None, 'tqdm_interval_sec': 1.0},
 'logging': {'project': 'umi', 'resume': False, 'mode': 'online', 'name': '2024.07.10-14.08.26_train_diffusion_unet_timm_umi', 'tags': ['train_diffusion_unet_timm', 'umi', 'default'], 'id': None, 'group': None},
 'checkpoint': {'topk': {'monitor_key': 'train_loss', 'mode': 'min', 'k': 20, 'format_str': 'epoch={epoch:04d}-train_loss={train_loss:.3f}.ckpt'}, 'save_last_ckpt': True, 'save_last_snapshot': False},
 'multi_run': {'run_dir': 'data/outputs/2024.07.10/14.08.26_train_diffusion_unet_timm_umi', 'wandb_name_base': '2024.07.10-14.08.26_train_diffusion_unet_timm_umi'},
 'task': {'name': 'umi', 'camera_obs_latency': 0.125, 'robot_obs_latency': 0.0001, 'gripper_obs_latency': 0.02, 'dataset_frequeny': 0, 'obs_down_sample_steps': 3, 'low_dim_obs_horizon': 2, 'img_obs_horizon': 2, 'action_horizon': 16, 'ignore_proprioception': False, 'shape_meta': {'obs': {'camera0_rgb': {'shape': [3, 224, 224], 'horizon': 2, 'latency_steps': 0, 'down_sample_steps': 3, 'type': 'rgb', 'ignore_by_policy': False}, 'robot0_eef_pos': {'shape': [3], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'rotation_rep': 'rotation_6d', 'ignore_by_policy': False}, 'robot0_gripper_width': {'shape': [1], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle_wrt_start': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}}, 'action': {'shape': [10], 'horizon': 16, 'latency_steps': 0, 'down_sample_steps': 3, 'rotation_rep': 'rotation_6d'}}, 'task_name': 'umi', 'dataset_path': '/home/fusion/PycharmProjects/universal_manipulation_interface/wipe/dataset.zarr.zip', 'pose_repr': {'obs_pose_repr': 'relative', 'action_pose_repr': 'relative'}, 'env_runner': {'_target_': 'diffusion_policy.env_runner.real_pusht_image_runner.RealPushTImageRunner'}, 'dataset': {'_target_': 'diffusion_policy.dataset.umi_dataset.UmiDataset', 'shape_meta': {'obs': {'camera0_rgb': {'shape': [3, 224, 224], 'horizon': 2, 'latency_steps': 0, 'down_sample_steps': 3, 'type': 'rgb', 'ignore_by_policy': False}, 'robot0_eef_pos': {'shape': [3], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'rotation_rep': 'rotation_6d', 'ignore_by_policy': False}, 'robot0_gripper_width': {'shape': [1], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle_wrt_start': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}}, 'action': {'shape': [10], 'horizon': 16, 'latency_steps': 0, 'down_sample_steps': 3, 'rotation_rep': 'rotation_6d'}}, 'dataset_path': '/home/fusion/PycharmProjects/universal_manipulation_interface/wipe/dataset.zarr.zip', 'cache_dir': None, 'pose_repr': {'obs_pose_repr': 'relative', 'action_pose_repr': 'relative'}, 'action_padding': False, 'temporally_independent_normalization': False, 'repeat_frame_prob': 0.0, 'max_duration': None, 'seed': 42, 'val_ratio': 0.05}}}
rgb keys: ['camera0_rgb']
low_dim_keys keys: ['robot0_eef_pos', 'robot0_eef_rot_axis_angle', 'robot0_eef_rot_axis_angle_wrt_start', 'robot0_gripper_width']
vit will use the CLS token. feature_aggregation (attention_pool_2d) is ignored!
==> reduce pretrained obs_encorder's lr
obs_encorder params: 151
Warming up policy inference
Traceback (most recent call last):
  File "/home/fusion/PycharmProjects/universal_manipulation_interface/load_checkpoint_eval.py", line 322, in <module>
    main()
  File "/home/fusion/miniforge3/envs/umi2/lib/python3.9/site-packages/click/core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
  File "/home/fusion/miniforge3/envs/umi2/lib/python3.9/site-packages/click/core.py", line 1078, in main
    rv = self.invoke(ctx)
  File "/home/fusion/miniforge3/envs/umi2/lib/python3.9/site-packages/click/core.py", line 1434, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/home/fusion/miniforge3/envs/umi2/lib/python3.9/site-packages/click/core.py", line 783, in invoke
    return __callback(*args, **kwargs)
  File "/home/fusion/PycharmProjects/universal_manipulation_interface/load_checkpoint_eval.py", line 266, in main
    result = policy.predict_action(obs_dict)
  File "/home/fusion/PycharmProjects/universal_manipulation_interface/diffusion_policy/policy/diffusion_unet_timm_policy.py", line 129, in predict_action
    nobs = self.normalizer.normalize(obs_dict)
  File "/home/fusion/PycharmProjects/universal_manipulation_interface/diffusion_policy/model/common/normalizer.py", line 68, in normalize
    return self._normalize_impl(x, forward=True)
  File "/home/fusion/PycharmProjects/universal_manipulation_interface/diffusion_policy/model/common/normalizer.py", line 59, in _normalize_impl
    result[key] = _normalize(value, params, forward=forward)
  File "/home/fusion/PycharmProjects/universal_manipulation_interface/diffusion_policy/model/common/normalizer.py", line 272, in _normalize
    x = x * scale + offset
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.95 GiB (GPU 0; 7.78 GiB total capacity; 6.50 GiB already allocated; 917.81 MiB free; 6.53 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
```
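For what it's worth, a single 1.95 GiB allocation inside `normalizer.normalize` hints that the observation batch being normalized is much larger than one inference step, so checking the batch dimension of `obs_dict` may be worthwhile. Below is a minimal sketch of standard PyTorch memory mitigations around the failing call; `policy` and `obs_dict` are the objects from `load_checkpoint_eval.py`, and running the policy under fp16 autocast is an assumption, not something the repo documents:

```python
import os
# The error message itself suggests this allocator setting; it must take
# effect before CUDA is initialized (or be exported in the shell beforehand).
os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF', 'max_split_size_mb:128')

import torch

policy.eval()          # policy / obs_dict come from load_checkpoint_eval.py
with torch.no_grad():  # no autograd graph at inference; frees activation memory
    # Assumption: the policy tolerates half precision. fp16 roughly halves
    # activation memory at some cost in numerical precision.
    with torch.autocast(device_type='cuda', dtype=torch.float16):
        result = policy.predict_action(obs_dict)

torch.cuda.empty_cache()  # hand cached blocks back between inference calls
```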
#########################################################
The paper used 8 A100s, but I'm poor~ I have tried an RTX 3070 (8 GB), a V100 (25 GB), and an A6000 (51 GB), but all ran out of memory.
Actually, the trained model has only 170 million parameters, and 170 × 10^6 × 4 bytes / 1024^3 ≈ 0.63 GB, so why does it always show OUT OF MEMORY (OOM)? T_T~~~
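The arithmetic checks out once the exponent is fixed, and it shows why the weights are not the problem; a quick sanity check in plain Python (only the 170 M figure above is assumed):

```python
params = 170e6                       # ~170 million parameters
weights_gib = params * 4 / 1024**3   # fp32 stores 4 bytes per parameter
print(f"weights alone: {weights_gib:.2f} GiB")  # ~0.63 GiB
```

What fills the remaining gigabytes in the traceback is activation memory: the ViT-B/16 attention maps, the diffusion U-Net feature maps, and here the temporaries of `x * scale + offset` in the normalizer, all of which scale with the observation batch rather than with the parameter count.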
Can somebody tell me the minimum GPU memory needed for the inference stage? Huge thanks~ ^o^