Prediction stage: minimum GPU memory #55

Open

Description

@cynthia-you

Hi, thanks for your repo.
I've followed your tutorial to create my dataset and converted it into zarr.
The training stage went well, but when I load data for prediction, it OOMs:
```
(umi2) fusion@fusion-1013:~/PycharmProjects/universal_manipulation_interface$ ~/miniforge3/envs/umi2/bin/python load_checkpoint_eval.py -i data/outputs/2024.07.15/10.33.31_train_diffusion_unet_timm_umi/checkpoints/latest.ckpt -o wipe/
model_name: vit_base_patch16_clip_224.openai
dataset_path: /home/fusion/PycharmProjects/universal_manipulation_interface/wipe/dataset.zarr.zip
steps_per_inference: 6
Waiting for camera
match dataset :wipe/videos/20240708140122
match_video_path:wipe/videos/20240708140122/raw_videos.mp4
Loaded initial frame for 1 episodes
{'name': 'train_diffusion_unet_timm',
 '_target_': 'diffusion_policy.workspace.train_diffusion_unet_image_workspace.TrainDiffusionUnetImageWorkspace',
 'task_name': 'umi',
 'shape_meta': {'obs': {'camera0_rgb': {'shape': [3, 224, 224], 'horizon': 2, 'latency_steps': 0, 'down_sample_steps': 3, 'type': 'rgb', 'ignore_by_policy': False}, 'robot0_eef_pos': {'shape': [3], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'rotation_rep': 'rotation_6d', 'ignore_by_policy': False}, 'robot0_gripper_width': {'shape': [1], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle_wrt_start': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}}, 'action': {'shape': [10], 'horizon': 16, 'latency_steps': 0, 'down_sample_steps': 3, 'rotation_rep': 'rotation_6d'}},
 'exp_name': 'default',
 'n_action_steps': 8,
 'policy': {'_target_': 'diffusion_policy.policy.diffusion_unet_timm_policy.DiffusionUnetTimmPolicy',
  'shape_meta': {'obs': {'camera0_rgb': {'shape': [3, 224, 224], 'horizon': 2, 'latency_steps': 0, 'down_sample_steps': 3, 'type': 'rgb', 'ignore_by_policy': False}, 'robot0_eef_pos': {'shape': [3], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'rotation_rep': 'rotation_6d', 'ignore_by_policy': False}, 'robot0_gripper_width': {'shape': [1], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle_wrt_start': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}}, 'action': {'shape': [10], 'horizon': 16, 'latency_steps': 0, 'down_sample_steps': 3, 'rotation_rep': 'rotation_6d'}},
  'noise_scheduler': {'_target_': 'diffusers.DDIMScheduler', 'num_train_timesteps': 50, 'beta_start': 0.0001, 'beta_end': 0.02, 'beta_schedule': 'squaredcos_cap_v2', 'clip_sample': True, 'set_alpha_to_one': True, 'steps_offset': 0, 'prediction_type': 'epsilon'},
  'obs_encoder': {'_target_': 'diffusion_policy.model.vision.timm_obs_encoder.TimmObsEncoder', 'shape_meta': {'obs': {'camera0_rgb': {'shape': [3, 224, 224], 'horizon': 2, 'latency_steps': 0, 'down_sample_steps': 3, 'type': 'rgb', 'ignore_by_policy': False}, 'robot0_eef_pos': {'shape': [3], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'rotation_rep': 'rotation_6d', 'ignore_by_policy': False}, 'robot0_gripper_width': {'shape': [1], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle_wrt_start': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}}, 'action': {'shape': [10], 'horizon': 16, 'latency_steps': 0, 'down_sample_steps': 3, 'rotation_rep': 'rotation_6d'}}, 'model_name': 'vit_base_patch16_clip_224.openai', 'pretrained': True, 'frozen': False, 'global_pool': '', 'feature_aggregation': 'attention_pool_2d', 'position_encording': 'sinusoidal', 'downsample_ratio': 32, 'transforms': [{'type': 'RandomCrop', 'ratio': 0.95}, {'_target_': 'torchvision.transforms.ColorJitter', 'brightness': 0.3, 'contrast': 0.4, 'saturation': 0.5, 'hue': 0.08}], 'use_group_norm': True, 'share_rgb_model': False, 'imagenet_norm': True},
  'num_inference_steps': 16, 'obs_as_global_cond': True, 'diffusion_step_embed_dim': 128, 'down_dims': [256, 512, 1024], 'kernel_size': 5, 'n_groups': 8, 'cond_predict_scale': True, 'input_pertub': 0.1, 'train_diffusion_n_samples': 1},
 'ema': {'_target_': 'diffusion_policy.model.diffusion.ema_model.EMAModel', 'update_after_step': 0, 'inv_gamma': 1.0, 'power': 0.75, 'min_value': 0.0, 'max_value': 0.9999},
 'dataloader': {'batch_size': 8, 'num_workers': 8, 'shuffle': True, 'pin_memory': True, 'persistent_workers': True},
 'val_dataloader': {'batch_size': 8, 'num_workers': 8, 'shuffle': False, 'pin_memory': True, 'persistent_workers': True},
 'optimizer': {'_target_': 'torch.optim.AdamW', 'lr': 0.0003, 'betas': [0.95, 0.999], 'eps': 1e-08, 'weight_decay': 1e-06},
 'training': {'device': 'cuda:0', 'seed': 42, 'debug': False, 'resume': False, 'lr_scheduler': 'cosine', 'lr_warmup_steps': 2000, 'num_epochs': 120, 'gradient_accumulate_every': 1, 'use_ema': True, 'freeze_encoder': False, 'rollout_every': 10, 'checkpoint_every': 10, 'val_every': 1, 'sample_every': 5, 'max_train_steps': None, 'max_val_steps': None, 'tqdm_interval_sec': 1.0},
 'logging': {'project': 'umi', 'resume': False, 'mode': 'online', 'name': '2024.07.10-14.08.26_train_diffusion_unet_timm_umi', 'tags': ['train_diffusion_unet_timm', 'umi', 'default'], 'id': None, 'group': None},
 'checkpoint': {'topk': {'monitor_key': 'train_loss', 'mode': 'min', 'k': 20, 'format_str': 'epoch={epoch:04d}-train_loss={train_loss:.3f}.ckpt'}, 'save_last_ckpt': True, 'save_last_snapshot': False},
 'multi_run': {'run_dir': 'data/outputs/2024.07.10/14.08.26_train_diffusion_unet_timm_umi', 'wandb_name_base': '2024.07.10-14.08.26_train_diffusion_unet_timm_umi'},
 'task': {'name': 'umi', 'camera_obs_latency': 0.125, 'robot_obs_latency': 0.0001, 'gripper_obs_latency': 0.02, 'dataset_frequeny': 0, 'obs_down_sample_steps': 3, 'low_dim_obs_horizon': 2, 'img_obs_horizon': 2, 'action_horizon': 16, 'ignore_proprioception': False,
  'shape_meta': {'obs': {'camera0_rgb': {'shape': [3, 224, 224], 'horizon': 2, 'latency_steps': 0, 'down_sample_steps': 3, 'type': 'rgb', 'ignore_by_policy': False}, 'robot0_eef_pos': {'shape': [3], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'rotation_rep': 'rotation_6d', 'ignore_by_policy': False}, 'robot0_gripper_width': {'shape': [1], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle_wrt_start': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}}, 'action': {'shape': [10], 'horizon': 16, 'latency_steps': 0, 'down_sample_steps': 3, 'rotation_rep': 'rotation_6d'}},
  'task_name': 'umi', 'dataset_path': '/home/fusion/PycharmProjects/universal_manipulation_interface/wipe/dataset.zarr.zip', 'pose_repr': {'obs_pose_repr': 'relative', 'action_pose_repr': 'relative'}, 'env_runner': {'_target_': 'diffusion_policy.env_runner.real_pusht_image_runner.RealPushTImageRunner'},
  'dataset': {'_target_': 'diffusion_policy.dataset.umi_dataset.UmiDataset', 'shape_meta': {'obs': {'camera0_rgb': {'shape': [3, 224, 224], 'horizon': 2, 'latency_steps': 0, 'down_sample_steps': 3, 'type': 'rgb', 'ignore_by_policy': False}, 'robot0_eef_pos': {'shape': [3], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'rotation_rep': 'rotation_6d', 'ignore_by_policy': False}, 'robot0_gripper_width': {'shape': [1], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}, 'robot0_eef_rot_axis_angle_wrt_start': {'raw_shape': [3], 'shape': [6], 'horizon': 2, 'latency_steps': 0.0, 'down_sample_steps': 3, 'type': 'low_dim', 'ignore_by_policy': False}}, 'action': {'shape': [10], 'horizon': 16, 'latency_steps': 0, 'down_sample_steps': 3, 'rotation_rep': 'rotation_6d'}}, 'dataset_path': '/home/fusion/PycharmProjects/universal_manipulation_interface/wipe/dataset.zarr.zip', 'cache_dir': None, 'pose_repr': {'obs_pose_repr': 'relative', 'action_pose_repr': 'relative'}, 'action_padding': False, 'temporally_independent_normalization': False, 'repeat_frame_prob': 0.0, 'max_duration': None, 'seed': 42, 'val_ratio': 0.05}}}
rgb keys: ['camera0_rgb']
low_dim_keys keys: ['robot0_eef_pos', 'robot0_eef_rot_axis_angle', 'robot0_eef_rot_axis_angle_wrt_start', 'robot0_gripper_width']
vit will use the CLS token. feature_aggregation (attention_pool_2d) is ignored!
==> reduce pretrained obs_encorder's lr
obs_encorder params: 151
Warming up policy inference
Traceback (most recent call last):
  File "/home/fusion/PycharmProjects/universal_manipulation_interface/load_checkpoint_eval.py", line 322, in <module>
    main()
  File "/home/fusion/miniforge3/envs/umi2/lib/python3.9/site-packages/click/core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
  File "/home/fusion/miniforge3/envs/umi2/lib/python3.9/site-packages/click/core.py", line 1078, in main
    rv = self.invoke(ctx)
  File "/home/fusion/miniforge3/envs/umi2/lib/python3.9/site-packages/click/core.py", line 1434, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/home/fusion/miniforge3/envs/umi2/lib/python3.9/site-packages/click/core.py", line 783, in invoke
    return __callback(*args, **kwargs)
  File "/home/fusion/PycharmProjects/universal_manipulation_interface/load_checkpoint_eval.py", line 266, in main
    result = policy.predict_action(obs_dict)
  File "/home/fusion/PycharmProjects/universal_manipulation_interface/diffusion_policy/policy/diffusion_unet_timm_policy.py", line 129, in predict_action
    nobs = self.normalizer.normalize(obs_dict)
  File "/home/fusion/PycharmProjects/universal_manipulation_interface/diffusion_policy/model/common/normalizer.py", line 68, in normalize
    return self._normalize_impl(x, forward=True)
  File "/home/fusion/PycharmProjects/universal_manipulation_interface/diffusion_policy/model/common/normalizer.py", line 59, in _normalize_impl
    result[key] = _normalize(value, params, forward=forward)
  File "/home/fusion/PycharmProjects/universal_manipulation_interface/diffusion_policy/model/common/normalizer.py", line 272, in _normalize
    x = x * scale + offset
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.95 GiB (GPU 0; 7.78 GiB total capacity; 6.50 GiB already allocated; 917.81 MiB free; 6.53 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
```
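
For what it's worth, the traceback shows the OOM fires inside `normalizer.normalize`, i.e. during the warm-up `predict_action` call, before the diffusion U-Net even runs. One thing I'm double-checking on my side is whether that call is wrapped in `torch.inference_mode()` / `torch.no_grad()`; without it, autograd keeps every intermediate activation alive. A minimal sketch of the guard (hypothetical; my actual `load_checkpoint_eval.py` is longer, and `policy` / `obs_dict` stand for the loaded model and observation batch):

```python
import torch

# Hypothetical guard around the warm-up inference in load_checkpoint_eval.py.
# Without no_grad/inference_mode, autograd retains the intermediate
# activations of the ViT-B/16 encoder and the U-Net for a backward pass
# that never happens, which can blow well past 8 GiB.
policy.eval()                 # also disables dropout etc.
with torch.inference_mode():  # no autograd bookkeeping at all
    result = policy.predict_action(obs_dict)
```

The error message also suggests setting `PYTORCH_CUDA_ALLOC_CONF` (e.g. `max_split_size_mb:128`) against fragmentation, but with 6.50 GiB already allocated on a 7.78 GiB card, fragmentation alone doesn't look like the whole story.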
---
The paper used 8 A100s, but I'm poor~ I have tried an RTX 3070 (8 GB), a V100 (25 GB), and an A6000 (51 GB), and all of them ran out of memory.
Actually, the trained model has about 170 million parameters, which is only 170 × 10^6 × 4 bytes / 1024^3 ≈ 0.63 GB of weights, so why does it always show OUT OF MEMORY (OOM)? T_T~~~
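
Here's the quick sanity check behind that number (a sketch; `policy` stands for the model loaded from `latest.ckpt`):

```python
# Rough weight-memory estimate: fp32 parameters only, ignoring
# activations, the CUDA context, and allocator overhead.
n_params = sum(p.numel() for p in policy.parameters())
print(f"{n_params / 1e6:.0f}M params -> {n_params * 4 / 1024**3:.2f} GiB in fp32")
```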
Can somebody tell me the minimum GPU memory needed for the inference stage? Huge thanks~ ^o^
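
If it helps anyone reproduce, this is roughly how I'd measure the actual peak for a single inference (a sketch, again assuming a loaded `policy` and one `obs_dict` already on the GPU):

```python
import torch

# Measure the true peak allocation of one inference pass; this number,
# not the raw weight size, is what the GPU actually has to fit.
torch.cuda.reset_peak_memory_stats()
with torch.inference_mode():
    policy.predict_action(obs_dict)
print(f"peak allocated: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GiB")
```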
