From 4df44876e9254b1a64ad0a35705a26f77179c8c3 Mon Sep 17 00:00:00 2001 From: Tony Fang Date: Fri, 10 Sep 2021 17:09:29 +0800 Subject: [PATCH] update smooth loss --- configs/320x240/train_tg_val_tg.yaml | 2 + configs/320x240/train_tg_val_tg_add.yaml | 61 +++++ configs/320x240/train_tg_val_tg_combine.yaml | 63 +++++ configs/inference.yaml | 2 +- datasets/transcg.py | 12 +- inference.py | 12 +- sample_inference.py | 1 + test.py | 25 +- train.py | 46 ++-- utils/builder.py | 28 ++- utils/constants.py | 6 +- utils/criterion.py | 239 +++++++++---------- utils/data_preparation.py | 104 ++++++-- utils/functions.py | 135 ++++++++++- utils/metrics.py | 13 +- 15 files changed, 545 insertions(+), 204 deletions(-) create mode 100644 configs/320x240/train_tg_val_tg_add.yaml create mode 100644 configs/320x240/train_tg_val_tg_combine.yaml diff --git a/configs/320x240/train_tg_val_tg.yaml b/configs/320x240/train_tg_val_tg.yaml index 4b27aa7..e038496 100644 --- a/configs/320x240/train_tg_val_tg.yaml +++ b/configs/320x240/train_tg_val_tg.yaml @@ -27,6 +27,7 @@ "rgb_augmentation_probability": 0.8 "depth_min": 0.0 "depth_max": 10.0 + "depth_norm": 10.0 "test": "type": "transcg" "data_dir": "data" @@ -34,6 +35,7 @@ "use_augmentation": False "depth_min": 0.0 "depth_max": 10.0 + "depth_norm": 10.0 "dataloader": "num_workers": 48 diff --git a/configs/320x240/train_tg_val_tg_add.yaml b/configs/320x240/train_tg_val_tg_add.yaml new file mode 100644 index 0000000..78641be --- /dev/null +++ b/configs/320x240/train_tg_val_tg_add.yaml @@ -0,0 +1,61 @@ +# script id: 1 +"model": + "type": "DFNet" + "params": + "in_channels": 4 + "hidden_channels": 64 + "L": 5 + "k": 12 + +"optimizer": + "type": "AdamW" + "params": + "lr": 0.001 + +"lr_scheduler": + "type": "MultiStepLR" + "params": + "milestones": [5, 15, 25, 35] + "gamma": 0.2 + +"dataset": + "train": + "type": "transcg" + "data_dir": "data" + "image_size": !!python/tuple [320, 240] + "use_augmentation": True + "rgb_augmentation_probability": 0.8 + "depth_min": 0.0 + "depth_max": 10.0 + "depth_norm": 10.0 + "test": + "type": "transcg" + "data_dir": "data" + "image_size": !!python/tuple [320, 240] + "use_augmentation": False + "depth_min": 0.0 + "depth_max": 10.0 + "depth_norm": 10.0 + +"dataloader": + "num_workers": 48 + "shuffle": True + "drop_last": True + +"trainer": + "batch_size": 32 + "test_batch_size": 1 + "multigpu": True + "max_epoch": 40 + "criterion": + "type": "custom_masked_mse_loss" + "epsilon": 0.00000001 + +"metrics": + "types": ["MSE", "MaskedMSE", "RMSE", "MaskedRMSE", "REL", "MaskedREL", "MAE", "MaskedMAE", "Threshold@1.05", "MaskedThreshold@1.05", "Threshold@1.10", "MaskedThreshold@1.10", "Threshold@1.25", "MaskedThreshold@1.25"] + "epsilon": 0.00000001 + "depth_scale": 10.0 + +"stats": + "stats_dir": "stats" + "stats_exper": "train-tg-val-tg-add" diff --git a/configs/320x240/train_tg_val_tg_combine.yaml b/configs/320x240/train_tg_val_tg_combine.yaml new file mode 100644 index 0000000..0416455 --- /dev/null +++ b/configs/320x240/train_tg_val_tg_combine.yaml @@ -0,0 +1,63 @@ +# script id: 1 +"model": + "type": "DFNet" + "params": + "in_channels": 4 + "hidden_channels": 64 + "L": 5 + "k": 12 + +"optimizer": + "type": "AdamW" + "params": + "lr": 0.001 + +"lr_scheduler": + "type": "MultiStepLR" + "params": + "milestones": [5, 15, 25, 35] + "gamma": 0.2 + +"dataset": + "train": + "type": "transcg" + "data_dir": "data" + "image_size": !!python/tuple [320, 240] + "use_augmentation": True + "rgb_augmentation_probability": 0.8 + "depth_min": 0.3 + "depth_max": 
1.5 + "depth_norm": 1.0 + "test": + "type": "transcg" + "data_dir": "data" + "image_size": !!python/tuple [320, 240] + "use_augmentation": False + "depth_min": 0.3 + "depth_max": 1.5 + "depth_norm": 1.0 + +"dataloader": + "num_workers": 48 + "shuffle": True + "drop_last": True + +"trainer": + "batch_size": 32 + "test_batch_size": 1 + "multigpu": True + "max_epoch": 40 + "criterion": + "type": "custom_masked_mse_loss" + "epsilon": 0.00000001 + "combined_smooth": True + "combined_beta": 0.005 + +"metrics": + "types": ["MSE", "MaskedMSE", "RMSE", "MaskedRMSE", "REL", "MaskedREL", "MAE", "MaskedMAE", "Threshold@1.05", "MaskedThreshold@1.05", "Threshold@1.10", "MaskedThreshold@1.10", "Threshold@1.25", "MaskedThreshold@1.25"] + "epsilon": 0.00000001 + "depth_scale": 1.0 + +"stats": + "stats_dir": "stats" + "stats_exper": "train-tg-val-tg-comb" diff --git a/configs/inference.yaml b/configs/inference.yaml index 235e730..8755cb9 100644 --- a/configs/inference.yaml +++ b/configs/inference.yaml @@ -7,7 +7,7 @@ "k": 12 "inference": - "checkpoint_path": "stats/checkpoint.tar" + "checkpoint_path": "stats/train-tg-val-tg-comb/checkpoint.tar" "image_size": !!python/tuple [320, 240] "cuda_id": 0 "depth_min": 0.0 diff --git a/datasets/transcg.py b/datasets/transcg.py index e17087b..b5feef1 100644 --- a/datasets/transcg.py +++ b/datasets/transcg.py @@ -33,9 +33,6 @@ def __init__(self, data_dir, split = 'train', **kwargs): raise AttributeError('Invalid split option.') self.data_dir = data_dir self.split = split - self.high_resolution = kwargs.get('high_resolution', False) - if self.high_resolution and split == 'train': - raise AttributeError('Does not support returning high resolution images during training. If you want to train on high resolution samples, please set image_size arguments in high resolution.') with open(os.path.join(self.data_dir, 'metadata.json'), 'r') as fp: self.dataset_metadata = json.load(fp) self.scene_num = self.dataset_metadata['total_scenes'] @@ -64,11 +61,14 @@ def __init__(self, data_dir, split = 'train', **kwargs): ]) # Integrity double-check assert len(self.sample_info) == self.total_samples, "Error in total samples, expect {} samples, found {} samples.".format(self.total_samples, len(self.sample_info)) + # Other parameters + self.cam_intrinsics = [None, np.load(os.path.join(self.data_dir, 'camera_intrinsics', 'camIntrinsics-D435.npy')), np.load(os.path.join(self.data_dir, 'camera_intrinsics', 'camIntrinsics-L515.npy'))] self.use_aug = kwargs.get('use_augmentation', True) self.rgb_aug_prob = kwargs.get('rgb_augmentation_probability', 0.8) self.image_size = kwargs.get('image_size', (1280, 720)) - self.depth_min = kwargs.get('depth_min', 0.0) - self.depth_max = kwargs.get('depth_max', 10.0) + self.depth_min = kwargs.get('depth_min', 0.3) + self.depth_max = kwargs.get('depth_max', 1.5) + self.depth_norm = kwargs.get('depth_norm', 1.0) def __getitem__(self, id): img_path, camera_type, scene_type = self.sample_info[id] @@ -76,7 +76,7 @@ def __getitem__(self, id): depth = np.array(Image.open(os.path.join(img_path, 'depth{}.png'.format(camera_type))), dtype = np.float32) depth_gt = np.array(Image.open(os.path.join(img_path, 'depth{}-gt.png'.format(camera_type))), dtype = np.float32) depth_gt_mask = np.array(Image.open(os.path.join(img_path, 'depth{}-gt-mask.png'.format(camera_type))), dtype = np.uint8) - return process_data(rgb, depth, depth_gt, depth_gt_mask, scene_type, camera_type, split = self.split, image_size = self.image_size, depth_min = self.depth_min, depth_max = 
self.depth_max, use_aug = self.use_aug, rgb_aug_prob = self.rgb_aug_prob, retain_original = self.high_resolution) + return process_data(rgb, depth, depth_gt, depth_gt_mask, self.cam_intrinsics[camera_type], scene_type, camera_type, split = self.split, image_size = self.image_size, depth_min = self.depth_min, depth_max = self.depth_max, depth_norm = self.depth_norm, use_aug = self.use_aug, rgb_aug_prob = self.rgb_aug_prob) def __len__(self): return self.total_samples diff --git a/inference.py b/inference.py index 8f05c97..3b39223 100644 --- a/inference.py +++ b/inference.py @@ -66,6 +66,7 @@ def __init__(self, cfg_path = os.path.join('configs', 'inference.yaml'), with_in self.image_size = self.builder.get_inference_image_size() self.depth_min, self.depth_max = self.builder.get_inference_depth_min_max() + self.depth_norm = self.builder.get_inference_depth_norm() def inference(self, rgb, depth, target_size = (1280, 720)): """ @@ -86,7 +87,10 @@ def inference(self, rgb, depth, target_size = (1280, 720)): rgb = cv2.resize(rgb, self.image_size, interpolation = cv2.INTER_NEAREST) depth = cv2.resize(depth, self.image_size, interpolation = cv2.INTER_NEAREST) - depth = (depth - self.depth_min) / (self.depth_max - self.depth_min) + depth = np.where(depth < self.depth_min, 0, depth) + depth = np.where(depth > self.depth_max, 0, depth) + depth[np.isnan(depth)] = 0 + depth = depth / self.depth_norm rgb = (rgb / 255.0).transpose(2, 0, 1) rgb = torch.FloatTensor(rgb).to(self.device).unsqueeze(0) depth = torch.FloatTensor(depth).to(self.device).unsqueeze(0) @@ -97,7 +101,7 @@ def inference(self, rgb, depth, target_size = (1280, 720)): if self.with_info: self.logger.info("Inference finished, time: {:.4f}s.".format(time_end - time_start)) depth_res = depth_res.squeeze(0).cpu().detach().numpy() - depth_res = depth_res * (self.depth_max - self.depth_min) + self.depth_min - depth_res = cv2.resize(depth_res, target_size, interpolation = cv2.INTER_NEAREST) + depth_res = depth_res * self.depth_norm + depth_res = cv2.resize(depth_res, target_size, interpolation = cv2.INTER_LANCZOS4) return depth_res - + \ No newline at end of file diff --git a/sample_inference.py b/sample_inference.py index 982cbc9..44ff694 100644 --- a/sample_inference.py +++ b/sample_inference.py @@ -55,6 +55,7 @@ def draw_point_cloud(color, depth, camera_intrinsics, use_mask = False, use_inpa cam_intrinsics = np.load('data/camera_intrinsics/camIntrinsics-D435.npy') res = np.clip(res, 0.1, 1.5) +depth = np.clip(depth, 0.1, 1.5) cloud = draw_point_cloud(rgb, res, cam_intrinsics, scale = 1.0) diff --git a/test.py b/test.py index 3415eb3..a5dd6b9 100644 --- a/test.py +++ b/test.py @@ -14,6 +14,7 @@ from tqdm import tqdm from utils.logger import ColoredLogger from utils.builder import ConfigBuilder +from utils.functions import to_device from time import perf_counter @@ -68,22 +69,22 @@ def test(): running_time = [] losses = [] with tqdm(test_dataloader) as pbar: - for data in pbar: - rgb, depth, depth_gt, depth_gt_mask, scene_mask = data - rgb = rgb.to(device) - depth = depth.to(device) - depth_gt = depth_gt.to(device) - depth_gt_mask = depth_gt_mask.to(device) - scene_mask = scene_mask.to(device) + for data_dict in pbar: + data_dict = to_device(data_dict, device) with torch.no_grad(): time_start = perf_counter() - res = model(rgb, depth) + res = model(data_dict['rgb'], data_dict['depth']) time_end = perf_counter() - loss = criterion(res, depth_gt, depth_gt_mask, scene_mask) - _ = metrics.evaluate_batch(res, depth_gt, depth_gt_mask, scene_mask, 
record = True) + data_dict['pred'] = res + loss_dict = criterion(data_dict) + loss = loss_dict['loss'] + _ = metrics.evaluate_batch(data_dict, record = True) duration = time_end - time_start - pbar.set_description('Loss: {:.8f}, model time: {:.4f}s'.format(loss.mean().item(), duration)) - losses.append(loss.mean().item()) + if 'smooth' in loss_dict.keys(): + pbar.set_description('Loss: {:.8f}, smooth loss: {:.8f}'.format(loss.item(), loss_dict['smooth'].item())) + else: + pbar.set_description('Loss: {:.8f}'.format(loss.item())) + losses.append(loss.item()) running_time.append(duration) mean_loss = np.stack(losses).mean() avg_running_time = np.stack(running_time).mean() diff --git a/train.py b/train.py index 2a01fbd..d4f39da 100644 --- a/train.py +++ b/train.py @@ -15,7 +15,7 @@ from utils.logger import ColoredLogger from utils.builder import ConfigBuilder from utils.constants import LOSS_INF -from utils.functions import display_results +from utils.functions import display_results, to_device from time import perf_counter @@ -87,19 +87,19 @@ def train_one_epoch(epoch): model.train() losses = [] with tqdm(train_dataloader) as pbar: - for data in pbar: + for data_dict in pbar: optimizer.zero_grad() - rgb, depth, depth_gt, depth_gt_mask, scene_mask = data - rgb = rgb.to(device) - depth = depth.to(device) - depth_gt = depth_gt.to(device) - depth_gt_mask = depth_gt_mask.to(device) - scene_mask = scene_mask.to(device) - res = model(rgb, depth) - loss = criterion(res, depth_gt, depth_gt_mask, scene_mask) + data_dict = to_device(data_dict, device) + res = model(data_dict['rgb'], data_dict['depth']) + data_dict['pred'] = res + loss_dict = criterion(data_dict) + loss = loss_dict['loss'] loss.backward() optimizer.step() - pbar.set_description('Epoch {}, loss: {:.8f}'.format(epoch + 1, loss.mean().item())) + if 'smooth' in loss_dict.keys(): + pbar.set_description('Epoch {}, loss: {:.8f}, smooth loss: {:.8f}'.format(epoch + 1, loss.item(), loss_dict['smooth'].item())) + else: + pbar.set_description('Epoch {}, loss: {:.8f}'.format(epoch + 1, loss.item())) losses.append(loss.mean().item()) mean_loss = np.stack(losses).mean() logger.info('Finish training process in epoch {}, mean training loss: {:.8f}'.format(epoch + 1, mean_loss)) @@ -112,22 +112,22 @@ def test_one_epoch(epoch): running_time = [] losses = [] with tqdm(test_dataloader) as pbar: - for data in pbar: - rgb, depth, depth_gt, depth_gt_mask, scene_mask = data - rgb = rgb.to(device) - depth = depth.to(device) - depth_gt = depth_gt.to(device) - depth_gt_mask = depth_gt_mask.to(device) - scene_mask = scene_mask.to(device) + for data_dict in pbar: + data_dict = to_device(data_dict, device) with torch.no_grad(): time_start = perf_counter() - res = model(rgb, depth) + res = model(data_dict['rgb'], data_dict['depth']) time_end = perf_counter() - loss = criterion(res, depth_gt, depth_gt_mask, scene_mask) - _ = metrics.evaluate_batch(res, depth_gt, depth_gt_mask, scene_mask, record = True) + data_dict['pred'] = res + loss_dict = criterion(data_dict) + loss = loss_dict['loss'] + _ = metrics.evaluate_batch(data_dict, record = True) duration = time_end - time_start - pbar.set_description('Epoch {}, loss: {:.8f}, model time: {:.4f}s'.format(epoch + 1, loss.mean().item(), duration)) - losses.append(loss.mean().item()) + if 'smooth' in loss_dict.keys(): + pbar.set_description('Epoch {}, loss: {:.8f}, smooth loss: {:.8f}'.format(epoch + 1, loss.item(), loss_dict['smooth'].item())) + else: + pbar.set_description('Epoch {}, loss: {:.8f}'.format(epoch + 1, 
loss.item()))
+                losses.append(loss.item())
             running_time.append(duration)
     mean_loss = np.stack(losses).mean()
     avg_running_time = np.stack(running_time).mean()
diff --git a/utils/builder.py b/utils/builder.py
index fdcab05..fa2b90c 100644
--- a/utils/builder.py
+++ b/utils/builder.py
@@ -392,9 +392,8 @@ def get_metrics(self, metrics_params = None):
         if metrics_params is None:
             metrics_params = self.metrics_params
         metrics_list = metrics_params.get('types', ['MSE', 'MaskedMSE', 'RMSE', 'MaskedRMSE', 'REL', 'MaskedREL', 'MAE', 'MaskedMAE', 'Threshold@1.05', 'MaskedThreshold@1.05', 'Threshold@1.10', 'MaskedThreshold@1.10', 'Threshold@1.25', 'MaskedThreshold@1.25'])
-        metrics_epsilon = metrics_params.get('epsilon', 1e-8)
         from utils.metrics import MetricsRecorder
-        metrics = MetricsRecorder(metrics_list = metrics_list, epsilon = metrics_epsilon)
+        metrics = MetricsRecorder(metrics_list = metrics_list, **metrics_params)
         return metrics

     def get_inference_image_size(self, inference_params = None):
@@ -463,10 +462,29 @@ def get_inference_depth_min_max(self, inference_params = None):
        Returns
        -------

-        Tuple of (int, int) the min and max depth.
+        Tuple of (float, float) the min and max depth.
        """
        if inference_params is None:
            inference_params = self.inference_params
-        depth_min = inference_params.get('depth_min', 0.1)
+        depth_min = inference_params.get('depth_min', 0.3)
        depth_max = inference_params.get('depth_max', 1.5)
-        return depth_min, depth_max
\ No newline at end of file
+        return depth_min, depth_max
+
+    def get_inference_depth_norm(self, inference_params = None):
+        """
+        Get the depth normalization coefficient from inference configuration.
+
+        Parameters
+        ----------
+
+        inference_params: dict, optional, default: None. If inference_params is provided, then use the parameters specified in the inference_params to get the depth normalization coefficient. Otherwise, the inference parameters in the self.params will be used to get the depth normalization coefficient.
+
+        Returns
+        -------
+
+        float, the depth normalization coefficient.
+        """
+        if inference_params is None:
+            inference_params = self.inference_params
+        depth_norm = inference_params.get('depth_norm', 1.0)
+        return depth_norm
\ No newline at end of file
diff --git a/utils/constants.py b/utils/constants.py
index 8693ded..9a2053e 100644
--- a/utils/constants.py
+++ b/utils/constants.py
@@ -4,4 +4,8 @@
 Authors: Hongjie Fang.
 """
-LOSS_INF = 1e18
\ No newline at end of file
+import numpy as np
+
+
+LOSS_INF = 1e18
+DILATION_KERNEL = np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]]).astype(np.uint8)
diff --git a/utils/criterion.py b/utils/criterion.py
index 3c43bff..094e241 100644
--- a/utils/criterion.py
+++ b/utils/criterion.py
@@ -5,21 +5,28 @@
 """
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 import numpy as np
+from utils.functions import get_surface_normal_from_depth


 class Criterion(nn.Module):
     """
     Various type of criterions.
""" - def __init__(self, type, epsilon = 1e-8, huber_k = 0.01, **kwargs): + def __init__(self, type, combined_smooth = False, **kwargs): super(Criterion, self).__init__() - self.epsilon = epsilon + self.epsilon = kwargs.get('epsilon', 1e-8) + self.type = str.lower(type) + if 'huber' in self.type: + self.huber_k = kwargs.get('huber_k', 0.1) + self.combined_smooth = combined_smooth + if combined_smooth: + self.combined_beta = kwargs.get('combined_beta', 0.5) self.l2_loss = self.mse_loss self.masked_l2_loss = self.masked_mse_loss self.custom_masked_l2_loss = self.custom_masked_mse_loss - self.huber_k = huber_k - self.forward = getattr(self, type) + self.main_loss = getattr(self, type) self._mse = self._l2 def _l1(self, pred, gt): @@ -41,239 +48,223 @@ def _huber(self, pred, gt): delta = torch.abs(pred - gt) return torch.where(delta <= self.huber_k, delta ** 2 / 2, self.huber_k * delta - self.huber_k ** 2 / 2) - def mse_loss(self, pred, gt, *args, **kwargs): + def mse_loss(self, data_dict, *args, **kwargs): """ MSE loss. Parameters ---------- - pred: tensor of shape NHW, the prediction; - - gt: tensor of shape NHW, the ground-truth. + data_dict: the data dict for computing L2 loss. Returns ------- The MSE loss. """ - mask = torch.where(gt < self.epsilon, False, True) - delta = self._l2(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['zero_mask'] + return self._l2(pred, gt)[mask].mean() - def masked_mse_loss(self, pred, gt, gt_mask, *args, **kwargs): + def masked_mse_loss(self, data_dict, *args, **kwargs): """ Masked MSE loss. Parameters ---------- - pred: tensor of shape NHW, the prediction; - - gt: tensor of shape NHW, the ground-truth; - - gt_mask: tensor of shape NHW, the ground-truth mask. + data_dict: the data dict for computing L2 loss. Returns ------- The masked MSE loss. """ - zero_mask = torch.where(gt < self.epsilon, False, True) - mask = gt_mask & zero_mask - delta = self._l2(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['initial_loss_mask'] + return self._l2(pred, gt)[mask].mean() - def custom_masked_mse_loss(self, pred, gt, gt_mask, use_gt_mask, *args, **kwargs): + def custom_masked_mse_loss(self, data_dict, *args, **kwargs): """ Custom masked MSE loss. Parameters ---------- - pred: tensor of shape NHW, the prediction; - - gt: tensor of shape NHW, the ground-truth; - - gt_mask: tensor of shape NHW, the ground-truth mask; - - use_gt_mask: tensor of shape N, whether to use the ground-truth mask. + data_dict: the data dict for computing L2 loss. Returns ------- The custom masked MSE loss. 
""" - zero_mask = torch.where(gt < self.epsilon, False, True) - _, use_gt_mask = torch.broadcast_tensors(gt_mask.transpose(0, 2), use_gt_mask.view(-1)) - gt_mask = ~ (~ gt_mask & use_gt_mask.transpose(0, 2)) - mask = gt_mask & zero_mask - delta = self._l2(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['loss_mask'] + return self._l2(pred, gt)[mask].mean() - def l1_loss(self, pred, gt, *args, **kwargs): + def l1_loss(self, data_dict, *args, **kwargs): """ L1 loss. Parameters ---------- - pred: tensor of shape NHW, the prediction; - - gt: tensor of shape NHW, the ground-truth. + data_dict: the data dict for computing L1 loss. Returns ------- The L1 loss. """ - mask = torch.where(gt < self.epsilon, False, True) - delta = self._l1(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['zero_mask'] + return self._l1(pred, gt)[mask].mean() - def masked_l1_loss(self, pred, gt, gt_mask, *args, **kwargs): + def masked_l1_loss(self, data_dict, *args, **kwargs): """ Masked L1 loss. Parameters ---------- - pred: tensor of shape NHW, the prediction; - - gt: tensor of shape NHW, the ground-truth; - - gt_mask: tensor of shape NHW, the ground-truth mask. + data_dict: the data dict for computing L1 loss. Returns ------- The masked L1 loss. """ - zero_mask = torch.where(gt < self.epsilon, False, True) - mask = gt_mask & zero_mask - delta = self._l1(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['initial_loss_mask'] + return self._l1(pred, gt)[mask].mean() - def custom_masked_l1_loss(self, pred, gt, gt_mask, use_gt_mask, *args, **kwargs): + def custom_masked_l1_loss(self, data_dict, *args, **kwargs): """ Custom masked L1 loss. Parameters ---------- - - pred: tensor of shape NHW, the prediction; - - gt: tensor of shape NHW, the ground-truth; - - gt_mask: tensor of shape NHW, the ground-truth mask; - - use_gt_mask: tensor of shape N, whether to use the ground-truth mask. + + data_dict: the data dict for computing L1 loss. Returns ------- The custom masked L1 loss. """ - zero_mask = torch.where(gt < self.epsilon, False, True) - _, use_gt_mask = torch.broadcast_tensors(gt_mask.transpose(0, 2), use_gt_mask.view(-1)) - gt_mask = ~ (~ gt_mask & use_gt_mask.transpose(0, 2)) - mask = gt_mask & zero_mask - delta = self._l1(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['loss_mask'] + return self._l1(pred, gt)[mask].mean() - def huber_loss(self, pred, gt, *args, **kwargs): + def huber_loss(self, data_dict, *args, **kwargs): """ Huber loss. 
Parameters ---------- - pred: tensor of shape NHW, the prediction; - - gt: tensor of shape NHW, the ground-truth. + data_dict: the data dict for computing huber loss. Returns ------- The huber loss. """ - mask = torch.where(gt < self.epsilon, False, True) - delta = self._huber(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['zero_mask'] + return self._huber(pred, gt)[mask].mean() - def masked_huber_loss(self, pred, gt, gt_mask, *args, **kwargs): + def masked_huber_loss(self, data_dict, *args, **kwargs): """ Masked Huber loss. Parameters ---------- - pred: tensor of shape NHW, the prediction; - - gt: tensor of shape NHW, the ground-truth; - - gt_mask: tensor of shape NHW, the ground-truth mask. + data_dict: the data dict for computing huber loss. Returns ------- The masked huber loss. """ - zero_mask = torch.where(gt < self.epsilon, False, True) - mask = gt_mask & zero_mask - delta = self._huber(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['initial_loss_mask'] + return self._huber(pred, gt)[mask].mean() - def custom_masked_huber_loss(self, pred, gt, gt_mask, use_gt_mask, *args, **kwargs): + def custom_masked_huber_loss(self, data_dict, *args, **kwargs): """ Custom masked huber loss. Parameters ---------- - pred: tensor of shape NHW, the prediction; + data_dict: the data dict for computing huber loss. + + Returns + ------- + + The custom masked huber loss. + """ + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['loss_mask'] + return self._huber(pred, gt)[mask].mean() + + def smooth_loss(self, data_dict, *args, **kwargs): + """ + Smooth loss: surface normal loss. + + Parameters + ---------- + + data_dict: the data dict for computing smooth loss. + + Returns + ------- - gt: tensor of shape NHW, the ground-truth; + The smooth loss. + """ + # Fetch information from data dict + pred = data_dict['pred'] + fx, fy, cx, cy = data_dict['fx'], data_dict['fy'], data_dict['cx'], data_dict['cy'] + depth_gt_sn = data_dict['depth_gt_sn'] + _, original_h, original_w = data_dict['depth_original'].shape + mask = data_dict['loss_mask_dilated'] + # Calculate smooth loss. + pred_sn = get_surface_normal_from_depth(pred, fx, fy, cx, cy, original_size = (original_w, original_h)) + sn_loss = 1 - F.cosine_similarity(pred_sn, depth_gt_sn, dim = 1) + # masking + return sn_loss[mask].mean() + + def forward(self, data_dict): + """ + Calculate criterion given data dict. - gt_mask: tensor of shape NHW, the ground-truth mask; + Parameters + ---------- - use_gt_mask: tensor of shape N, whether to use the ground-truth mask. + data_dict: the data dict for computing loss. Returns ------- - - The custom masked huber loss. + + The pre-defined loss. 
""" - zero_mask = torch.where(gt < self.epsilon, False, True) - _, use_gt_mask = torch.broadcast_tensors(gt_mask.transpose(0, 2), use_gt_mask.view(-1)) - gt_mask = ~ (~ gt_mask & use_gt_mask.transpose(0, 2)) - mask = gt_mask & zero_mask - delta = self._huber(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) \ No newline at end of file + loss_dict = { + self.type: self.main_loss(data_dict) + } + if self.combined_smooth: + loss_dict['smooth'] = self.smooth_loss(data_dict) + loss_dict['loss'] = loss_dict[self.type] + self.combined_beta * loss_dict['smooth'] + else: + loss_dict['loss'] = loss_dict[self.type] + return loss_dict diff --git a/utils/data_preparation.py b/utils/data_preparation.py index 300e015..9c771c2 100644 --- a/utils/data_preparation.py +++ b/utils/data_preparation.py @@ -11,6 +11,8 @@ import random import OpenEXR import numpy as np +from utils.functions import get_surface_normal_from_depth +from utils.constants import DILATION_KERNEL def chromatic_transform(image): @@ -92,7 +94,7 @@ def add_noise(image, level = 0.1): return noisy.astype('uint8') -def exr_loader(exr_path, ndim=3, ndim_representation = ['R', 'G', 'B']): +def exr_loader(exr_path, ndim = 3, ndim_representation = ['R', 'G', 'B']): """ Loads a .exr file as a numpy array. @@ -145,7 +147,7 @@ def exr_loader(exr_path, ndim=3, ndim_representation = ['R', 'G', 'B']): return exr_arr -def process_depth(depth, camera_type = 0, depth_min = 0.1, depth_max = 1.5): +def process_depth(depth, camera_type = 0, depth_min = 0.3, depth_max = 1.5, depth_norm = 1.0): """ Process the depth information, including scaling, normalization and clear NaN values. @@ -159,7 +161,9 @@ def process_depth(depth, camera_type = 0, depth_min = 0.1, depth_max = 1.5): - 1: scale 1000 (RealSense D415, RealSense D435, etc.); - 2: scale 4000 (RealSense L515). - depth_min, depth_max: int, optional, default: 0.1, 1.5, the min depth and the max depth; + depth_min, depth_max: int, optional, default: 0.3, 1.5, the min depth and the max depth; + + depth_norm: float, optional, default: 1.0, the depth normalization coefficient. Returns ------- @@ -172,12 +176,29 @@ def process_depth(depth, camera_type = 0, depth_min = 0.1, depth_max = 1.5): if camera_type == 2: scale_coeff = 4000 depth = depth / scale_coeff - depth = (np.clip(depth, depth_min, depth_max) - depth_min) / (depth_max - depth_min) depth[np.isnan(depth)] = 0.0 + depth = np.where(depth < depth_min, 0, depth) + depth = np.where(depth > depth_max, 0, depth) + depth = depth / depth_norm return depth -def process_data(rgb, depth, depth_gt, depth_gt_mask, scene_type = "cluttered", camera_type = 0, split = 'train', image_size = (720, 1280), depth_min = 0.1, depth_max = 1.5, use_aug = True, rgb_aug_prob = 0.8, retain_original = False, **kwargs): +def process_data( + rgb, + depth, + depth_gt, + depth_gt_mask, + camera_intrinsics, + scene_type = "cluttered", + camera_type = 0, + split = 'train', + image_size = (720, 1280), + depth_min = 0.3, + depth_max = 1.5, + depth_norm = 10, + use_aug = True, + rgb_aug_prob = 0.8, + **kwargs): """ Process images and perform data augmentation. 
@@ -191,6 +212,8 @@ def process_data(rgb, depth, depth_gt, depth_gt_mask, scene_type = "cluttered",
     depth_gt: array, required, the ground-truth depth image;

     depth_gt_mask: array, required, the ground-truth depth image mask;
+
+    camera_intrinsics: array, required, the camera intrinsics of the image;

     scene_type: str in ['cluttered', 'isolated'], optional, default: 'cluttered', the scene type;

@@ -201,26 +224,25 @@
     split: str in ['train', 'test'], optional, default: 'train', the split of the dataset;

-    image_size: (int, int) tuple, optional, default: (720, 1280), the size of the image;
+    image_size: tuple of (int, int), optional, default: (720, 1280), the size of the image;

-    depth_min, depth_max: int, optional, default: 0.1, 1.5, the min depth and the max depth;
+    depth_min, depth_max: float, optional, default: 0.3, 1.5, the min depth and the max depth;
+
+    depth_norm: float, optional, default: 1.0, the depth normalization coefficient;

     use_aug: bool, optional, default: True, whether use data augmentation;

-    rgb_aug_prob: float, optional, default: 0.8, the rgb augmentation probability (only applies when use_aug is set to True);
-
-    retain_original: bool, optional, default: False, whether to retain original samples.
+    rgb_aug_prob: float, optional, default: 0.8, the rgb augmentation probability (only applies when use_aug is set to True).

     Returns
     -------

-    rgb, depth, depth_gt, depth_gt_mask, scene_mask tensors for training and testing.
+    data_dict for training and testing.
     """
-    if retain_original:
-        depth_original = process_depth(depth.copy(), camera_type = camera_type)
-        depth_gt_original = process_depth(depth_gt.copy(), camera_type = camera_type)
-        depth_gt_mask_original = depth_gt_mask.copy()
+    depth_original = process_depth(depth.copy(), camera_type = camera_type, depth_min = depth_min, depth_max = depth_max, depth_norm = depth_norm)
+    depth_gt_original = process_depth(depth_gt.copy(), camera_type = camera_type)
+    depth_gt_mask_original = depth_gt_mask.copy()

     rgb = cv2.resize(rgb, image_size, interpolation = cv2.INTER_NEAREST)
     depth = cv2.resize(depth, image_size, interpolation = cv2.INTER_NEAREST)
@@ -229,8 +251,8 @@
     depth_gt_mask = depth_gt_mask.astype(np.bool)

     # depth processing
-    depth = process_depth(depth, camera_type = camera_type, depth_min = depth_min, depth_max = depth_max)
-    depth_gt = process_depth(depth_gt, camera_type = camera_type, depth_min = depth_min, depth_max = depth_max)
+    depth = process_depth(depth, camera_type = camera_type, depth_min = depth_min, depth_max = depth_max, depth_norm = depth_norm)
+    depth_gt = process_depth(depth_gt, camera_type = camera_type, depth_min = depth_min, depth_max = depth_max, depth_norm = depth_norm)

     # RGB augmentation.
if split == 'train' and use_aug and np.random.rand(1) > 1 - rgb_aug_prob: @@ -263,9 +285,47 @@ def process_data(rgb, depth, depth_gt, depth_gt_mask, scene_type = "cluttered", rgb = rgb.transpose(2, 0, 1) # process scene mask - scene_mask = np.array([1 if scene_type == 'cluttered' else 0], dtype = np.bool) - - if retain_original: - return torch.FloatTensor(rgb), torch.FloatTensor(depth), torch.FloatTensor(depth_gt), torch.BoolTensor(depth_gt_mask), torch.BoolTensor(scene_mask), torch.FloatTensor(depth_original), torch.FloatTensor(depth_gt_original), torch.BoolTensor(depth_gt_mask_original) + scene_mask = (scene_type == 'cluttered') + + # zero mask + neg_zero_mask = np.where(depth_gt < 0.01, 255, 0).astype(np.uint8) + neg_zero_mask_dilated = cv2.dilate(neg_zero_mask, kernel = DILATION_KERNEL) + neg_zero_mask[neg_zero_mask != 0] = 1 + neg_zero_mask_dilated[neg_zero_mask_dilated != 0] = 1 + zero_mask = np.logical_not(neg_zero_mask) + zero_mask_dilated = np.logical_not(neg_zero_mask_dilated) + + # loss mask + initial_loss_mask = np.logical_and(depth_gt_mask, zero_mask) + initial_loss_mask_dilated = np.logical_and(depth_gt_mask, zero_mask_dilated) + if scene_mask: + loss_mask = initial_loss_mask + loss_mask_dilated = initial_loss_mask_dilated else: - return torch.FloatTensor(rgb), torch.FloatTensor(depth), torch.FloatTensor(depth_gt), torch.BoolTensor(depth_gt_mask), torch.BoolTensor(scene_mask) + loss_mask = zero_mask + loss_mask_dilated = zero_mask_dilated + + data_dict = { + 'rgb': torch.FloatTensor(rgb), + 'depth': torch.FloatTensor(depth), + 'depth_gt': torch.FloatTensor(depth_gt), + 'depth_gt_mask': torch.BoolTensor(depth_gt_mask), + 'scene_mask': torch.tensor(scene_mask), + 'zero_mask': torch.BoolTensor(zero_mask), + 'zero_mask_dilated': torch.BoolTensor(zero_mask_dilated), + 'initial_loss_mask': torch.BoolTensor(initial_loss_mask), + 'initial_loss_mask_dilated': torch.BoolTensor(initial_loss_mask_dilated), + 'loss_mask': torch.BoolTensor(loss_mask), + 'loss_mask_dilated': torch.BoolTensor(loss_mask_dilated), + 'depth_original': torch.FloatTensor(depth_original), + 'depth_gt_original': torch.FloatTensor(depth_gt_original), + 'depth_gt_mask_original': torch.BoolTensor(depth_gt_mask_original), + 'fx': torch.tensor(camera_intrinsics[0, 0]), + 'fy': torch.tensor(camera_intrinsics[1, 1]), + 'cx': torch.tensor(camera_intrinsics[0, 2]), + 'cy': torch.tensor(camera_intrinsics[1, 2]) + } + + data_dict['depth_gt_sn'] = get_surface_normal_from_depth(data_dict['depth_gt'].unsqueeze(0), data_dict['fx'].unsqueeze(0), data_dict['fy'].unsqueeze(0), data_dict['cx'].unsqueeze(0), data_dict['cy'].unsqueeze(0)).squeeze(0) + + return data_dict diff --git a/utils/functions.py b/utils/functions.py index ff0b3dc..cb25cf2 100644 --- a/utils/functions.py +++ b/utils/functions.py @@ -3,6 +3,11 @@ Authors: Hongjie Fang. """ +import torch +import einops +import numpy as np +import torch.nn.functional as F + def display_results(metrics_dict, logger): """ @@ -28,4 +33,132 @@ def display_results(metrics_dict, logger): logger.info(" {}: {:.6f}".format(metric_name, metric_value)) except Exception: logger.warning("Unable to display the results, the operation is ignored.") - pass \ No newline at end of file + pass + + +def gradient(x): + """ + Get gradient of xyz image. + + This is adapted from implicit-depth repository, ref: https://github.com/NVlabs/implicit_depth/blob/main/src/utils/point_utils.py. + + Parameters + ---------- + + x: the xyz map to get gradient. 
+
+    Returns
+    -------
+
+    the x-axis-in-image gradient and y-axis-in-image gradient of the xyz map.
+    """
+    left = x
+    right = F.pad(x, [0, 1, 0, 0])[:, :, :, 1:]
+    top = x
+    bottom = F.pad(x, [0, 0, 0, 1])[:, :, 1:, :]
+    dx, dy = right - left, bottom - top
+    dx[:, :, :, -1] = 0
+    dy[:, :, -1, :] = 0
+    return dx, dy
+
+
+def get_surface_normal_from_xyz(x, epsilon = 1e-8):
+    """
+    Get the surface normal of xyz image.
+
+    This is adapted from implicit-depth repository, ref: https://github.com/NVlabs/implicit_depth/blob/main/src/utils/point_utils.py.
+
+    Parameters
+    ----------
+
+    x: the xyz map to get surface normal;
+
+    epsilon: float, optional, default: 1e-8, the epsilon to avoid nan.
+
+    Returns
+    -------
+
+    The surface normals.
+    """
+    dx, dy = gradient(x)
+    surface_normal = torch.cross(dx, dy, dim = 1)
+    surface_normal = surface_normal / (torch.norm(surface_normal, dim = 1, keepdim = True) + epsilon)
+    return surface_normal
+
+
+def get_xyz(depth, fx, fy, cx, cy, original_size = (1280, 720)):
+    """
+    Get XYZ from depth image and camera intrinsics.
+
+    Parameters
+    ----------
+
+    depth: tensor, required, the depth image;
+
+    fx, fy, cx, cy: tensor, required, the camera intrinsics;
+
+    original_size: tuple of (int, int), optional, default: (1280, 720), the original size of image.
+
+    Returns
+    -------
+
+    The XYZ value of each pixel.
+    """
+    bs, h, w = depth.shape
+    indices = np.indices((h, w), dtype = np.float32)
+    indices = torch.FloatTensor(np.array([indices] * bs)).to(depth.device)
+    x_scale = w / original_size[0]
+    y_scale = h / original_size[1]
+    # Rescale the intrinsics out-of-place: fx, fy, cx and cy may be views into the
+    # caller's data dict, and in-place updates would corrupt them between calls.
+    fx = fx * x_scale
+    fy = fy * y_scale
+    cx = cx * x_scale
+    cy = cy * y_scale
+    z = depth
+    x = (indices[:, 1, :, :] - einops.repeat(cx, 'bs -> bs h w', h = h, w = w)) * z / einops.repeat(fx, 'bs -> bs h w', h = h, w = w)
+    y = (indices[:, 0, :, :] - einops.repeat(cy, 'bs -> bs h w', h = h, w = w)) * z / einops.repeat(fy, 'bs -> bs h w', h = h, w = w)
+    return torch.stack([x, y, z], dim = 1)
+
+
+def get_surface_normal_from_depth(depth, fx, fy, cx, cy, original_size = (1280, 720), epsilon = 1e-8):
+    """
+    Get surface normal from depth and camera intrinsics.
+
+    Parameters
+    ----------
+
+    depth: tensor, required, the depth image;
+
+    fx, fy, cx, cy: tensor, required, the camera intrinsics;
+
+    original_size: tuple of (int, int), optional, default: (1280, 720), the original size of image;
+
+    epsilon: float, optional, default: 1e-8, the epsilon to avoid nan.
+
+    Returns
+    -------
+
+    The surface normals.
+    """
+    xyz = get_xyz(depth, fx, fy, cx, cy, original_size = original_size)
+    return get_surface_normal_from_xyz(xyz, epsilon = epsilon)
+
+
+def to_device(data_dict, device):
+    """
+    Put the data in the data_dict to the specified device.
+
+    Parameters
+    ----------
+
+    data_dict: dict, required, dict that contains tensors;
+
+    device: torch.device object, required, the device.
+
+    Returns
+    -------
+
+    The final data_dict.
+    """
+    for key in data_dict.keys():
+        data_dict[key] = data_dict[key].to(device)
+    return data_dict
\ No newline at end of file
diff --git a/utils/metrics.py b/utils/metrics.py
index 1151f24..c142a57 100644
--- a/utils/metrics.py
+++ b/utils/metrics.py
@@ -25,7 +25,7 @@ class Metrics(object):
         - Threshold, masked threshold.
     """
-    def __init__(self, epsilon = 1e-8, depth_scale = 10.0, **kwargs):
+    def __init__(self, epsilon = 1e-8, depth_scale = 1.0, **kwargs):
         """
         Initialization.
@@ -338,14 +338,14 @@ def _update_recorder_dict(self, metrics_dict):
                 self.metrics_recorder_dict[metric_name] += metrics_dict[metric_name] * metrics_dict['samples']
         self.metrics_recorder_dict['samples'] += metrics_dict['samples']

-    def evaluate_batch(self, pred, gt, gt_mask, use_gt_mask, record = True, *args, **kwargs):
+    def evaluate_batch(self, data_dict, record = True, *args, **kwargs):
         """
         Evaluate a batch of the samples.

         Parameters
         ----------

-        (pred, gt, gt_mask, use_gt_mask): a record, representing predicted depth image, ground-truth depth image, groud-truth mask and whether to use ground-truth mask respectively.
+        data_dict: dict, required, containing at least the predicted depth image ('pred'), the ground-truth depth image ('depth_gt'), the ground-truth mask ('depth_gt_mask') and the zero mask ('zero_mask').

         record: bool, optional, default: True, whether to record the metrics of the batch of samples in the metric recorder.

@@ -354,12 +354,15 @@

         The metrics dict of the batch of samples.
         """
+        pred = data_dict['pred']
+        gt = data_dict['depth_gt']
+        gt_mask = data_dict['depth_gt_mask']
+        zero_mask = data_dict['zero_mask']
         num_samples = gt.shape[0]
-        zero_mask = torch.where(torch.abs(gt) < self.epsilon, False, True)
         metrics_dict = {'samples': num_samples}
         for metric_line in self.metrics_list:
             metric_name, metric_func, metric_kwargs = metric_line
-            metrics_dict[metric_name] = metric_func(pred, gt, zero_mask, gt_mask, use_gt_mask, **metric_kwargs)
+            metrics_dict[metric_name] = metric_func(pred, gt, zero_mask, gt_mask, **metric_kwargs)
         if record:
             self._update_recorder_dict(metrics_dict)
         return metrics_dict
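
Note on the depth preprocessing change: the rewritten process_depth replaces the old min-max normalization with hard zeroing plus a division by depth_norm, so that out-of-range and NaN readings fall into zero_mask instead of being clamped to a range boundary. A minimal numpy sketch of the two behaviours, using the depth range from train_tg_val_tg_combine.yaml (the sample readings are made up):

import numpy as np

depth = np.array([0.0, 0.25, 0.8, 1.4, 2.0, np.nan], dtype = np.float32)
depth_min, depth_max, depth_norm = 0.3, 1.5, 1.0

# Old behaviour: clip into [depth_min, depth_max], then rescale to [0, 1].
old = (np.clip(depth, depth_min, depth_max) - depth_min) / (depth_max - depth_min)

# New behaviour: NaN and out-of-range readings become exactly 0, so the
# zero_mask / loss masks can exclude them instead of training on clamped values.
new = depth.copy()
new[np.isnan(new)] = 0.0
new = np.where(new < depth_min, 0, new)
new = np.where(new > depth_max, 0, new)
new = new / depth_norm

print(old)  # [0.  0.  0.4167  0.9167  1.  nan] -- NaN survives, 2.0 is clamped to 1.0
print(new)  # [0.  0.  0.8     1.4     0.  0. ]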
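Note on the combined loss: with combined_smooth enabled, Criterion.forward returns a dict holding the main term, the surface-normal smooth term, and their weighted sum. The sketch below condenses that assembly outside the repository code: random tensors stand in for network output and ground truth, beta = 0.005 matches combined_beta in train_tg_val_tg_combine.yaml, and the simplified normal computation mirrors gradient / get_surface_normal_from_xyz above.

import torch
import torch.nn.functional as F

def surface_normal_from_xyz(xyz, epsilon = 1e-8):
    # Forward differences along the image axes, zeroed at the borders.
    dx = F.pad(xyz, [0, 1, 0, 0])[:, :, :, 1:] - xyz
    dy = F.pad(xyz, [0, 0, 0, 1])[:, :, 1:, :] - xyz
    dx[:, :, :, -1] = 0
    dy[:, :, -1, :] = 0
    n = torch.cross(dx, dy, dim = 1)
    return n / (torch.norm(n, dim = 1, keepdim = True) + epsilon)

bs, h, w = 2, 240, 320
pred = torch.rand(bs, h, w)
data_dict = {
    'depth_gt': torch.rand(bs, h, w),
    'loss_mask': torch.rand(bs, h, w) > 0.5,          # valid pixels for the main term
    'loss_mask_dilated': torch.rand(bs, h, w) > 0.5,  # dilated mask for the normal term
}
# Stand-ins for the xyz maps that get_xyz would back-project with the intrinsics.
pred_xyz, gt_xyz = torch.rand(bs, 3, h, w), torch.rand(bs, 3, h, w)

mse = ((pred - data_dict['depth_gt']) ** 2)[data_dict['loss_mask']].mean()
sn_loss = 1 - F.cosine_similarity(surface_normal_from_xyz(pred_xyz),
                                  surface_normal_from_xyz(gt_xyz), dim = 1)
smooth = sn_loss[data_dict['loss_mask_dilated']].mean()

beta = 0.005
loss = mse + beta * smooth  # what forward() stores under loss_dict['loss']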
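Note on get_xyz: the intrinsics are rescaled out-of-place (fx = fx * x_scale) because the fx passed in may be a view of the 0-d tensor stored in the data dict; data_preparation calls it through .unsqueeze(0), and unsqueeze returns a view sharing storage. A tiny demonstration of the aliasing that an in-place multiply would cause (the focal length is an arbitrary stand-in):

import torch

fx = torch.tensor(918.0)        # what the data dict would hold
fx_view = fx.unsqueeze(0)       # a view sharing the same storage
fx_view *= 0.25                 # in-place multiply
print(fx)                       # tensor(229.5000) -- the dict entry changed too

fx = torch.tensor(918.0)
fx_view = fx.unsqueeze(0)
fx_view = fx_view * 0.25        # out-of-place multiply allocates a new tensor
print(fx)                       # tensor(918.) -- the original is untouched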