From 4df44876e9254b1a64ad0a35705a26f77179c8c3 Mon Sep 17 00:00:00 2001 From: Tony Fang Date: Fri, 10 Sep 2021 17:09:29 +0800 Subject: [PATCH] update smooth loss --- configs/320x240/train_tg_val_tg.yaml | 2 + configs/320x240/train_tg_val_tg_add.yaml | 61 +++++ configs/320x240/train_tg_val_tg_combine.yaml | 63 +++++ configs/inference.yaml | 2 +- datasets/transcg.py | 12 +- inference.py | 12 +- sample_inference.py | 1 + test.py | 25 +- train.py | 46 ++-- utils/builder.py | 28 ++- utils/constants.py | 6 +- utils/criterion.py | 239 +++++++++---------- utils/data_preparation.py | 104 ++++++-- utils/functions.py | 135 ++++++++++- utils/metrics.py | 13 +- 15 files changed, 545 insertions(+), 204 deletions(-) create mode 100644 configs/320x240/train_tg_val_tg_add.yaml create mode 100644 configs/320x240/train_tg_val_tg_combine.yaml diff --git a/configs/320x240/train_tg_val_tg.yaml b/configs/320x240/train_tg_val_tg.yaml index 4b27aa7..e038496 100644 --- a/configs/320x240/train_tg_val_tg.yaml +++ b/configs/320x240/train_tg_val_tg.yaml @@ -27,6 +27,7 @@ "rgb_augmentation_probability": 0.8 "depth_min": 0.0 "depth_max": 10.0 + "depth_norm": 10.0 "test": "type": "transcg" "data_dir": "data" @@ -34,6 +35,7 @@ "use_augmentation": False "depth_min": 0.0 "depth_max": 10.0 + "depth_norm": 10.0 "dataloader": "num_workers": 48 diff --git a/configs/320x240/train_tg_val_tg_add.yaml b/configs/320x240/train_tg_val_tg_add.yaml new file mode 100644 index 0000000..78641be --- /dev/null +++ b/configs/320x240/train_tg_val_tg_add.yaml @@ -0,0 +1,61 @@ +# script id: 1 +"model": + "type": "DFNet" + "params": + "in_channels": 4 + "hidden_channels": 64 + "L": 5 + "k": 12 + +"optimizer": + "type": "AdamW" + "params": + "lr": 0.001 + +"lr_scheduler": + "type": "MultiStepLR" + "params": + "milestones": [5, 15, 25, 35] + "gamma": 0.2 + +"dataset": + "train": + "type": "transcg" + "data_dir": "data" + "image_size": !!python/tuple [320, 240] + "use_augmentation": True + "rgb_augmentation_probability": 0.8 + "depth_min": 0.0 + "depth_max": 10.0 + "depth_norm": 10.0 + "test": + "type": "transcg" + "data_dir": "data" + "image_size": !!python/tuple [320, 240] + "use_augmentation": False + "depth_min": 0.0 + "depth_max": 10.0 + "depth_norm": 10.0 + +"dataloader": + "num_workers": 48 + "shuffle": True + "drop_last": True + +"trainer": + "batch_size": 32 + "test_batch_size": 1 + "multigpu": True + "max_epoch": 40 + "criterion": + "type": "custom_masked_mse_loss" + "epsilon": 0.00000001 + +"metrics": + "types": ["MSE", "MaskedMSE", "RMSE", "MaskedRMSE", "REL", "MaskedREL", "MAE", "MaskedMAE", "Threshold@1.05", "MaskedThreshold@1.05", "Threshold@1.10", "MaskedThreshold@1.10", "Threshold@1.25", "MaskedThreshold@1.25"] + "epsilon": 0.00000001 + "depth_scale": 10.0 + +"stats": + "stats_dir": "stats" + "stats_exper": "train-tg-val-tg-add" diff --git a/configs/320x240/train_tg_val_tg_combine.yaml b/configs/320x240/train_tg_val_tg_combine.yaml new file mode 100644 index 0000000..0416455 --- /dev/null +++ b/configs/320x240/train_tg_val_tg_combine.yaml @@ -0,0 +1,63 @@ +# script id: 1 +"model": + "type": "DFNet" + "params": + "in_channels": 4 + "hidden_channels": 64 + "L": 5 + "k": 12 + +"optimizer": + "type": "AdamW" + "params": + "lr": 0.001 + +"lr_scheduler": + "type": "MultiStepLR" + "params": + "milestones": [5, 15, 25, 35] + "gamma": 0.2 + +"dataset": + "train": + "type": "transcg" + "data_dir": "data" + "image_size": !!python/tuple [320, 240] + "use_augmentation": True + "rgb_augmentation_probability": 0.8 + "depth_min": 0.3 + "depth_max": 
1.5 + "depth_norm": 1.0 + "test": + "type": "transcg" + "data_dir": "data" + "image_size": !!python/tuple [320, 240] + "use_augmentation": False + "depth_min": 0.3 + "depth_max": 1.5 + "depth_norm": 1.0 + +"dataloader": + "num_workers": 48 + "shuffle": True + "drop_last": True + +"trainer": + "batch_size": 32 + "test_batch_size": 1 + "multigpu": True + "max_epoch": 40 + "criterion": + "type": "custom_masked_mse_loss" + "epsilon": 0.00000001 + "combined_smooth": True + "combined_beta": 0.005 + +"metrics": + "types": ["MSE", "MaskedMSE", "RMSE", "MaskedRMSE", "REL", "MaskedREL", "MAE", "MaskedMAE", "Threshold@1.05", "MaskedThreshold@1.05", "Threshold@1.10", "MaskedThreshold@1.10", "Threshold@1.25", "MaskedThreshold@1.25"] + "epsilon": 0.00000001 + "depth_scale": 1.0 + +"stats": + "stats_dir": "stats" + "stats_exper": "train-tg-val-tg-comb" diff --git a/configs/inference.yaml b/configs/inference.yaml index 235e730..8755cb9 100644 --- a/configs/inference.yaml +++ b/configs/inference.yaml @@ -7,7 +7,7 @@ "k": 12 "inference": - "checkpoint_path": "stats/checkpoint.tar" + "checkpoint_path": "stats/train-tg-val-tg-comb/checkpoint.tar" "image_size": !!python/tuple [320, 240] "cuda_id": 0 "depth_min": 0.0 diff --git a/datasets/transcg.py b/datasets/transcg.py index e17087b..b5feef1 100644 --- a/datasets/transcg.py +++ b/datasets/transcg.py @@ -33,9 +33,6 @@ def __init__(self, data_dir, split = 'train', **kwargs): raise AttributeError('Invalid split option.') self.data_dir = data_dir self.split = split - self.high_resolution = kwargs.get('high_resolution', False) - if self.high_resolution and split == 'train': - raise AttributeError('Does not support returning high resolution images during training. If you want to train on high resolution samples, please set image_size arguments in high resolution.') with open(os.path.join(self.data_dir, 'metadata.json'), 'r') as fp: self.dataset_metadata = json.load(fp) self.scene_num = self.dataset_metadata['total_scenes'] @@ -64,11 +61,14 @@ def __init__(self, data_dir, split = 'train', **kwargs): ]) # Integrity double-check assert len(self.sample_info) == self.total_samples, "Error in total samples, expect {} samples, found {} samples.".format(self.total_samples, len(self.sample_info)) + # Other parameters + self.cam_intrinsics = [None, np.load(os.path.join(self.data_dir, 'camera_intrinsics', 'camIntrinsics-D435.npy')), np.load(os.path.join(self.data_dir, 'camera_intrinsics', 'camIntrinsics-L515.npy'))] self.use_aug = kwargs.get('use_augmentation', True) self.rgb_aug_prob = kwargs.get('rgb_augmentation_probability', 0.8) self.image_size = kwargs.get('image_size', (1280, 720)) - self.depth_min = kwargs.get('depth_min', 0.0) - self.depth_max = kwargs.get('depth_max', 10.0) + self.depth_min = kwargs.get('depth_min', 0.3) + self.depth_max = kwargs.get('depth_max', 1.5) + self.depth_norm = kwargs.get('depth_norm', 1.0) def __getitem__(self, id): img_path, camera_type, scene_type = self.sample_info[id] @@ -76,7 +76,7 @@ def __getitem__(self, id): depth = np.array(Image.open(os.path.join(img_path, 'depth{}.png'.format(camera_type))), dtype = np.float32) depth_gt = np.array(Image.open(os.path.join(img_path, 'depth{}-gt.png'.format(camera_type))), dtype = np.float32) depth_gt_mask = np.array(Image.open(os.path.join(img_path, 'depth{}-gt-mask.png'.format(camera_type))), dtype = np.uint8) - return process_data(rgb, depth, depth_gt, depth_gt_mask, scene_type, camera_type, split = self.split, image_size = self.image_size, depth_min = self.depth_min, depth_max = 
self.depth_max, use_aug = self.use_aug, rgb_aug_prob = self.rgb_aug_prob, retain_original = self.high_resolution) + return process_data(rgb, depth, depth_gt, depth_gt_mask, self.cam_intrinsics[camera_type], scene_type, camera_type, split = self.split, image_size = self.image_size, depth_min = self.depth_min, depth_max = self.depth_max, depth_norm = self.depth_norm, use_aug = self.use_aug, rgb_aug_prob = self.rgb_aug_prob) def __len__(self): return self.total_samples diff --git a/inference.py b/inference.py index 8f05c97..3b39223 100644 --- a/inference.py +++ b/inference.py @@ -66,6 +66,7 @@ def __init__(self, cfg_path = os.path.join('configs', 'inference.yaml'), with_in self.image_size = self.builder.get_inference_image_size() self.depth_min, self.depth_max = self.builder.get_inference_depth_min_max() + self.depth_norm = self.builder.get_inference_depth_norm() def inference(self, rgb, depth, target_size = (1280, 720)): """ @@ -86,7 +87,10 @@ def inference(self, rgb, depth, target_size = (1280, 720)): rgb = cv2.resize(rgb, self.image_size, interpolation = cv2.INTER_NEAREST) depth = cv2.resize(depth, self.image_size, interpolation = cv2.INTER_NEAREST) - depth = (depth - self.depth_min) / (self.depth_max - self.depth_min) + depth = np.where(depth < self.depth_min, 0, depth) + depth = np.where(depth > self.depth_max, 0, depth) + depth[np.isnan(depth)] = 0 + depth = depth / self.depth_norm rgb = (rgb / 255.0).transpose(2, 0, 1) rgb = torch.FloatTensor(rgb).to(self.device).unsqueeze(0) depth = torch.FloatTensor(depth).to(self.device).unsqueeze(0) @@ -97,7 +101,7 @@ def inference(self, rgb, depth, target_size = (1280, 720)): if self.with_info: self.logger.info("Inference finished, time: {:.4f}s.".format(time_end - time_start)) depth_res = depth_res.squeeze(0).cpu().detach().numpy() - depth_res = depth_res * (self.depth_max - self.depth_min) + self.depth_min - depth_res = cv2.resize(depth_res, target_size, interpolation = cv2.INTER_NEAREST) + depth_res = depth_res * self.depth_norm + depth_res = cv2.resize(depth_res, target_size, interpolation = cv2.INTER_LANCZOS4) return depth_res - + \ No newline at end of file diff --git a/sample_inference.py b/sample_inference.py index 982cbc9..44ff694 100644 --- a/sample_inference.py +++ b/sample_inference.py @@ -55,6 +55,7 @@ def draw_point_cloud(color, depth, camera_intrinsics, use_mask = False, use_inpa cam_intrinsics = np.load('data/camera_intrinsics/camIntrinsics-D435.npy') res = np.clip(res, 0.1, 1.5) +depth = np.clip(depth, 0.1, 1.5) cloud = draw_point_cloud(rgb, res, cam_intrinsics, scale = 1.0) diff --git a/test.py b/test.py index 3415eb3..a5dd6b9 100644 --- a/test.py +++ b/test.py @@ -14,6 +14,7 @@ from tqdm import tqdm from utils.logger import ColoredLogger from utils.builder import ConfigBuilder +from utils.functions import to_device from time import perf_counter @@ -68,22 +69,22 @@ def test(): running_time = [] losses = [] with tqdm(test_dataloader) as pbar: - for data in pbar: - rgb, depth, depth_gt, depth_gt_mask, scene_mask = data - rgb = rgb.to(device) - depth = depth.to(device) - depth_gt = depth_gt.to(device) - depth_gt_mask = depth_gt_mask.to(device) - scene_mask = scene_mask.to(device) + for data_dict in pbar: + data_dict = to_device(data_dict, device) with torch.no_grad(): time_start = perf_counter() - res = model(rgb, depth) + res = model(data_dict['rgb'], data_dict['depth']) time_end = perf_counter() - loss = criterion(res, depth_gt, depth_gt_mask, scene_mask) - _ = metrics.evaluate_batch(res, depth_gt, depth_gt_mask, scene_mask, 
record = True) + data_dict['pred'] = res + loss_dict = criterion(data_dict) + loss = loss_dict['loss'] + _ = metrics.evaluate_batch(data_dict, record = True) duration = time_end - time_start - pbar.set_description('Loss: {:.8f}, model time: {:.4f}s'.format(loss.mean().item(), duration)) - losses.append(loss.mean().item()) + if 'smooth' in loss_dict.keys(): + pbar.set_description('Loss: {:.8f}, smooth loss: {:.8f}'.format(loss.item(), loss_dict['smooth'].item())) + else: + pbar.set_description('Loss: {:.8f}'.format(loss.item())) + losses.append(loss.item()) running_time.append(duration) mean_loss = np.stack(losses).mean() avg_running_time = np.stack(running_time).mean() diff --git a/train.py b/train.py index 2a01fbd..d4f39da 100644 --- a/train.py +++ b/train.py @@ -15,7 +15,7 @@ from utils.logger import ColoredLogger from utils.builder import ConfigBuilder from utils.constants import LOSS_INF -from utils.functions import display_results +from utils.functions import display_results, to_device from time import perf_counter @@ -87,19 +87,19 @@ def train_one_epoch(epoch): model.train() losses = [] with tqdm(train_dataloader) as pbar: - for data in pbar: + for data_dict in pbar: optimizer.zero_grad() - rgb, depth, depth_gt, depth_gt_mask, scene_mask = data - rgb = rgb.to(device) - depth = depth.to(device) - depth_gt = depth_gt.to(device) - depth_gt_mask = depth_gt_mask.to(device) - scene_mask = scene_mask.to(device) - res = model(rgb, depth) - loss = criterion(res, depth_gt, depth_gt_mask, scene_mask) + data_dict = to_device(data_dict, device) + res = model(data_dict['rgb'], data_dict['depth']) + data_dict['pred'] = res + loss_dict = criterion(data_dict) + loss = loss_dict['loss'] loss.backward() optimizer.step() - pbar.set_description('Epoch {}, loss: {:.8f}'.format(epoch + 1, loss.mean().item())) + if 'smooth' in loss_dict.keys(): + pbar.set_description('Epoch {}, loss: {:.8f}, smooth loss: {:.8f}'.format(epoch + 1, loss.item(), loss_dict['smooth'].item())) + else: + pbar.set_description('Epoch {}, loss: {:.8f}'.format(epoch + 1, loss.item())) losses.append(loss.mean().item()) mean_loss = np.stack(losses).mean() logger.info('Finish training process in epoch {}, mean training loss: {:.8f}'.format(epoch + 1, mean_loss)) @@ -112,22 +112,22 @@ def test_one_epoch(epoch): running_time = [] losses = [] with tqdm(test_dataloader) as pbar: - for data in pbar: - rgb, depth, depth_gt, depth_gt_mask, scene_mask = data - rgb = rgb.to(device) - depth = depth.to(device) - depth_gt = depth_gt.to(device) - depth_gt_mask = depth_gt_mask.to(device) - scene_mask = scene_mask.to(device) + for data_dict in pbar: + data_dict = to_device(data_dict, device) with torch.no_grad(): time_start = perf_counter() - res = model(rgb, depth) + res = model(data_dict['rgb'], data_dict['depth']) time_end = perf_counter() - loss = criterion(res, depth_gt, depth_gt_mask, scene_mask) - _ = metrics.evaluate_batch(res, depth_gt, depth_gt_mask, scene_mask, record = True) + data_dict['pred'] = res + loss_dict = criterion(data_dict) + loss = loss_dict['loss'] + _ = metrics.evaluate_batch(data_dict, record = True) duration = time_end - time_start - pbar.set_description('Epoch {}, loss: {:.8f}, model time: {:.4f}s'.format(epoch + 1, loss.mean().item(), duration)) - losses.append(loss.mean().item()) + if 'smooth' in loss_dict.keys(): + pbar.set_description('Epoch {}, loss: {:.8f}, smooth loss: {:.8f}'.format(epoch + 1, loss.item(), loss_dict['smooth'].item())) + else: + pbar.set_description('Epoch {}, loss: {:.8f}'.format(epoch + 1, 
loss.item()))
+                losses.append(loss.item())
             running_time.append(duration)
     mean_loss = np.stack(losses).mean()
     avg_running_time = np.stack(running_time).mean()
diff --git a/utils/builder.py b/utils/builder.py
index fdcab05..fa2b90c 100644
--- a/utils/builder.py
+++ b/utils/builder.py
@@ -392,9 +392,8 @@ def get_metrics(self, metrics_params = None):
         if metrics_params is None:
             metrics_params = self.metrics_params
         metrics_list = metrics_params.get('types', ['MSE', 'MaskedMSE', 'RMSE', 'MaskedRMSE', 'REL', 'MaskedREL', 'MAE', 'MaskedMAE', 'Threshold@1.05', 'MaskedThreshold@1.05', 'Threshold@1.10', 'MaskedThreshold@1.10', 'Threshold@1.25', 'MaskedThreshold@1.25'])
-        metrics_epsilon = metrics_params.get('epsilon', 1e-8)
         from utils.metrics import MetricsRecorder
-        metrics = MetricsRecorder(metrics_list = metrics_list, epsilon = metrics_epsilon)
+        metrics = MetricsRecorder(metrics_list = metrics_list, **metrics_params)
         return metrics

     def get_inference_image_size(self, inference_params = None):
@@ -463,10 +462,29 @@ def get_inference_depth_min_max(self, inference_params = None):
        Returns
        -------

-        Tuple of (int, int) the min and max depth.
+        Tuple of (float, float) the min and max depth.
        """
        if inference_params is None:
            inference_params = self.inference_params
-        depth_min = inference_params.get('depth_min', 0.1)
+        depth_min = inference_params.get('depth_min', 0.3)
        depth_max = inference_params.get('depth_max', 1.5)
-        return depth_min, depth_max
\ No newline at end of file
+        return depth_min, depth_max
+
+    def get_inference_depth_norm(self, inference_params = None):
+        """
+        Get the depth normalization coefficient from inference configuration.
+
+        Parameters
+        ----------
+
+        inference_params: dict, optional, default: None. If inference_params is provided, then use the parameters specified in the inference_params to get the depth normalization coefficient. Otherwise, the inference parameters in the self.params will be used to get the depth normalization coefficient.
+
+        Returns
+        -------
+
+        float, the depth normalization coefficient.
+        """
+        if inference_params is None:
+            inference_params = self.inference_params
+        depth_norm = inference_params.get('depth_norm', 1.0)
+        return depth_norm
\ No newline at end of file
diff --git a/utils/constants.py b/utils/constants.py
index 8693ded..9a2053e 100644
--- a/utils/constants.py
+++ b/utils/constants.py
@@ -4,4 +4,8 @@
 Authors: Hongjie Fang.
 """
-LOSS_INF = 1e18
\ No newline at end of file
+import numpy as np
+
+
+LOSS_INF = 1e18
+DILATION_KERNEL = np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]]).astype(np.uint8)
diff --git a/utils/criterion.py b/utils/criterion.py
index 3c43bff..094e241 100644
--- a/utils/criterion.py
+++ b/utils/criterion.py
@@ -5,21 +5,28 @@
 """
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 import numpy as np
+from utils.functions import get_surface_normal_from_depth


 class Criterion(nn.Module):
     """
     Various type of criterions.
""" - def __init__(self, type, epsilon = 1e-8, huber_k = 0.01, **kwargs): + def __init__(self, type, combined_smooth = False, **kwargs): super(Criterion, self).__init__() - self.epsilon = epsilon + self.epsilon = kwargs.get('epsilon', 1e-8) + self.type = str.lower(type) + if 'huber' in self.type: + self.huber_k = kwargs.get('huber_k', 0.1) + self.combined_smooth = combined_smooth + if combined_smooth: + self.combined_beta = kwargs.get('combined_beta', 0.5) self.l2_loss = self.mse_loss self.masked_l2_loss = self.masked_mse_loss self.custom_masked_l2_loss = self.custom_masked_mse_loss - self.huber_k = huber_k - self.forward = getattr(self, type) + self.main_loss = getattr(self, type) self._mse = self._l2 def _l1(self, pred, gt): @@ -41,239 +48,223 @@ def _huber(self, pred, gt): delta = torch.abs(pred - gt) return torch.where(delta <= self.huber_k, delta ** 2 / 2, self.huber_k * delta - self.huber_k ** 2 / 2) - def mse_loss(self, pred, gt, *args, **kwargs): + def mse_loss(self, data_dict, *args, **kwargs): """ MSE loss. Parameters ---------- - pred: tensor of shape NHW, the prediction; - - gt: tensor of shape NHW, the ground-truth. + data_dict: the data dict for computing L2 loss. Returns ------- The MSE loss. """ - mask = torch.where(gt < self.epsilon, False, True) - delta = self._l2(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['zero_mask'] + return self._l2(pred, gt)[mask].mean() - def masked_mse_loss(self, pred, gt, gt_mask, *args, **kwargs): + def masked_mse_loss(self, data_dict, *args, **kwargs): """ Masked MSE loss. Parameters ---------- - pred: tensor of shape NHW, the prediction; - - gt: tensor of shape NHW, the ground-truth; - - gt_mask: tensor of shape NHW, the ground-truth mask. + data_dict: the data dict for computing L2 loss. Returns ------- The masked MSE loss. """ - zero_mask = torch.where(gt < self.epsilon, False, True) - mask = gt_mask & zero_mask - delta = self._l2(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['initial_loss_mask'] + return self._l2(pred, gt)[mask].mean() - def custom_masked_mse_loss(self, pred, gt, gt_mask, use_gt_mask, *args, **kwargs): + def custom_masked_mse_loss(self, data_dict, *args, **kwargs): """ Custom masked MSE loss. Parameters ---------- - pred: tensor of shape NHW, the prediction; - - gt: tensor of shape NHW, the ground-truth; - - gt_mask: tensor of shape NHW, the ground-truth mask; - - use_gt_mask: tensor of shape N, whether to use the ground-truth mask. + data_dict: the data dict for computing L2 loss. Returns ------- The custom masked MSE loss. 
""" - zero_mask = torch.where(gt < self.epsilon, False, True) - _, use_gt_mask = torch.broadcast_tensors(gt_mask.transpose(0, 2), use_gt_mask.view(-1)) - gt_mask = ~ (~ gt_mask & use_gt_mask.transpose(0, 2)) - mask = gt_mask & zero_mask - delta = self._l2(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['loss_mask'] + return self._l2(pred, gt)[mask].mean() - def l1_loss(self, pred, gt, *args, **kwargs): + def l1_loss(self, data_dict, *args, **kwargs): """ L1 loss. Parameters ---------- - pred: tensor of shape NHW, the prediction; - - gt: tensor of shape NHW, the ground-truth. + data_dict: the data dict for computing L1 loss. Returns ------- The L1 loss. """ - mask = torch.where(gt < self.epsilon, False, True) - delta = self._l1(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['zero_mask'] + return self._l1(pred, gt)[mask].mean() - def masked_l1_loss(self, pred, gt, gt_mask, *args, **kwargs): + def masked_l1_loss(self, data_dict, *args, **kwargs): """ Masked L1 loss. Parameters ---------- - pred: tensor of shape NHW, the prediction; - - gt: tensor of shape NHW, the ground-truth; - - gt_mask: tensor of shape NHW, the ground-truth mask. + data_dict: the data dict for computing L1 loss. Returns ------- The masked L1 loss. """ - zero_mask = torch.where(gt < self.epsilon, False, True) - mask = gt_mask & zero_mask - delta = self._l1(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['initial_loss_mask'] + return self._l1(pred, gt)[mask].mean() - def custom_masked_l1_loss(self, pred, gt, gt_mask, use_gt_mask, *args, **kwargs): + def custom_masked_l1_loss(self, data_dict, *args, **kwargs): """ Custom masked L1 loss. Parameters ---------- - - pred: tensor of shape NHW, the prediction; - - gt: tensor of shape NHW, the ground-truth; - - gt_mask: tensor of shape NHW, the ground-truth mask; - - use_gt_mask: tensor of shape N, whether to use the ground-truth mask. + + data_dict: the data dict for computing L1 loss. Returns ------- The custom masked L1 loss. """ - zero_mask = torch.where(gt < self.epsilon, False, True) - _, use_gt_mask = torch.broadcast_tensors(gt_mask.transpose(0, 2), use_gt_mask.view(-1)) - gt_mask = ~ (~ gt_mask & use_gt_mask.transpose(0, 2)) - mask = gt_mask & zero_mask - delta = self._l1(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['loss_mask'] + return self._l1(pred, gt)[mask].mean() - def huber_loss(self, pred, gt, *args, **kwargs): + def huber_loss(self, data_dict, *args, **kwargs): """ Huber loss. 
Parameters ---------- - pred: tensor of shape NHW, the prediction; - - gt: tensor of shape NHW, the ground-truth. + data_dict: the data dict for computing huber loss. Returns ------- The huber loss. """ - mask = torch.where(gt < self.epsilon, False, True) - delta = self._huber(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['zero_mask'] + return self._huber(pred, gt)[mask].mean() - def masked_huber_loss(self, pred, gt, gt_mask, *args, **kwargs): + def masked_huber_loss(self, data_dict, *args, **kwargs): """ Masked Huber loss. Parameters ---------- - pred: tensor of shape NHW, the prediction; - - gt: tensor of shape NHW, the ground-truth; - - gt_mask: tensor of shape NHW, the ground-truth mask. + data_dict: the data dict for computing huber loss. Returns ------- The masked huber loss. """ - zero_mask = torch.where(gt < self.epsilon, False, True) - mask = gt_mask & zero_mask - delta = self._huber(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['initial_loss_mask'] + return self._huber(pred, gt)[mask].mean() - def custom_masked_huber_loss(self, pred, gt, gt_mask, use_gt_mask, *args, **kwargs): + def custom_masked_huber_loss(self, data_dict, *args, **kwargs): """ Custom masked huber loss. Parameters ---------- - pred: tensor of shape NHW, the prediction; + data_dict: the data dict for computing huber loss. + + Returns + ------- + + The custom masked huber loss. + """ + pred = data_dict['pred'] + gt = data_dict['depth_gt'] + mask = data_dict['loss_mask'] + return self._huber(pred, gt)[mask].mean() + + def smooth_loss(self, data_dict, *args, **kwargs): + """ + Smooth loss: surface normal loss. + + Parameters + ---------- + + data_dict: the data dict for computing smooth loss. + + Returns + ------- - gt: tensor of shape NHW, the ground-truth; + The smooth loss. + """ + # Fetch information from data dict + pred = data_dict['pred'] + fx, fy, cx, cy = data_dict['fx'], data_dict['fy'], data_dict['cx'], data_dict['cy'] + depth_gt_sn = data_dict['depth_gt_sn'] + _, original_h, original_w = data_dict['depth_original'].shape + mask = data_dict['loss_mask_dilated'] + # Calculate smooth loss. + pred_sn = get_surface_normal_from_depth(pred, fx, fy, cx, cy, original_size = (original_w, original_h)) + sn_loss = 1 - F.cosine_similarity(pred_sn, depth_gt_sn, dim = 1) + # masking + return sn_loss[mask].mean() + + def forward(self, data_dict): + """ + Calculate criterion given data dict. - gt_mask: tensor of shape NHW, the ground-truth mask; + Parameters + ---------- - use_gt_mask: tensor of shape N, whether to use the ground-truth mask. + data_dict: the data dict for computing loss. Returns ------- - - The custom masked huber loss. + + The pre-defined loss. 
""" - zero_mask = torch.where(gt < self.epsilon, False, True) - _, use_gt_mask = torch.broadcast_tensors(gt_mask.transpose(0, 2), use_gt_mask.view(-1)) - gt_mask = ~ (~ gt_mask & use_gt_mask.transpose(0, 2)) - mask = gt_mask & zero_mask - delta = self._huber(pred, gt) - mask_base = torch.sum(mask.float(), dim = [1, 2]) - mask_base = torch.where(mask_base < self.epsilon, mask_base + self.epsilon, mask_base) - loss = torch.sum(delta * mask.float(), dim = [1, 2]) / mask_base - return torch.mean(loss) \ No newline at end of file + loss_dict = { + self.type: self.main_loss(data_dict) + } + if self.combined_smooth: + loss_dict['smooth'] = self.smooth_loss(data_dict) + loss_dict['loss'] = loss_dict[self.type] + self.combined_beta * loss_dict['smooth'] + else: + loss_dict['loss'] = loss_dict[self.type] + return loss_dict diff --git a/utils/data_preparation.py b/utils/data_preparation.py index 300e015..9c771c2 100644 --- a/utils/data_preparation.py +++ b/utils/data_preparation.py @@ -11,6 +11,8 @@ import random import OpenEXR import numpy as np +from utils.functions import get_surface_normal_from_depth +from utils.constants import DILATION_KERNEL def chromatic_transform(image): @@ -92,7 +94,7 @@ def add_noise(image, level = 0.1): return noisy.astype('uint8') -def exr_loader(exr_path, ndim=3, ndim_representation = ['R', 'G', 'B']): +def exr_loader(exr_path, ndim = 3, ndim_representation = ['R', 'G', 'B']): """ Loads a .exr file as a numpy array. @@ -145,7 +147,7 @@ def exr_loader(exr_path, ndim=3, ndim_representation = ['R', 'G', 'B']): return exr_arr -def process_depth(depth, camera_type = 0, depth_min = 0.1, depth_max = 1.5): +def process_depth(depth, camera_type = 0, depth_min = 0.3, depth_max = 1.5, depth_norm = 1.0): """ Process the depth information, including scaling, normalization and clear NaN values. @@ -159,7 +161,9 @@ def process_depth(depth, camera_type = 0, depth_min = 0.1, depth_max = 1.5): - 1: scale 1000 (RealSense D415, RealSense D435, etc.); - 2: scale 4000 (RealSense L515). - depth_min, depth_max: int, optional, default: 0.1, 1.5, the min depth and the max depth; + depth_min, depth_max: int, optional, default: 0.3, 1.5, the min depth and the max depth; + + depth_norm: float, optional, default: 1.0, the depth normalization coefficient. Returns ------- @@ -172,12 +176,29 @@ def process_depth(depth, camera_type = 0, depth_min = 0.1, depth_max = 1.5): if camera_type == 2: scale_coeff = 4000 depth = depth / scale_coeff - depth = (np.clip(depth, depth_min, depth_max) - depth_min) / (depth_max - depth_min) depth[np.isnan(depth)] = 0.0 + depth = np.where(depth < depth_min, 0, depth) + depth = np.where(depth > depth_max, 0, depth) + depth = depth / depth_norm return depth -def process_data(rgb, depth, depth_gt, depth_gt_mask, scene_type = "cluttered", camera_type = 0, split = 'train', image_size = (720, 1280), depth_min = 0.1, depth_max = 1.5, use_aug = True, rgb_aug_prob = 0.8, retain_original = False, **kwargs): +def process_data( + rgb, + depth, + depth_gt, + depth_gt_mask, + camera_intrinsics, + scene_type = "cluttered", + camera_type = 0, + split = 'train', + image_size = (720, 1280), + depth_min = 0.3, + depth_max = 1.5, + depth_norm = 10, + use_aug = True, + rgb_aug_prob = 0.8, + **kwargs): """ Process images and perform data augmentation. 
@@ -191,6 +212,8 @@ def process_data(rgb, depth, depth_gt, depth_gt_mask, scene_type = "cluttered",
     depth_gt: array, required, the ground-truth depth image;

     depth_gt_mask: array, required, the ground-truth depth image mask;
+
+    camera_intrinsics: array, required, the camera intrinsics of the image;

     scene_type: str in ['cluttered', 'isolated'], optional, default: 'cluttered', the scene type;

@@ -201,26 +224,25 @@
     split: str in ['train', 'test'], optional, default: 'train', the split of the dataset;

-    image_size: (int, int) tuple, optional, default: (720, 1280), the size of the image;
+    image_size: tuple of (int, int), optional, default: (720, 1280), the size of the image;

-    depth_min, depth_max: int, optional, default: 0.1, 1.5, the min depth and the max depth;
+    depth_min, depth_max: float, optional, default: 0.3, 1.5, the min depth and the max depth;
+
+    depth_norm: float, optional, default: 1.0, the depth normalization coefficient;

     use_aug: bool, optional, default: True, whether use data augmentation;

-    rgb_aug_prob: float, optional, default: 0.8, the rgb augmentation probability (only applies when use_aug is set to True);
-
-    retain_original: bool, optional, default: False, whether to retain original samples.
+    rgb_aug_prob: float, optional, default: 0.8, the rgb augmentation probability (only applies when use_aug is set to True).

     Returns
     -------

-    rgb, depth, depth_gt, depth_gt_mask, scene_mask tensors for training and testing.
+    data_dict for training and testing.
     """
-    if retain_original:
-        depth_original = process_depth(depth.copy(), camera_type = camera_type)
-        depth_gt_original = process_depth(depth_gt.copy(), camera_type = camera_type)
-        depth_gt_mask_original = depth_gt_mask.copy()
+    depth_original = process_depth(depth.copy(), camera_type = camera_type, depth_min = depth_min, depth_max = depth_max, depth_norm = depth_norm)
+    depth_gt_original = process_depth(depth_gt.copy(), camera_type = camera_type)
+    depth_gt_mask_original = depth_gt_mask.copy()

     rgb = cv2.resize(rgb, image_size, interpolation = cv2.INTER_NEAREST)
     depth = cv2.resize(depth, image_size, interpolation = cv2.INTER_NEAREST)
@@ -229,8 +251,8 @@
     depth_gt_mask = depth_gt_mask.astype(np.bool)

     # depth processing
-    depth = process_depth(depth, camera_type = camera_type, depth_min = depth_min, depth_max = depth_max)
-    depth_gt = process_depth(depth_gt, camera_type = camera_type, depth_min = depth_min, depth_max = depth_max)
+    depth = process_depth(depth, camera_type = camera_type, depth_min = depth_min, depth_max = depth_max, depth_norm = depth_norm)
+    depth_gt = process_depth(depth_gt, camera_type = camera_type, depth_min = depth_min, depth_max = depth_max, depth_norm = depth_norm)

     # RGB augmentation.
if split == 'train' and use_aug and np.random.rand(1) > 1 - rgb_aug_prob: @@ -263,9 +285,47 @@ def process_data(rgb, depth, depth_gt, depth_gt_mask, scene_type = "cluttered", rgb = rgb.transpose(2, 0, 1) # process scene mask - scene_mask = np.array([1 if scene_type == 'cluttered' else 0], dtype = np.bool) - - if retain_original: - return torch.FloatTensor(rgb), torch.FloatTensor(depth), torch.FloatTensor(depth_gt), torch.BoolTensor(depth_gt_mask), torch.BoolTensor(scene_mask), torch.FloatTensor(depth_original), torch.FloatTensor(depth_gt_original), torch.BoolTensor(depth_gt_mask_original) + scene_mask = (scene_type == 'cluttered') + + # zero mask + neg_zero_mask = np.where(depth_gt < 0.01, 255, 0).astype(np.uint8) + neg_zero_mask_dilated = cv2.dilate(neg_zero_mask, kernel = DILATION_KERNEL) + neg_zero_mask[neg_zero_mask != 0] = 1 + neg_zero_mask_dilated[neg_zero_mask_dilated != 0] = 1 + zero_mask = np.logical_not(neg_zero_mask) + zero_mask_dilated = np.logical_not(neg_zero_mask_dilated) + + # loss mask + initial_loss_mask = np.logical_and(depth_gt_mask, zero_mask) + initial_loss_mask_dilated = np.logical_and(depth_gt_mask, zero_mask_dilated) + if scene_mask: + loss_mask = initial_loss_mask + loss_mask_dilated = initial_loss_mask_dilated else: - return torch.FloatTensor(rgb), torch.FloatTensor(depth), torch.FloatTensor(depth_gt), torch.BoolTensor(depth_gt_mask), torch.BoolTensor(scene_mask) + loss_mask = zero_mask + loss_mask_dilated = zero_mask_dilated + + data_dict = { + 'rgb': torch.FloatTensor(rgb), + 'depth': torch.FloatTensor(depth), + 'depth_gt': torch.FloatTensor(depth_gt), + 'depth_gt_mask': torch.BoolTensor(depth_gt_mask), + 'scene_mask': torch.tensor(scene_mask), + 'zero_mask': torch.BoolTensor(zero_mask), + 'zero_mask_dilated': torch.BoolTensor(zero_mask_dilated), + 'initial_loss_mask': torch.BoolTensor(initial_loss_mask), + 'initial_loss_mask_dilated': torch.BoolTensor(initial_loss_mask_dilated), + 'loss_mask': torch.BoolTensor(loss_mask), + 'loss_mask_dilated': torch.BoolTensor(loss_mask_dilated), + 'depth_original': torch.FloatTensor(depth_original), + 'depth_gt_original': torch.FloatTensor(depth_gt_original), + 'depth_gt_mask_original': torch.BoolTensor(depth_gt_mask_original), + 'fx': torch.tensor(camera_intrinsics[0, 0]), + 'fy': torch.tensor(camera_intrinsics[1, 1]), + 'cx': torch.tensor(camera_intrinsics[0, 2]), + 'cy': torch.tensor(camera_intrinsics[1, 2]) + } + + data_dict['depth_gt_sn'] = get_surface_normal_from_depth(data_dict['depth_gt'].unsqueeze(0), data_dict['fx'].unsqueeze(0), data_dict['fy'].unsqueeze(0), data_dict['cx'].unsqueeze(0), data_dict['cy'].unsqueeze(0)).squeeze(0) + + return data_dict diff --git a/utils/functions.py b/utils/functions.py index ff0b3dc..cb25cf2 100644 --- a/utils/functions.py +++ b/utils/functions.py @@ -3,6 +3,11 @@ Authors: Hongjie Fang. """ +import torch +import einops +import numpy as np +import torch.nn.functional as F + def display_results(metrics_dict, logger): """ @@ -28,4 +33,132 @@ def display_results(metrics_dict, logger): logger.info(" {}: {:.6f}".format(metric_name, metric_value)) except Exception: logger.warning("Unable to display the results, the operation is ignored.") - pass \ No newline at end of file + pass + + +def gradient(x): + """ + Get gradient of xyz image. + + This is adapted from implicit-depth repository, ref: https://github.com/NVlabs/implicit_depth/blob/main/src/utils/point_utils.py. + + Parameters + ---------- + + x: the xyz map to get gradient. 
+
+    Returns
+    -------
+
+    the x-axis-in-image gradient and y-axis-in-image gradient of the xyz map.
+    """
+    left = x
+    right = F.pad(x, [0, 1, 0, 0])[:, :, :, 1:]
+    top = x
+    bottom = F.pad(x, [0, 0, 0, 1])[:, :, 1:, :]
+    dx, dy = right - left, bottom - top
+    dx[:, :, :, -1] = 0
+    dy[:, :, -1, :] = 0
+    return dx, dy
+
+
+def get_surface_normal_from_xyz(x, epsilon = 1e-8):
+    """
+    Get the surface normal of xyz image.
+
+    This is adapted from implicit-depth repository, ref: https://github.com/NVlabs/implicit_depth/blob/main/src/utils/point_utils.py.
+
+    Parameters
+    ----------
+
+    x: the xyz map to get surface normal;
+
+    epsilon: float, optional, default: 1e-8, the epsilon to avoid nan.
+
+    Returns
+    -------
+
+    The surface normals.
+    """
+    dx, dy = gradient(x)
+    surface_normal = torch.cross(dx, dy, dim = 1)
+    surface_normal = surface_normal / (torch.norm(surface_normal, dim = 1, keepdim = True) + epsilon)
+    return surface_normal
+
+
+def get_xyz(depth, fx, fy, cx, cy, original_size = (1280, 720)):
+    """
+    Get XYZ from depth image and camera intrinsics.
+
+    Parameters
+    ----------
+
+    depth: tensor, required, the depth image;
+
+    fx, fy, cx, cy: tensor, required, the camera intrinsics;
+
+    original_size: tuple of (int, int), optional, default: (1280, 720), the original size of image.
+
+    Returns
+    -------
+
+    The XYZ value of each pixel.
+    """
+    bs, h, w = depth.shape
+    indices = np.indices((h, w), dtype = np.float32)
+    indices = torch.FloatTensor(np.array([indices] * bs)).to(depth.device)
+    x_scale = w / original_size[0]
+    y_scale = h / original_size[1]
+    # Rescale the intrinsics out-of-place: fx, fy, cx and cy may be views into the
+    # caller's data dict, and in-place updates would corrupt them between calls.
+    fx = fx * x_scale
+    fy = fy * y_scale
+    cx = cx * x_scale
+    cy = cy * y_scale
+    z = depth
+    x = (indices[:, 1, :, :] - einops.repeat(cx, 'bs -> bs h w', h = h, w = w)) * z / einops.repeat(fx, 'bs -> bs h w', h = h, w = w)
+    y = (indices[:, 0, :, :] - einops.repeat(cy, 'bs -> bs h w', h = h, w = w)) * z / einops.repeat(fy, 'bs -> bs h w', h = h, w = w)
+    return torch.stack([x, y, z], dim = 1)
+
+
+def get_surface_normal_from_depth(depth, fx, fy, cx, cy, original_size = (1280, 720), epsilon = 1e-8):
+    """
+    Get surface normal from depth and camera intrinsics.
+
+    Parameters
+    ----------
+
+    depth: tensor, required, the depth image;
+
+    fx, fy, cx, cy: tensor, required, the camera intrinsics;
+
+    original_size: tuple of (int, int), optional, default: (1280, 720), the original size of image;
+
+    epsilon: float, optional, default: 1e-8, the epsilon to avoid nan.
+
+    Returns
+    -------
+
+    The surface normals.
+    """
+    xyz = get_xyz(depth, fx, fy, cx, cy, original_size = original_size)
+    return get_surface_normal_from_xyz(xyz, epsilon = epsilon)
+
+
+def to_device(data_dict, device):
+    """
+    Put the data in the data_dict to the specified device.
+
+    Parameters
+    ----------
+
+    data_dict: dict, required, dict that contains tensors;
+
+    device: torch.device object, required, the device.
+
+    Returns
+    -------
+
+    The final data_dict.
+    """
+    for key in data_dict.keys():
+        data_dict[key] = data_dict[key].to(device)
+    return data_dict
\ No newline at end of file
diff --git a/utils/metrics.py b/utils/metrics.py
index 1151f24..c142a57 100644
--- a/utils/metrics.py
+++ b/utils/metrics.py
@@ -25,7 +25,7 @@ class Metrics(object):
         - Threshold, masked threshold.
     """
-    def __init__(self, epsilon = 1e-8, depth_scale = 10.0, **kwargs):
+    def __init__(self, epsilon = 1e-8, depth_scale = 1.0, **kwargs):
         """
         Initialization.
@@ -338,14 +338,14 @@ def _update_recorder_dict(self, metrics_dict):
                 self.metrics_recorder_dict[metric_name] += metrics_dict[metric_name] * metrics_dict['samples']
         self.metrics_recorder_dict['samples'] += metrics_dict['samples']

-    def evaluate_batch(self, pred, gt, gt_mask, use_gt_mask, record = True, *args, **kwargs):
+    def evaluate_batch(self, data_dict, record = True, *args, **kwargs):
         """
         Evaluate a batch of the samples.

         Parameters
         ----------

-        (pred, gt, gt_mask, use_gt_mask): a record, representing predicted depth image, ground-truth depth image, groud-truth mask and whether to use ground-truth mask respectively.
+        data_dict: dict, required, containing at least the predicted depth image ('pred'), the ground-truth depth image ('depth_gt'), the ground-truth mask ('depth_gt_mask') and the zero mask ('zero_mask').

         record: bool, optional, default: True, whether to record the metrics of the batch of samples in the metric recorder.

@@ -354,12 +354,15 @@

         The metrics dict of the batch of samples.
         """
+        pred = data_dict['pred']
+        gt = data_dict['depth_gt']
+        gt_mask = data_dict['depth_gt_mask']
+        zero_mask = data_dict['zero_mask']
         num_samples = gt.shape[0]
-        zero_mask = torch.where(torch.abs(gt) < self.epsilon, False, True)
         metrics_dict = {'samples': num_samples}
         for metric_line in self.metrics_list:
             metric_name, metric_func, metric_kwargs = metric_line
-            metrics_dict[metric_name] = metric_func(pred, gt, zero_mask, gt_mask, use_gt_mask, **metric_kwargs)
+            metrics_dict[metric_name] = metric_func(pred, gt, zero_mask, gt_mask, **metric_kwargs)
         if record:
             self._update_recorder_dict(metrics_dict)
         return metrics_dict
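
Note on the depth preprocessing change: the rewritten process_depth replaces the old min-max normalization with hard zeroing plus a division by depth_norm, so that out-of-range and NaN readings fall into zero_mask instead of being clamped to a range boundary. A minimal numpy sketch of the two behaviours, using the depth range from train_tg_val_tg_combine.yaml (the sample readings are made up):

import numpy as np

depth = np.array([0.0, 0.25, 0.8, 1.4, 2.0, np.nan], dtype = np.float32)
depth_min, depth_max, depth_norm = 0.3, 1.5, 1.0

# Old behaviour: clip into [depth_min, depth_max], then rescale to [0, 1].
old = (np.clip(depth, depth_min, depth_max) - depth_min) / (depth_max - depth_min)

# New behaviour: NaN and out-of-range readings become exactly 0, so the
# zero_mask / loss masks can exclude them instead of training on clamped values.
new = depth.copy()
new[np.isnan(new)] = 0.0
new = np.where(new < depth_min, 0, new)
new = np.where(new > depth_max, 0, new)
new = new / depth_norm

print(old)  # [0.  0.  0.4167  0.9167  1.  nan] -- NaN survives, 2.0 is clamped to 1.0
print(new)  # [0.  0.  0.8     1.4     0.  0. ]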
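Note on the combined loss: with combined_smooth enabled, Criterion.forward returns a dict holding the main term, the surface-normal smooth term, and their weighted sum. The sketch below condenses that assembly outside the repository code: random tensors stand in for network output and ground truth, beta = 0.005 matches combined_beta in train_tg_val_tg_combine.yaml, and the simplified normal computation mirrors gradient / get_surface_normal_from_xyz above.

import torch
import torch.nn.functional as F

def surface_normal_from_xyz(xyz, epsilon = 1e-8):
    # Forward differences along the image axes, zeroed at the borders.
    dx = F.pad(xyz, [0, 1, 0, 0])[:, :, :, 1:] - xyz
    dy = F.pad(xyz, [0, 0, 0, 1])[:, :, 1:, :] - xyz
    dx[:, :, :, -1] = 0
    dy[:, :, -1, :] = 0
    n = torch.cross(dx, dy, dim = 1)
    return n / (torch.norm(n, dim = 1, keepdim = True) + epsilon)

bs, h, w = 2, 240, 320
pred = torch.rand(bs, h, w)
data_dict = {
    'depth_gt': torch.rand(bs, h, w),
    'loss_mask': torch.rand(bs, h, w) > 0.5,          # valid pixels for the main term
    'loss_mask_dilated': torch.rand(bs, h, w) > 0.5,  # dilated mask for the normal term
}
# Stand-ins for the xyz maps that get_xyz would back-project with the intrinsics.
pred_xyz, gt_xyz = torch.rand(bs, 3, h, w), torch.rand(bs, 3, h, w)

mse = ((pred - data_dict['depth_gt']) ** 2)[data_dict['loss_mask']].mean()
sn_loss = 1 - F.cosine_similarity(surface_normal_from_xyz(pred_xyz),
                                  surface_normal_from_xyz(gt_xyz), dim = 1)
smooth = sn_loss[data_dict['loss_mask_dilated']].mean()

beta = 0.005
loss = mse + beta * smooth  # what forward() stores under loss_dict['loss']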
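Note on get_xyz: the intrinsics are rescaled out-of-place (fx = fx * x_scale) because the fx passed in may be a view of the 0-d tensor stored in the data dict; data_preparation calls it through .unsqueeze(0), and unsqueeze returns a view sharing storage. A tiny demonstration of the aliasing that an in-place multiply would cause (the focal length is an arbitrary stand-in):

import torch

fx = torch.tensor(918.0)        # what the data dict would hold
fx_view = fx.unsqueeze(0)       # a view sharing the same storage
fx_view *= 0.25                 # in-place multiply
print(fx)                       # tensor(229.5000) -- the dict entry changed too

fx = torch.tensor(918.0)
fx_view = fx.unsqueeze(0)
fx_view = fx_view * 0.25        # out-of-place multiply allocates a new tensor
print(fx)                       # tensor(918.) -- the original is untouched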