
Commit fed1278

Suit 3D GS to DTU Dataset

1 parent d44473b

File tree

6 files changed: +153 -39 lines

arguments/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ def __init__(self, parser):
 
 class OptimizationParams(ParamGroup):
     def __init__(self, parser):
-        self.iterations = 10_000
+        self.iterations = 30_000
         self.position_lr_init = 0.00016
         self.position_lr_final = 0.0000016
         self.position_lr_delay_mult = 0.01
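Editor's note: the new 30 000-iteration default is only a default; these fields are exposed as command-line flags through the repository's ParamGroup helper. A minimal sketch of how the value reaches the training loop — the `OptimizationParams`/`extract` flow follows the upstream 3D Gaussian Splatting code, so treat the exact names and import path as assumptions:

```python
# Hedged sketch, not verbatim repository code.
from argparse import ArgumentParser
from arguments import OptimizationParams   # assumed import path in this repository

parser = ArgumentParser()
op = OptimizationParams(parser)            # registers --iterations (default now 30_000)
args = parser.parse_args([])               # or e.g. ["--iterations", "10000"] to shorten a run
opt = op.extract(args)
print(opt.iterations)                      # value consumed by the training loop in train.py
```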

scene/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -45,6 +45,10 @@ def __init__(self, args : ModelParams, gaussians : GaussianModel, load_iteration
         elif os.path.exists(os.path.join(args.source_path, "transforms_train.json")):
             print("Found transforms_train.json file, assuming Blender data set!")
             scene_info = sceneLoadTypeCallbacks["Blender"](args.source_path, args.white_background, args.eval)
+        elif os.path.exists(os.path.join(args.source_path, "cameras_sphere.npz")):
+            print("Found cameras_sphere.npz file, assuming DTU data set!")
+            scene_info = sceneLoadTypeCallbacks["DTU"](args.source_path, "cameras_sphere.npz", "cameras_sphere.npz")
+
         else:
             assert False, "Could not recognize scene type!"
 
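Editor's note: with this change the loader auto-detects three dataset layouts by probing the source path for marker files — a COLMAP `sparse` folder, a Blender `transforms_train.json`, or a NeuS/IDR-style `cameras_sphere.npz` for DTU. A condensed sketch of that dispatch (my own summary, not the repository's exact code):

```python
import os

# sceneLoadTypeCallbacks maps each label below to its reader in scene/dataset_readers.py.
def detect_scene_type(source_path):
    if os.path.exists(os.path.join(source_path, "sparse")):
        return "Colmap"
    if os.path.exists(os.path.join(source_path, "transforms_train.json")):
        return "Blender"
    if os.path.exists(os.path.join(source_path, "cameras_sphere.npz")):
        return "DTU"   # NeuS/IDR-style export with per-view world_mat/scale_mat
    raise ValueError("Could not recognize scene type!")
```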

scene/cameras.py

Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@
 class Camera(nn.Module):
     def __init__(self, colmap_id, R, T, FoVx, FoVy, image, gt_alpha_mask,
                  image_name, uid,
-                 trans=np.array([0.0, 0.0, 0.0]), scale=1.0, data_device = "cuda", gt_depth=None
+                 trans=np.array([0.0, 0.0, 0.0]), scale=1.0, data_device = "cuda", fid=None
                  ):
         super(Camera, self).__init__()
 
@@ -37,7 +37,7 @@ def __init__(self, colmap_id, R, T, FoVx, FoVy, image, gt_alpha_mask,
         self.data_device = torch.device("cuda")
 
         self.original_image = image.clamp(0.0, 1.0).to(self.data_device)
-        self.depth = gt_depth.to(self.data_device) if gt_depth is not None else None
+        self.fid = torch.LongTensor(np.array([fid])).to(self.data_device)
         self.image_width = self.original_image.shape[2]
         self.image_height = self.original_image.shape[1]
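Editor's note: the per-camera ground-truth depth is replaced by `fid`, a frame index carried over from the DTU camera file and stored on the camera as a one-element LongTensor. A tiny standalone illustration of that conversion (assumes only NumPy and PyTorch; the value 7 is a made-up example):

```python
import numpy as np
import torch

fid = 7                                          # frame index read from cameras_sphere.npz
fid_tensor = torch.LongTensor(np.array([fid]))   # tensor([7]), dtype int64, shape (1,)
fid_tensor = fid_tensor.to("cuda" if torch.cuda.is_available() else "cpu")
```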

scene/dataset_readers.py

Lines changed: 142 additions & 21 deletions
@@ -19,11 +19,14 @@
 import numpy as np
 import json
 import imageio
+from glob import glob
+import cv2 as cv
 from pathlib import Path
 from plyfile import PlyData, PlyElement
 from utils.sh_utils import SH2RGB
 from scene.gaussian_model import BasicPointCloud
 
+
 class CameraInfo(NamedTuple):
     uid: int
     R: np.array
@@ -35,7 +38,8 @@ class CameraInfo(NamedTuple):
     image_name: str
     width: int
     height: int
-    depth: np.array
+    fid: int
+
 
 class SceneInfo(NamedTuple):
     point_cloud: BasicPointCloud
@@ -44,6 +48,29 @@ class SceneInfo(NamedTuple):
     nerf_normalization: dict
     ply_path: str
 
+
+def load_K_Rt_from_P(filename, P=None):
+    if P is None:
+        lines = open(filename).read().splitlines()
+        if len(lines) == 4:
+            lines = lines[1:]
+        lines = [[x[0], x[1], x[2], x[3]] for x in (x.split(" ") for x in lines)]
+        P = np.asarray(lines).astype(np.float32).squeeze()
+
+    out = cv.decomposeProjectionMatrix(P)
+    K = out[0]
+    R = out[1]
+    t = out[2]
+
+    K = K / K[2, 2]
+
+    pose = np.eye(4, dtype=np.float32)
+    pose[:3, :3] = R.transpose()
+    pose[:3, 3] = (t[:3] / t[3])[:, 0]
+
+    return K, pose
+
+
 def getNerfppNorm(cam_info):
     def get_center_and_diag(cam_centers):
         cam_centers = np.hstack(cam_centers)
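Editor's note on the helper added above: `cv.decomposeProjectionMatrix` factors a 3x4 projection matrix P into intrinsics K, a world-to-camera rotation R, and a homogeneous camera centre, so the returned `pose` (R transposed, dehomogenised centre as translation) is a camera-to-world matrix. A small self-contained sanity check of that factorisation (mine, not part of the commit; `P` is any valid 3x4 projection matrix):

```python
import numpy as np
import cv2 as cv

def recompose_residual(P):
    # Decompose exactly as load_K_Rt_from_P does.
    K, R, t = cv.decomposeProjectionMatrix(P)[:3]
    K = K / K[2, 2]
    C = (t[:3] / t[3])[:, 0]                 # camera centre in world coordinates

    # Rebuild the world-to-camera [R | -R C] and recompose P, defined only up to scale.
    Rt = np.concatenate([R, (-R @ C)[:, None]], axis=1)
    P_rec = K @ Rt
    P_rec = P_rec / np.linalg.norm(P_rec)
    P_n = P / np.linalg.norm(P)
    return min(np.abs(P_rec - P_n).max(), np.abs(P_rec + P_n).max())   # ~0 if consistent
```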
@@ -67,12 +94,13 @@ def get_center_and_diag(cam_centers):
 
     return {"translate": translate, "radius": radius}
 
+
 def readColmapCameras(cam_extrinsics, cam_intrinsics, images_folder):
     cam_infos = []
     for idx, key in enumerate(cam_extrinsics):
         sys.stdout.write('\r')
         # the exact output you're looking for:
-        sys.stdout.write("Reading camera {}/{}".format(idx+1, len(cam_extrinsics)))
+        sys.stdout.write("Reading camera {}/{}".format(idx + 1, len(cam_extrinsics)))
         sys.stdout.flush()
 
         extr = cam_extrinsics[key]
@@ -84,11 +112,11 @@ def readColmapCameras(cam_extrinsics, cam_intrinsics, images_folder):
         R = np.transpose(qvec2rotmat(extr.qvec))
         T = np.array(extr.tvec)
 
-        if intr.model=="SIMPLE_PINHOLE":
+        if intr.model == "SIMPLE_PINHOLE":
             focal_length_x = intr.params[0]
             FovY = focal2fov(focal_length_x, height)
             FovX = focal2fov(focal_length_x, width)
-        elif intr.model=="PINHOLE":
+        elif intr.model == "PINHOLE":
             focal_length_x = intr.params[0]
             focal_length_y = intr.params[1]
             FovY = focal2fov(focal_length_y, height)
@@ -106,6 +134,7 @@ def readColmapCameras(cam_extrinsics, cam_intrinsics, images_folder):
     sys.stdout.write('\n')
     return cam_infos
 
+
 def fetchPly(path):
     plydata = PlyData.read(path)
     vertices = plydata['vertex']
@@ -114,12 +143,13 @@ def fetchPly(path):
     normals = np.vstack([vertices['nx'], vertices['ny'], vertices['nz']]).T
     return BasicPointCloud(points=positions, colors=colors, normals=normals)
 
+
 def storePly(path, xyz, rgb):
     # Define the dtype for the structured array
     dtype = [('x', 'f4'), ('y', 'f4'), ('z', 'f4'),
-            ('nx', 'f4'), ('ny', 'f4'), ('nz', 'f4'),
-            ('red', 'u1'), ('green', 'u1'), ('blue', 'u1')]
-
+             ('nx', 'f4'), ('ny', 'f4'), ('nz', 'f4'),
+             ('red', 'u1'), ('green', 'u1'), ('blue', 'u1')]
+
     normals = np.zeros_like(xyz)
 
     elements = np.empty(xyz.shape[0], dtype=dtype)
@@ -131,6 +161,7 @@ def storePly(path, xyz, rgb):
     ply_data = PlyData([vertex_element])
     ply_data.write(path)
 
+
 def readColmapSceneInfo(path, images, eval, llffhold=8):
     try:
         cameras_extrinsic_file = os.path.join(path, "sparse/0", "images.bin")
@@ -144,8 +175,9 @@ def readColmapSceneInfo(path, images, eval, llffhold=8):
         cam_intrinsics = read_intrinsics_text(cameras_intrinsic_file)
 
     reading_dir = "images" if images == None else images
-    cam_infos_unsorted = readColmapCameras(cam_extrinsics=cam_extrinsics, cam_intrinsics=cam_intrinsics, images_folder=os.path.join(path, reading_dir))
-    cam_infos = sorted(cam_infos_unsorted.copy(), key = lambda x : x.image_name)
+    cam_infos_unsorted = readColmapCameras(cam_extrinsics=cam_extrinsics, cam_intrinsics=cam_intrinsics,
+                                           images_folder=os.path.join(path, reading_dir))
+    cam_infos = sorted(cam_infos_unsorted.copy(), key=lambda x: x.image_name)
 
     if eval:
         train_cam_infos = [c for idx, c in enumerate(cam_infos) if idx % llffhold != 0]
@@ -178,6 +210,7 @@ def readColmapSceneInfo(path, images, eval, llffhold=8):
                            ply_path=ply_path)
     return scene_info
 
+
 def readCamerasFromTransforms(path, transformsfile, white_background, extension=".png"):
     cam_infos = []
 
@@ -194,8 +227,8 @@ def readCamerasFromTransforms(path, transformsfile, white_background, extension=
             depth_name = os.path.join(path, frame["file_path"] + "_depth0000" + '.exr')
 
             matrix = np.linalg.inv(np.array(frame["transform_matrix"]))
-            R = -np.transpose(matrix[:3,:3])
-            R[:,0] = -R[:,0]
+            R = -np.transpose(matrix[:3, :3])
+            R[:, 0] = -R[:, 0]
             T = -matrix[:3, 3]
 
             image_path = os.path.join(path, cam_name)
@@ -205,27 +238,29 @@ def readCamerasFromTransforms(path, transformsfile, white_background, extension=
 
             im_data = np.array(image.convert("RGBA"))
 
-            bg = np.array([1,1,1]) if white_background else np.array([0, 0, 0])
+            bg = np.array([1, 1, 1]) if white_background else np.array([0, 0, 0])
 
             norm_data = im_data / 255.0
-            arr = norm_data[:,:,:3] * norm_data[:, :, 3:4] + bg * (1 - norm_data[:, :, 3:4])
-            image = Image.fromarray(np.array(arr*255.0, dtype=np.byte), "RGB")
+            arr = norm_data[:, :, :3] * norm_data[:, :, 3:4] + bg * (1 - norm_data[:, :, 3:4])
+            image = Image.fromarray(np.array(arr * 255.0, dtype=np.byte), "RGB")
 
             fovy = focal2fov(fov2focal(fovx, image.size[0]), image.size[1])
-            FovY = fovx 
+            FovY = fovx
             FovX = fovy
 
             cam_infos.append(CameraInfo(uid=idx, R=R, T=T, FovY=FovY, FovX=FovX, image=image,
-                            image_path=image_path, image_name=image_name, width=image.size[0], height=image.size[1], depth=depth))
-
+                                        image_path=image_path, image_name=image_name, width=image.size[0],
+                                        height=image.size[1], depth=depth))
+
     return cam_infos
 
+
 def readNerfSyntheticInfo(path, white_background, eval, extension=".png"):
     print("Reading Training Transforms")
     train_cam_infos = readCamerasFromTransforms(path, "transforms_train.json", white_background, extension)
     print("Reading Test Transforms")
     test_cam_infos = readCamerasFromTransforms(path, "transforms_test.json", white_background, extension)
-
+
     if not eval:
         train_cam_infos.extend(test_cam_infos)
         test_cam_infos = []
@@ -237,7 +272,7 @@ def readNerfSyntheticInfo(path, white_background, eval, extension=".png"):
         # Since this data set has no colmap data, we start with random points
        num_pts = 100_000
        print(f"Generating random point cloud ({num_pts})...")
-
+
        # We create random points inside the bounds of the synthetic Blender scenes
        xyz = np.random.random((num_pts, 3)) * 2.6 - 1.3
        shs = np.random.random((num_pts, 3)) / 255.0
@@ -256,7 +291,93 @@ def readNerfSyntheticInfo(path, white_background, eval, extension=".png"):
                            ply_path=ply_path)
     return scene_info
 
+
+def readDTUCameras(path, render_camera, object_camera):
+    camera_dict = np.load(os.path.join(path, render_camera))
+    images_lis = sorted(glob(os.path.join(path, 'image/*.png')))
+    masks_lis = sorted(glob(os.path.join(path, 'mask/*.png')))
+    n_images = len(images_lis)
+    cam_infos = []
+    for idx, image_path in enumerate(images_lis):
+        image = np.array(Image.open(image_path))
+        mask = np.array(imageio.imread(masks_lis[idx])) / 255.0
+        image = Image.fromarray((image * mask).astype(np.uint8))
+        world_mat = camera_dict['world_mat_%d' % idx].astype(np.float32)
+        fid = camera_dict['fid_%d' % idx]
+        image_name = Path(image_path).stem
+        scale_mat = camera_dict['scale_mat_%d' % idx].astype(np.float32)
+        P = world_mat @ scale_mat
+        P = P[:3, :4]
+
+        K, pose = load_K_Rt_from_P(None, P)
+        a = pose[0:1, :]
+        b = pose[1:2, :]
+        c = pose[2:3, :]
+
+        pose = np.concatenate([a, -c, -b, pose[3:, :]], 0)
+
+        S = np.eye(3)
+        S[1, 1] = -1
+        S[2, 2] = -1
+        pose[1, 3] = -pose[1, 3]
+        pose[2, 3] = -pose[2, 3]
+        pose[:3, :3] = S @ pose[:3, :3] @ S
+
+        a = pose[0:1, :]
+        b = pose[1:2, :]
+        c = pose[2:3, :]
+
+        pose = np.concatenate([a, c, b, pose[3:, :]], 0)
+
+        pose[:, 3] *= 0.5
+
+        matrix = np.linalg.inv(pose)
+        R = -np.transpose(matrix[:3, :3])
+        R[:, 0] = -R[:, 0]
+        T = -matrix[:3, 3]
+
+        FovY = focal2fov(K[0, 0], image.size[1])
+        FovX = focal2fov(K[0, 0], image.size[0])
+        cam_info = CameraInfo(uid=idx, R=R, T=T, FovY=FovY, FovX=FovX, image=image,
+                              image_path=image_path, image_name=image_name, width=image.size[0], height=image.size[1], fid=fid)
+        cam_infos.append(cam_info)
+    sys.stdout.write('\n')
+    return cam_infos
+
+
+def readNeuSDTUInfo(path, render_camera, object_camera):
+    print("Reading DTU Info")
+    train_cam_infos = readDTUCameras(path, render_camera, object_camera)
+
+    nerf_normalization = getNerfppNorm(train_cam_infos)
+
+    ply_path = os.path.join(path, "points3d.ply")
+    if not os.path.exists(ply_path):
+        # Since this data set has no colmap data, we start with random points
+        num_pts = 100_000
+        print(f"Generating random point cloud ({num_pts})...")
+
+        # We create random points inside the bounds of the synthetic Blender scenes
+        xyz = np.random.random((num_pts, 3)) * 2.6 - 1.3
+        shs = np.random.random((num_pts, 3)) / 255.0
+        pcd = BasicPointCloud(points=xyz, colors=SH2RGB(shs), normals=np.zeros((num_pts, 3)))
+
+        storePly(ply_path, xyz, SH2RGB(shs) * 255)
+    try:
+        pcd = fetchPly(ply_path)
+    except:
+        pcd = None
+
+    scene_info = SceneInfo(point_cloud=pcd,
+                           train_cameras=train_cam_infos,
+                           test_cameras=[],
+                           nerf_normalization=nerf_normalization,
+                           ply_path=ply_path)
+    return scene_info
+
+
 sceneLoadTypeCallbacks = {
     "Colmap": readColmapSceneInfo,
-    "Blender" : readNerfSyntheticInfo
-}
+    "Blender": readNerfSyntheticInfo,
+    "DTU": readNeuSDTUInfo,
+}
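Editor's note on `readDTUCameras`: the NeuS-style `cameras_sphere.npz` stores, per view, a `world_mat` (the full K·[R|t] projection) and a `scale_mat` that normalises the object into a unit sphere; their product is decomposed with `load_K_Rt_from_P`, and the resulting camera-to-world pose is then run through a sequence of axis swaps and sign flips before being inverted into the same R/T convention used by `readCamerasFromTransforms`. Those swaps amount to fixed axis permutations and sign flips of the world and camera frames plus a rescaling of the camera positions; isolating them as a pure function (my own transcription of the commit's lines, not new behaviour) makes the conversion easy to inspect or unit-test:

```python
import numpy as np

def convert_pose_like_readDTUCameras(pose):
    # Verbatim transcription of the pose manipulation inside readDTUCameras,
    # applied to a 4x4 camera-to-world matrix returned by load_K_Rt_from_P.
    pose = pose.copy()
    a, b, c = pose[0:1, :], pose[1:2, :], pose[2:3, :]
    pose = np.concatenate([a, -c, -b, pose[3:, :]], 0)   # swap y/z rows with sign flips

    S = np.eye(3)
    S[1, 1] = -1
    S[2, 2] = -1
    pose[1, 3] = -pose[1, 3]
    pose[2, 3] = -pose[2, 3]
    pose[:3, :3] = S @ pose[:3, :3] @ S                  # flip y/z of world and camera axes

    a, b, c = pose[0:1, :], pose[1:2, :], pose[2:3, :]
    pose = np.concatenate([a, c, b, pose[3:, :]], 0)     # swap y/z rows back without sign flips
    pose[:, 3] *= 0.5   # as in the commit: scales the translation column, homogeneous entry included
    return pose
```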

train.py

Lines changed: 3 additions & 3 deletions
@@ -71,7 +71,7 @@ def training(dataset, opt, pipe, testing_iterations, saving_iterations):
             viewpoint_stack = scene.getTrainCameras().copy()
 
         viewpoint_cam = viewpoint_stack.pop(randint(0, len(viewpoint_stack)-1))
-        gt_depth = viewpoint_cam.depth
+        fid = viewpoint_cam.fid
         # Render
         render_pkg = render(viewpoint_cam, gaussians, pipe, background)
         image, viewspace_point_tensor, visibility_filter, radii = render_pkg["render"], render_pkg["viewspace_points"], render_pkg["visibility_filter"], render_pkg["radii"]
@@ -81,8 +81,8 @@ def training(dataset, opt, pipe, testing_iterations, saving_iterations):
         gt_image = viewpoint_cam.original_image.cuda()
         Ll1 = l1_loss(image, gt_image)
         loss = (1.0 - opt.lambda_dssim) * Ll1 + opt.lambda_dssim * (1.0 - ssim(image, gt_image))
-        depth_loss = l1_loss(depth, gt_depth) * 0.1
-        loss = loss + depth_loss
+        # depth_loss = l1_loss(depth, gt_depth) * 0.1
+        # loss = loss + depth_loss
         loss.backward()
 
         iter_end.record()

utils/camera_utils.py

Lines changed: 1 addition & 12 deletions
@@ -39,19 +39,8 @@ def loadCam(args, id, cam_info, resolution_scale):
         resolution = (int(orig_w / scale), int(orig_h / scale))
 
     resized_image_rgb = PILtoTorch(cam_info.image, resolution)
-    if cam_info.depth is not None:
-        resized_depth_rgb = ArrayToTorch(cam_info.depth, resolution)
-    else:
-        resized_depth_rgb = None
 
     gt_image = resized_image_rgb[:3, ...]
-    if resized_depth_rgb is not None:
-        depth_mask = resized_depth_rgb[0, ...] > 60000
-        gt_depth = resized_depth_rgb[0, ...]
-        gt_depth[depth_mask] = 0
-    else:
-        gt_depth = None
-
     loaded_mask = None
 
     if resized_image_rgb.shape[1] == 4:
@@ -60,7 +49,7 @@ def loadCam(args, id, cam_info, resolution_scale):
     return Camera(colmap_id=cam_info.uid, R=cam_info.R, T=cam_info.T,
                   FoVx=cam_info.FovX, FoVy=cam_info.FovY,
                   image=gt_image, gt_alpha_mask=loaded_mask,
-                  image_name=cam_info.image_name, uid=id, data_device=args.data_device, gt_depth=gt_depth)
+                  image_name=cam_info.image_name, uid=id, data_device=args.data_device, fid=cam_info.fid)
 
 def cameraList_from_camInfos(cam_infos, resolution_scale, args):
     camera_list = []

0 commit comments
