unidepth_video.py
import argparse
import numpy as np
import os
import torch
import cv2
import sys
print("please ignore warnings about depreciation warnings for xformers components")
print("the loading takes a while the first time just wait")
sys.path.append("UniDepth")
from unidepth.models import UniDepthV2
from unidepth.utils import colorize, image_grid
from unidepth.utils.camera import Pinhole

def compute_camera_matrix(fov_horizontal_deg, fov_vertical_deg, image_width, image_height):
    # At least one of the two FoVs must be given; the missing one falls back to the other.
    if fov_horizontal_deg is not None:
        # Convert FoV from degrees to radians
        fov_horizontal_rad = np.deg2rad(fov_horizontal_deg)
        # Compute the focal length in pixels
        fx = image_width / (2 * np.tan(fov_horizontal_rad / 2))
    if fov_vertical_deg is not None:
        # Convert FoV from degrees to radians
        fov_vertical_rad = np.deg2rad(fov_vertical_deg)
        # Compute the focal length in pixels
        fy = image_height / (2 * np.tan(fov_vertical_rad / 2))
    if fov_vertical_deg is None:
        fy = fx
    if fov_horizontal_deg is None:
        fx = fy

    # Assume the principal point is at the image center
    cx = image_width / 2
    cy = image_height / 2

    # Construct the camera matrix
    camera_matrix = np.array([[fx,  0, cx],
                              [ 0, fy, cy],
                              [ 0,  0,  1]], dtype=np.float64)
    return camera_matrix
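
# Sanity check (hypothetical numbers): a 1920x1080 frame with a 90 degree horizontal
# FoV gives fx = 1920 / (2 * tan(45 deg)) = 960 px, with cx = 960 and cy = 540.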

def fov_from_camera_matrix(mat):
    w = mat[0][2] * 2
    h = mat[1][2] * 2
    fx = mat[0][0]
    fy = mat[1][1]

    fov_x = np.rad2deg(2 * np.arctan2(w, 2 * fx))
    fov_y = np.rad2deg(2 * np.arctan2(h, 2 * fy))
    return fov_x, fov_y
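
# Round trip of the two helpers above (hypothetical numbers):
# fov_from_camera_matrix(compute_camera_matrix(90, None, 1920, 1080)) returns
# (90.0, ~58.7), i.e. the vertical FoV implied by the 16:9 aspect ratio.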

def save_24bit(frames, output_video_path, fps, max_depth_arg):
    """
    Saves depth maps encoded in the R, G and B channels of a video
    (to increase accuracy compared to grayscale).
    """
    height = frames.shape[1]
    width = frames.shape[2]

    out = cv2.VideoWriter(output_video_path, cv2.VideoWriter_fourcc(*"FFV1"), fps, (width, height))

    max_depth = frames.max()
    print("max metric depth: ", max_depth)

    # Pick a value slightly above the max metric depth so the depth fits in the video file nicely.
    # If you pick a very high value you will lose resolution.
    # In case you did not pick an absolute value we max out (this means each video will have
    # depth relative to max_depth). If you want to use the video as a depth source, an
    # absolute value is preferable.
    MODEL_maxOUTPUT_depth = max_depth_arg
    if MODEL_maxOUTPUT_depth < max_depth:
        print("warning: actual depth is deeper than max_depth. The depth will be clipped")

    for i in range(frames.shape[0]):
        # Clip so values above MODEL_maxOUTPUT_depth saturate instead of overflowing the uint32 cast.
        depth = np.clip(frames[i], 0, MODEL_maxOUTPUT_depth)
        scaled_depth = (((255**4) / MODEL_maxOUTPUT_depth) * depth.astype(np.float64)).astype(np.uint32)

        # View the depth as raw bytes: shape (H, W, 4), little-endian, so index 3 is the most significant byte
        depth_bytes = scaled_depth.view(np.uint8).reshape(height, width, 4)
        R = depth_bytes[:, :, 3]  # Most significant byte, duplicated in R and G to reduce compression artifacts
        G = depth_bytes[:, :, 3]
        B = depth_bytes[:, :, 2]  # Second most significant byte in the blue channel
        bgr24bit = np.dstack((B, G, R))

        out.write(bgr24bit)

    out.release()
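
# Not part of the original script: a minimal decoding sketch showing how a frame
# written by save_24bit could be turned back into metric depth. It assumes the
# same max_output_depth that was used for encoding and the little-endian byte
# layout used above; the duplicated G channel is ignored here for simplicity.
def decode_24bit_frame(bgr_frame, max_output_depth):
    B = bgr_frame[:, :, 0].astype(np.uint32)  # second most significant byte
    R = bgr_frame[:, :, 2].astype(np.uint32)  # most significant byte
    scaled = (R << 24) | (B << 16)  # rebuild the top 16 bits of the uint32
    return scaled.astype(np.float64) * max_output_depth / (255**4)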

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='UniDepth video converter')
    parser.add_argument('--color_video', type=str, required=True)
    parser.add_argument('--output_dir', type=str, default='./outputs')
    parser.add_argument('--max_len', type=int, default=-1, help='maximum length of the input video, -1 means no limit')
    parser.add_argument('--target_fps', type=int, default=-1, help='target fps of the input video, -1 means the original fps')
    parser.add_argument('--max_depth', default=20, type=int, help='the max depth that the video uses', required=False)
    parser.add_argument('--xfov', type=int, help='FoV in degrees in the x direction, calculated from the aspect ratio and yfov if not given', required=False)
    parser.add_argument('--yfov', type=int, help='FoV in degrees in the y direction, calculated from the aspect ratio and xfov if not given', required=False)

    args = parser.parse_args()

    if args.xfov is None and args.yfov is None:
        print("Either --xfov or --yfov is required.")
        sys.exit(1)

    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

    if not os.path.isfile(args.color_video):
        raise Exception("input color_video does not exist")

    MODEL_maxOUTPUT_depth = args.max_depth

    raw_video = cv2.VideoCapture(args.color_video)
    frame_width, frame_height = int(raw_video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(raw_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    frame_rate = raw_video.get(cv2.CAP_PROP_FPS)

    cam_matrix = compute_camera_matrix(args.xfov, args.yfov, frame_width, frame_height).astype(np.float32)
    cam_matrix_torch = torch.from_numpy(cam_matrix)

    model = UniDepthV2.from_pretrained("lpiccinelli/unidepth-v2-vitl14").to(DEVICE)
    model.interpolation_mode = "bilinear"

    depths = []
    frame_n = 0
    while raw_video.isOpened():
        ret, raw_frame = raw_video.read()
        if not ret:
            break
        frame_n += 1
        print("--- frame ", frame_n, " ----")
        if args.max_len < frame_n and args.max_len != -1:
            break

        rgb = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2RGB)
        rgb_torch = torch.from_numpy(rgb).permute(2, 0, 1)  # HWC -> CHW

        predictions = model.infer(rgb_torch, cam_matrix_torch)

        depths.append(predictions["depth"].squeeze().cpu().numpy())

        pred_intrinsic = predictions["intrinsics"].squeeze().cpu().numpy()
        fovx, fovy = fov_from_camera_matrix(pred_intrinsic)
        print("fovx:", fovx, "fovy:", fovy)

    raw_video.release()

    video_name = os.path.basename(args.color_video)
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    output_video_path = os.path.join(args.output_dir, os.path.splitext(video_name)[0] + '_depth.mkv')

    save_24bit(np.array(depths), output_video_path, frame_rate, MODEL_maxOUTPUT_depth)
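
# Example invocation (hypothetical paths):
#   python unidepth_video.py --color_video input.mp4 --xfov 70 --max_depth 20
# This writes ./outputs/input_depth.mkv: an FFV1-encoded MKV with depth packed
# into the R, G and B channels as described in save_24bit.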