Questions about tracking effectiveness #119

Open · lxin98 opened this issue Sep 7, 2024 · 4 comments

lxin98 commented Sep 7, 2024

My tracking points are initially clustered below the mask, but as the object moves toward the middle of the image, the tracked points are no longer evenly distributed over the object's mask. Why is that?

lxin98 (Author) commented Sep 7, 2024

[screenshots: tracking results on frame_00 and frame_0028]
These are the results I'm getting. How should I go about debugging this?

cdoersch (Collaborator) commented Sep 8, 2024

I need some more context in order to help you. Can you provide the script you're using? Have you tried plotting the query points on the query frame?
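
For example, something along these lines should show whether the queries land where you expect (a rough sketch; the paths and `grid_size` are placeholders, and it assumes your query points come from a grid restricted to the mask):

```python
import imageio.v2 as imageio
import matplotlib.pyplot as plt
import numpy as np

# Placeholder paths: point these at your query frame and its mask.
frame = imageio.imread("frames/frame_0000.png")
mask = imageio.imread("masks/frame_0000.png")
if mask.ndim == 3:
    # Collapse color/alpha channels: any nonzero channel counts as foreground.
    mask = mask.any(axis=-1)
mask = mask > 0

# Sample a grid of candidate query points, as in a mask-restricted setup.
grid_size = 4  # placeholder
y, x = np.mgrid[0 : frame.shape[0] : grid_size, 0 : frame.shape[1] : grid_size]
in_mask = mask[y, x]

# Overlay the selected query points on the query frame.
plt.imshow(frame)
plt.scatter(x[in_mask], y[in_mask], s=2, c="red")
plt.title("Query points on the query frame")
plt.savefig("query_points.png", dpi=150)
```

If the overlaid points already look wrong here, the problem is in the query selection rather than in the tracker.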

lxin98 (Author) commented Sep 9, 2024

Thank you for your reply. Here is my code:

```python
import argparse
import glob
import os

import imageio
import mediapy as media
import numpy as np
import torch
from tapnet_torch import tapir_model, transforms
from tqdm import tqdm


def read_video(folder_path):
    frame_paths = sorted(glob.glob(os.path.join(folder_path, "*")))
    video = np.stack([imageio.imread(frame_path) for frame_path in frame_paths])
    print(f"{video.shape=} {video.dtype=} {video.min()=} {video.max()=}")
    video = media._VideoArray(video)
    return video


def preprocess_frames(frames):
    """Preprocess frames to model inputs.

    Args:
      frames: [num_frames, height, width, 3], [0, 255], np.uint8

    Returns:
      frames: [num_frames, height, width, 3], [-1, 1], np.float32
    """
    frames = frames.float()
    frames = frames / 255 * 2 - 1
    return frames


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--image_dir", type=str, required=True, help="image dir")
    parser.add_argument("--mask_dir", type=str, required=True, help="mask dir")
    parser.add_argument("--out_dir", type=str, required=True, help="out dir")
    parser.add_argument("--grid_size", type=int, default=4, help="grid size")
    parser.add_argument("--resize_height", type=int, default=256, help="resize height")
    parser.add_argument("--resize_width", type=int, default=256, help="resize width")
    parser.add_argument(
        "--model_type", type=str, choices=["tapir", "bootstapir"], help="model type"
    )
    parser.add_argument(
        "--ckpt_dir",
        type=str,
        default="checkpoints",
        help="checkpoint dir",
    )
    args = parser.parse_args()

    folder_path = args.image_dir
    mask_dir = args.mask_dir
    frame_names = [
        os.path.basename(f) for f in sorted(glob.glob(os.path.join(folder_path, "*")))
    ]
    out_dir = args.out_dir
    os.makedirs(out_dir, exist_ok=True)

    # Skip everything if all output files already exist.
    done = True
    for t in range(len(frame_names)):
        for j in range(len(frame_names)):
            name_t = os.path.splitext(frame_names[t])[0]
            name_j = os.path.splitext(frame_names[j])[0]
            out_path = f"{out_dir}/{name_t}_{name_j}.npy"
            if not os.path.exists(out_path):
                done = False
                break
    print(f"{done=}")
    if done:
        print("Already done")
        return

    ## Load model
    ckpt_file = (
        "tapir_checkpoint_panning.pt"
        if args.model_type == "tapir"
        else "bootstapir_checkpoint_v2.pt"
    )
    ckpt_path = os.path.join(args.ckpt_dir, ckpt_file)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = tapir_model.TAPIR(pyramid_level=1)
    model.load_state_dict(torch.load(ckpt_path))
    model = model.to(device)

    resize_height = args.resize_height
    resize_width = args.resize_width
    grid_size = args.grid_size

    video = read_video(folder_path)
    num_frames, height, width = video.shape[0:3]
    masks = read_video(mask_dir)
    masks = (masks.reshape((num_frames, height, width, -1)) > 0).any(axis=-1)
    print(f"{video.shape=} {masks.shape=} {masks.max()=} {masks.sum()=}")

    frames = media.resize_video(video, (resize_height, resize_width))
    print(f"{frames.shape=}")
    frames = torch.from_numpy(frames).to(device)
    frames = preprocess_frames(frames)[None]
    print(f"preprocessed {frames.shape=}")

    # Grid of candidate query points in original pixel coordinates, plus the
    # corresponding locations in the resized frames.
    y, x = np.mgrid[0:height:grid_size, 0:width:grid_size]
    y_resize = y / (height - 1) * (resize_height - 1)
    x_resize = x / (width - 1) * (resize_width - 1)

    for t in tqdm(range(num_frames), desc="query frames"):
        name_t = os.path.splitext(frame_names[t])[0]
        file_matches = glob.glob(f"{out_dir}/{name_t}_*.npy")
        if len(file_matches) == num_frames:
            print(f"Already computed tracks with query {t=} {name_t=}")
            continue

        # Query points are (t, y, x) in resized coordinates, restricted to
        # grid locations inside the mask of frame t.
        all_points = np.stack([t * np.ones_like(y), y_resize, x_resize], axis=-1)
        mask = masks[t]
        in_mask = mask[y, x] > 0.5
        all_points_t = all_points[in_mask]
        print(f"{all_points.shape=} {all_points_t.shape=} {t=}")
        outputs = []
        if len(all_points_t) > 0:
            num_chunks = max(1, len(all_points_t) // 128)
            for points in tqdm(
                np.array_split(all_points_t, axis=0, indices_or_sections=num_chunks),
                leave=False,
                desc="points",
            ):
                points = torch.from_numpy(points.astype(np.float32))[None].to(
                    device
                )  # Add batch dimension
                with torch.inference_mode():
                    preds = model(frames, points)
                tracks, occlusions, expected_dist = (
                    preds["tracks"][0].detach().cpu().numpy(),
                    preds["occlusion"][0].detach().cpu().numpy(),
                    preds["expected_dist"][0].detach().cpu().numpy(),
                )
                # Map the tracks back from resized to original coordinates.
                tracks = transforms.convert_grid_coordinates(
                    tracks, (resize_width - 1, resize_height - 1), (width - 1, height - 1)
                )
                outputs.append(
                    np.concatenate(
                        [tracks, occlusions[..., None], expected_dist[..., None]],
                        axis=-1,
                    )
                )
            outputs = np.concatenate(outputs, axis=0)
        else:
            outputs = np.zeros((0, num_frames, 4), dtype=np.float32)

        for j in range(num_frames):
            if j == t:
                # On the query frame itself, keep the exact original query locations.
                original_query_points = np.stack([x[in_mask], y[in_mask]], axis=-1)
                outputs[:, j, :2] = original_query_points
            name_j = os.path.splitext(frame_names[j])[0]
            np.save(f"{out_dir}/{name_t}_{name_j}.npy", outputs[:, j])


if __name__ == "__main__":
    main()
```
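
For reference, each saved `{name_t}_{name_j}.npy` file holds one point set for a single target frame, with columns `[x, y, occlusion, expected_dist]` in original pixel coordinates. A minimal sketch for overlaying one of these files on its frame (placeholder paths; matplotlib assumed):

```python
import imageio.v2 as imageio
import matplotlib.pyplot as plt
import numpy as np

# Placeholder paths: one target frame plus the matching saved output.
frame = imageio.imread("frames/frame_0028.png")
points = np.load("out/frame_0000_frame_0028.npy")  # (num_points, 4)

# Rough visibility test on the raw occlusion logit; the tapnet utilities
# instead combine the occlusion and expected_dist logits via sigmoids.
visible = points[:, 2] < 0

plt.imshow(frame)
plt.scatter(points[visible, 0], points[visible, 1], s=2, c="lime")
plt.title("Predicted-visible tracked points on the target frame")
plt.savefig("tracked_points.png", dpi=150)
```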

lxin98 (Author) commented Sep 11, 2024

I have posted my code above; please take a look when you have a chance. Thank you @cdoersch
