From 0d9acd432bf4542a1cf407434399f926cc64504d Mon Sep 17 00:00:00 2001
From: Carl Doersch
Date: Thu, 9 Nov 2023 06:07:34 -0800
Subject: [PATCH] Improve docs for rainbow visualization

PiperOrigin-RevId: 580880342
Change-Id: I6feecd8717d783ffb8cb31d7f969e9c3af96cb0e
---
 colabs/tapir_rainbow_demo.ipynb | 49 ++++++++++++++++++++++++-----------
 1 file changed, 34 insertions(+), 15 deletions(-)

diff --git a/colabs/tapir_rainbow_demo.ipynb b/colabs/tapir_rainbow_demo.ipynb
index e860133..3d8e1ad 100644
--- a/colabs/tapir_rainbow_demo.ipynb
+++ b/colabs/tapir_rainbow_demo.ipynb
@@ -30,7 +30,13 @@
 "\u003c/p\u003e\n",
 "\n",
 "\u003cp align=\"center\"\u003e\n",
- " \u003cimg src=\"https://storage.googleapis.com/dm-tapnet/horsejump_rainbow.gif\" width=\"70%\"/\u003e\n",
+ " \u003cimg src=\"https://storage.googleapis.com/dm-tapnet/horsejump_rainbow.gif\" width=\"70%\"/\u003e\u003cbr/\u003e\u003cbr/\u003e\n",
+ "\u003c/p\u003e\n",
+ "\u003cp\u003e\n",
+ " This visualization uses TAPIR to show how an object moves through space, even if the camera is tracking the object. It begins by tracking points densely on a grid. It then estimates the camera motion as a homography (i.e., assuming either a planar background or a camera that rotates but does not translate). Any points whose motion is consistent with that homography are treated as background and removed. Finally, we generate a \u0026ldquo;rainbow\u0026rdquo; visualization, where the tracked points leave \u0026ldquo;tails\u0026rdquo; that follow the camera motion, so that the earlier positions of the points appear frozen in space. This visualization was inspired by a similar one from \u003ca href=\"https://omnimotion.github.io/\"\u003eOmniMotion\u003c/a\u003e, although that one assumes ground-truth segmentations are available and models the camera as only a 2D translation.\n",
+ "\u003c/p\u003e\n",
+ "\u003cp\u003e\n",
+ " Note that we consider this algorithm \u0026ldquo;semi-automatic\u0026rdquo; because some tuning may be needed to get pleasing results on arbitrary videos. Tracking failures on the background may show up as spurious foreground objects. Results are sensitive to the outlier thresholds used in RANSAC and segmentation, and you may wish to discard short tracks. You can sample points in a different way (e.g., sampling from multiple frames) and everything will still work, but the \u003cfont face=\"Courier\"\u003eplot_tracks_tails\u003c/font\u003e function uses the input order of the points to choose colors, so you will have to sort the points appropriately.\n",
 "\u003c/p\u003e\n"
 ]
 },
@@ -197,15 +203,6 @@
 "    return points"
 ]
 },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "b7X5ZNCpuemg"
- },
- "source": [
- "## Inference on DAVIS"
- ]
- },
 {
 "cell_type": "code",
 "execution_count": null,
@@ -239,10 +236,11 @@
 "resize_height = 512 # @param {type: \"integer\"}\n",
 "resize_width = 512 # @param {type: \"integer\"}\n",
 "stride = 16 # @param {type: \"integer\"}\n",
+ "query_frame = 0 # @param {type: \"integer\"}\n",
 "\n",
 "height, width = orig_frames.shape[1:3]\n",
 "frames = media.resize_video(orig_frames, (resize_height, resize_width))\n",
- "query_points = sample_grid_points(0, resize_height, resize_width, stride)\n",
+ "query_points = sample_grid_points(query_frame, resize_height, resize_width, stride)\n",
 "batch_size = 64\n",
 "tracks = []\n",
 "visibles = []\n",
@@ -275,23 +273,44 @@
 },
 "outputs": [],
 "source": [
+ "# The inlier point threshold for RANSAC, specified in normalized coordinates\n",
+ "# (points are rescaled to the range [0, 1] for optimization).\n",
+ "ransac_inlier_threshold = 0.07 # @param {type: \"number\"}\n",
+ "# The fraction of a track's points that must be inliers for RANSAC to consider\n",
+ "# the track trustworthy for estimating the homography.\n",
+ "ransac_track_inlier_frac = 0.95 # @param {type: \"number\"}\n",
+ "# After the initial RANSAC, the number of refinement passes used to adjust the\n",
+ "# homographies based on tracks that have been deemed trustworthy.\n",
+ "num_refinement_passes = 2 # @param {type: \"integer\"}\n",
+ "# After the homographies are estimated, a point is considered an outlier if it\n",
+ "# is further than this threshold from the location the homography predicts.\n",
+ "foreground_inlier_threshold = 0.07 # @param {type: \"number\"}\n",
+ "# After the homographies are estimated, a track is considered foreground if\n",
+ "# fewer than this fraction of its points are inliers.\n",
+ "foreground_frac = 0.6 # @param {type: \"number\"}\n",
+ "\n",
+ "\n",
 "occluded = 1.0 - visibles\n",
 "homogs, err, canonical = viz_utils.get_homographies_wrt_frame(\n",
 "    tracks,\n",
 "    occluded,\n",
- "    [width, height]\n",
+ "    [width, height],\n",
+ "    thresh=ransac_inlier_threshold,\n",
+ "    outlier_point_threshold=ransac_track_inlier_frac,\n",
+ "    num_refinement_passes=num_refinement_passes,\n",
 ")\n",
 "\n",
- "inlier_ct = np.sum((err \u003c np.square(0.07)) * visibles, axis=-1)\n",
+ "inliers = (err \u003c np.square(foreground_inlier_threshold)) * visibles\n",
+ "inlier_ct = np.sum(inliers, axis=-1)\n",
 "ratio = inlier_ct / np.maximum(1.0, np.sum(visibles, axis=1))\n",
- "is_fg = ratio \u003c= 0.60\n",
+ "is_fg = ratio \u003c= foreground_frac\n",
 "video = viz_utils.plot_tracks_tails(\n",
 "    orig_frames,\n",
 "    tracks[is_fg],\n",
 "    occluded[is_fg],\n",
 "    homogs\n",
 ")\n",
- "media.show_video(video, fps=16)"
+ "media.show_video(video, fps=24)"
 ]
 }
 ],
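To make "estimating the camera motion as a homography" with RANSAC concrete, here is a minimal sketch using OpenCV. It is illustrative only: `viz_utils.get_homographies_wrt_frame` jointly estimates one homography per frame with refinement passes, whereas this sketch fits a single frame pair, and the helper name `fit_frame_homography` and its arguments are assumptions, not part of tapnet.

```python
# A minimal sketch, not the tapnet implementation: fit the homography that
# maps one frame's tracked points onto a reference frame using RANSAC.
import cv2
import numpy as np


def fit_frame_homography(ref_points, frame_points, both_visible,
                         inlier_threshold_px=4.0):
  """Fits a 3x3 homography from frame_points to ref_points, each shape (N, 2)."""
  src = frame_points[both_visible].astype(np.float32)
  dst = ref_points[both_visible].astype(np.float32)
  # RANSAC repeatedly fits candidate homographies on random 4-point subsets
  # and keeps the candidate with the most inliers within the pixel threshold.
  homog, inlier_mask = cv2.findHomography(src, dst, cv2.RANSAC,
                                          inlier_threshold_px)
  return homog, inlier_mask.ravel().astype(bool)
```

Points the fitted homography explains (the inliers) are background candidates; the notebook's `ransac_inlier_threshold` plays the role of the pixel threshold here, but in normalized [0, 1] coordinates.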
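Similarly, the foreground test in the final cell can be spelled out end to end. The sketch below re-derives an error like `err` under assumed conventions: `tracks` of shape (num_points, num_frames, 2), `visibles` of shape (num_points, num_frames), and one 3x3 homography per frame warping that frame into a shared canonical frame. The notebook takes `err` directly from `viz_utils.get_homographies_wrt_frame`, so treat this as a reading aid, not a drop-in replacement.

```python
# A reading aid under assumed shapes/conventions, not the notebook's code path.
import numpy as np


def apply_homography(homog, points):
  """Applies a 3x3 homography to an array of 2D points with shape (..., 2)."""
  ones = np.ones_like(points[..., :1])
  projected = np.concatenate([points, ones], axis=-1) @ homog.T
  return projected[..., :2] / projected[..., 2:]  # De-homogenize.


def foreground_mask(tracks, visibles, homogs, width, height,
                    inlier_threshold=0.07, foreground_frac=0.6):
  """Flags tracks whose motion the per-frame homographies cannot explain."""
  # Normalized coordinates make the threshold resolution-independent.
  norm = tracks / np.array([width, height], dtype=np.float32)
  # Warp each frame's points into the canonical frame; a background point
  # should land in (nearly) the same canonical location in every frame.
  canonical = np.stack(
      [apply_homography(h, norm[:, t]) for t, h in enumerate(homogs)], axis=1)
  # Per-track mean canonical position, averaged over visible frames only.
  denom = np.maximum(visibles.sum(axis=1, keepdims=True), 1.0)
  mean_pos = (canonical * (visibles / denom)[..., None]).sum(
      axis=1, keepdims=True)
  # Squared deviation from that mean acts as the reprojection error `err`.
  err = np.square(canonical - mean_pos).sum(axis=-1)
  # A visible point is an inlier if it stays within the threshold; a track
  # is foreground if too few of its visible points are inliers.
  inliers = (err < inlier_threshold**2) * visibles
  ratio = inliers.sum(axis=1) / np.maximum(visibles.sum(axis=1), 1.0)
  return ratio <= foreground_frac
```

Under these assumptions, `foreground_mask(tracks, visibles, homogs, width, height)` yields a boolean mask playing the same role as the notebook's `is_fg`.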