Improve docs for rainbow visualization

PiperOrigin-RevId: 580880342 Change-Id: I6feecd8717d783ffb8cb31d7f969e9c3af96cb0e
google-deepmind · Nov 9, 2023 · 0d9acd4 · 0d9acd4
1 parent 6102867
commit 0d9acd4
Showing 1 changed file with 33 additions and 14 deletions.
diff --git a/colabs/tapir_rainbow_demo.ipynb b/colabs/tapir_rainbow_demo.ipynb
@@ -30,7 +30,13 @@
         "\u003c/p\u003e\n",
         "\n",
         "\u003cp align=\"center\"\u003e\n",
-        "  \u003cimg src=\"https://storage.googleapis.com/dm-tapnet/horsejump_rainbow.gif\" width=\"70%\"/\u003e\n",
+        "  \u003cimg src=\"https://storage.googleapis.com/dm-tapnet/horsejump_rainbow.gif\" width=\"70%\"/\u003e\u003cbr/\u003e\u003cbr/\u003e\n",
+        "\u003c/p\u003e\n",
+        "\u003cp\u003e\n",
+        "  This visualization uses TAPIR to show how an object moves through space, even if the camera is tracking the object.  It begins by tracking points densely on a grid.  Then it estimates the camera motion as a homography (i.e., assuming either a planar background or a camera that rotates but does not move).  Any points that move according to that homography are removed.  Then we generate a \u0026ldquo;rainbow\u0026rdquo; visualization, where the tracked points leave \u0026ldquo;tails\u0026rdquo; that follow the camera motion, so it looks like the earlier positions of points are frozen in space.  This visualization was inspired by a similar one from \u003ca href=\"https://omnimotion.github.io/\"\u003eOmniMotion\u003c/a\u003e, although that one assumes ground-truth segmentations are available and models the camera motion as only a 2D translation.\n",
+        "\u003c/p\u003e\n",
+        "\u003cp\u003e\n",
+        "  Note that we consider this algorithm \u0026ldquo;semi-automatic\u0026rdquo; because it may need some tuning for pleasing results on arbitrary videos.  Tracking failures on the background may show up as foreground objects.  Results are sensitive to the outlier thresholds used in RANSAC and segmentation, and you may wish to discard short tracks.  You can sample in a different way (e.g., sampling points from multiple frames) and everything will work, but the \u003cfont face=\"Courier\"\u003eplot_tracks_tails\u003c/font\u003e function uses the input order of the points to choose colors, so you will have to sort the points appropriately.\n",
         "\u003c/p\u003e\n"
       ]
     },
@@ -197,15 +203,6 @@
         "  return points"
       ]
     },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "b7X5ZNCpuemg"
-      },
-      "source": [
-        "## Inference on DAVIS"
-      ]
-    },
     {
       "cell_type": "code",
       "execution_count": null,
@@ -239,10 +236,11 @@
         "resize_height = 512  # @param {type: \"integer\"}\n",
         "resize_width = 512  # @param {type: \"integer\"}\n",
         "stride = 16  # @param {type: \"integer\"}\n",
+        "query_frame = 0  # @param {type: \"integer\"}\n",
         "\n",
         "height, width = orig_frames.shape[1:3]\n",
         "frames = media.resize_video(orig_frames, (resize_height, resize_width))\n",
-        "query_points = sample_grid_points(0, resize_height, resize_width, stride)\n",
+        "query_points = sample_grid_points(query_frame, resize_height, resize_width, stride)\n",
         "batch_size = 64\n",
         "tracks = []\n",
         "visibles = []\n",
@@ -275,23 +273,44 @@
       },
       "outputs": [],
       "source": [
+        "# The inlier point threshold for ransac, specified in normalized coordinates\n",
+        "# (points are rescaled to the range [0, 1] for optimization).\n",
+        "ransac_inlier_threshold = 0.07  # @param {type: \"number\"}\n",
+        "# What fraction of points need to be inliers for RANSAC to consider a trajectory\n",
+        "# to be trustworthy for estimating the homography.\n",
+        "ransac_track_inlier_frac = 0.95  # @param {type: \"number\"}\n",
+        "# After initial RANSAC, how many refinement passes to adjust the homographies\n",
+        "# based on tracks that have been deemed trustworthy.\n",
+        "num_refinement_passes = 2  # @param {type: \"integer\"}\n",
+        "# After homographies are estimated, consider points to be outliers if they are\n",
+        "# further than this threshold.\n",
+        "foreground_inlier_threshold = 0.07  # @param {type: \"number\"}\n",
+        "# After homographies are estimated, consider a track to be part of the\n",
+        "# foreground if at most this fraction of its visible points are inliers.\n",
+        "foreground_frac = 0.6  # @param {type: \"number\"}\n",
+        "\n",
+        "\n",
         "occluded = 1.0 - visibles\n",
         "homogs, err, canonical = viz_utils.get_homographies_wrt_frame(\n",
         "    tracks,\n",
         "    occluded,\n",
         "    [width, height],\n",
+        "    thresh=ransac_inlier_threshold,\n",
+        "    outlier_point_threshold=ransac_track_inlier_frac,\n",
+        "    num_refinement_passes=num_refinement_passes,\n",
         ")\n",
         "\n",
-        "inlier_ct = np.sum((err \u003c np.square(0.07)) * visibles, axis=-1)\n",
+        "inliers = (err \u003c np.square(foreground_inlier_threshold)) * visibles\n",
+        "inlier_ct = np.sum(inliers, axis=-1)\n",
         "ratio = inlier_ct / np.maximum(1.0, np.sum(visibles, axis=1))\n",
-        "is_fg = ratio \u003c= 0.60\n",
+        "is_fg = ratio \u003c= foreground_frac\n",
         "video = viz_utils.plot_tracks_tails(\n",
         "    orig_frames,\n",
         "    tracks[is_fg],\n",
         "    occluded[is_fg],\n",
         "    homogs\n",
         ")\n",
-        "media.show_video(video, fps=16)"
+        "media.show_video(video, fps=24)"
       ]
     }
   ],