diff --git a/examples/tutorial/colab_dibr_tutorial.ipynb b/examples/tutorial/colab_dibr_tutorial.ipynb new file mode 100644 index 000000000..0e567cfd1 --- /dev/null +++ b/examples/tutorial/colab_dibr_tutorial.ipynb @@ -0,0 +1,513 @@ +{ + "nbformat": 4, + "nbformat_minor": 5, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + }, + "colab": { + "name": "colab-dibr-tutorial.ipynb", + "private_outputs": true, + "provenance": [], + "collapsed_sections": [] + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "occupied-senegal" + }, + "source": [ + "# Optimizing a mesh using a Differentiable Renderer\n", + "\n", + "Differentiable rendering can be used to optimize the underlying 3D properties, like geometry and lighting, by backpropagating gradients from the loss in the image space. In this tutorial, we optimize geometry and texture of a single object based on a dataset of rendered ground truth views. This tutorial demonstrates functionality in `kaolin.render.mesh`, including the key `dibr_rasterization` function. See detailed [API documentation](https://kaolin.readthedocs.io/en/latest/modules/kaolin.render.mesh.html).\n", + "\n", + "In addition, we demonstrate the use of [Kaolin's 3D checkpoints and training visualization](https://kaolin.readthedocs.io/en/latest/modules/kaolin.visualize.html) with the [Omniverse Kaolin App](https://docs.omniverse.nvidia.com/app_kaolin/app_kaolin/user_manual.html)." + ], + "id": "occupied-senegal" + }, + { + "cell_type": "code", + "metadata": { + "id": "RjOKDB_KznvW" + }, + "source": [ + "!nvidia-smi" + ], + "id": "RjOKDB_KznvW", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "OJWoX0sIzoKe" + }, + "source": [ + "# BUILD KAOLIN\n", + "# https://spltech.co.uk/how-to-turn-2d-photos-into-a-3d-model-using-nvidia-kaolin-and-pytorch-a-3d-deep-learning-tutorial/\n", + "%cd /content\n", + "!rm -rf kaolin\n", + "!git clone --recursive https://github.com/NVIDIAGameWorks/kaolin\n", + "%cd /content/kaolin\n", + "!pip install torch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 torchtext==0.8.1 cython==0.29.20 usd-core==20.11\n", + "!python setup.py develop" + ], + "id": "OJWoX0sIzoKe", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Z5nHFiFrsVuw" + }, + "source": [ + "# WRANGLE\n", + "%cd /content/kaolin/examples/samples/\n", + "!unzip rendered_clock.zip" + ], + "id": "Z5nHFiFrsVuw", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "sharp-gibson" + }, + "source": [ + "%cd /content/kaolin/\n", + "\n", + "import json\n", + "import os\n", + "import glob\n", + "import time\n", + "\n", + "from PIL import Image\n", + "import torch\n", + "import numpy as np\n", + "from matplotlib import pyplot as plt\n", + "\n", + "import kaolin as kal\n", + "\n", + "# path to the rendered image (using the data synthesizer)\n", + "rendered_path = \"/content/kaolin/examples/samples/rendered_clock/\"\n", + "# path to the output logs (readable with the training visualizer in the omniverse app)\n", + "logs_path = './logs/'\n", + "\n", + "# We initialize the timelapse that will store USD for the 
visualization apps\n", + "timelapse = kal.visualize.Timelapse(logs_path)" + ], + "id": "sharp-gibson", + "execution_count": null, + "outputs": [] + },
+ { + "cell_type": "code", + "metadata": { + "id": "alike-voluntary" + }, + "source": [ + "# Hyperparameters\n", + "num_epoch = 40\n", + "batch_size = 2\n", + "laplacian_weight = 0.1\n", + "flat_weight = 0.001\n", + "image_weight = 0.1\n", + "mask_weight = 1.\n", + "lr = 5e-2\n", + "scheduler_step_size = 15\n", + "scheduler_gamma = 0.5\n", + "\n", + "texture_res = 400\n", + "\n", + "# select camera angles for best visualization\n", + "test_batch_ids = [2, 5, 10]\n", + "test_batch_size = len(test_batch_ids)" + ], + "id": "alike-voluntary", + "execution_count": null, + "outputs": [] + },
+ { + "cell_type": "markdown", + "metadata": { + "id": "vocal-anderson" + }, + "source": [ + "# Generating Training Data\n", + "\n", + "To optimize a mesh, typical training data includes RGB images and segmentation masks. One way to generate this data is to use the Data Generator in the [Omniverse Kaolin App](https://docs.omniverse.nvidia.com/app_kaolin/app_kaolin/user_manual.html#data-generator). We provide sample output of the app in `examples/samples/`.\n", + "\n", + "## Parse synthetic data\n", + "We first need to parse the synthetic data generated by the Omniverse app.\n", + "The app generates one file per data type (e.g. depth map, RGB image, segmentation map), plus an additional JSON metadata file.\n", + "\n", + "The JSON file contains two main fields:\n", + "- camera_properties: all the data related to camera settings, such as \"clipping_range\", \"horizontal_aperture\", \"focal_length\", \"tf_mat\"\n", + "- asset_transforms: the transformations applied by the [Omniverse Kaolin App](https://docs.omniverse.nvidia.com/app_kaolin/app_kaolin/user_manual.html#data-generator), such as rotation / translation between objects or normalization." + ], + "id": "vocal-anderson" + },
+ { + "cell_type": "code", + "metadata": { + "id": "minus-thanks" + }, + "source": [ + "num_views = len(glob.glob(os.path.join(rendered_path,'*_rgb.png')))\n", + "train_data = []\n", + "for i in range(num_views):\n", + "    data = kal.io.render.import_synthetic_view(\n", + "        rendered_path, i, rgb=True, semantic=True)\n", + "    train_data.append(data)\n", + "\n", + "dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, pin_memory=True)" + ], + "id": "minus-thanks", + "execution_count": null, + "outputs": [] + },
+ { + "cell_type": "markdown", + "metadata": { + "id": "cardiac-korea" + }, + "source": [ + "# Loading the Sphere Template\n", + "\n", + "The optimization starts by deforming an input template mesh to match the input images. We will use a sphere template, which works best for objects without topological holes. For convenience we use \"/kaolin/examples/samples/sphere.obj\"."
+ ], + "id": "cardiac-korea" + },
+ { + "cell_type": "code", + "metadata": { + "id": "closed-female" + }, + "source": [ + "mesh = kal.io.obj.import_mesh('/content/kaolin/examples/samples/sphere.obj', with_materials=True)\n", + "# the sphere is usually too small (this scale is fine-tuned for the clock)\n", + "vertices = mesh.vertices.cuda().unsqueeze(0) * 75\n", + "vertices.requires_grad = True\n", + "faces = mesh.faces.cuda()\n", + "uvs = mesh.uvs.cuda().unsqueeze(0)\n", + "face_uvs_idx = mesh.face_uvs_idx.cuda()\n", + "\n", + "\n", + "face_uvs = kal.ops.mesh.index_vertices_by_faces(uvs, face_uvs_idx).detach()\n", + "face_uvs.requires_grad = False\n", + "\n", + "texture_map = torch.ones((1, 3, texture_res, texture_res), dtype=torch.float, device='cuda',\n", + "                         requires_grad=True)\n", + "\n", + "# The topology of the mesh and the UVs are constant\n", + "# so we only need to initialize them at the first iteration\n", + "timelapse.add_mesh_batch(\n", + "    iteration=0,\n", + "    category='optimized_mesh',\n", + "    faces_list=[mesh.faces.cpu()],\n", + "    uvs_list=[mesh.uvs.cpu()],\n", + "    face_uvs_idx_list=[mesh.face_uvs_idx.cpu()],\n", + ")\n" + ], + "id": "closed-female", + "execution_count": null, + "outputs": [] + },
+ { + "cell_type": "markdown", + "metadata": { + "id": "needed-findings" + }, + "source": [ + "# Preparing the losses and regularizer\n", + "\n", + "During training we will use several losses:\n", + "- an image loss: an L1 loss on the RGB image.\n", + "- a mask loss: an Intersection over Union (IoU) loss between the ground-truth segmentation mask and the soft_mask output by the DIB-R rasterizer.\n", + "- a laplacian loss: to avoid deformations that are too strong.\n", + "- a flat loss: to keep the surface smooth and avoid intersecting faces.\n", + "\n", + "For these we need to compute the Laplacian matrix and some adjacency information\n", + "(the indices of the faces connected to each edge).\n" + ], + "id": "needed-findings" + },
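+ { + "cell_type": "markdown", + "metadata": { + "id": "loss-formulas-note" + }, + "source": [ + "For reference, the losses computed in the training loop further below can be written as follows (this is only a summary of that code, with $\\Delta v = v - v_{init}$ the vertex displacement, $L$ the uniform Laplacian matrix, $N_v$ the number of vertices, $N_e$ the number of edges, and $n_1, n_2$ the normals of the two faces sharing an edge):\n", + "\n", + "$$\\mathcal{L}_{image} = \\mathrm{mean}\\,|I - I_{gt}| \\qquad \\mathcal{L}_{lap} = 3 N_v \\cdot \\mathrm{mean}\\big((L\\,\\Delta v)^2\\big) \\qquad \\mathcal{L}_{flat} = N_e \\cdot \\mathrm{mean}\\big((n_1 \\cdot n_2 - 1)^2\\big)$$\n", + "\n", + "The mask loss is an IoU-based loss between the soft silhouette returned by the rasterizer and the ground-truth mask, and the total loss is the weighted sum of the four terms with the weights defined in the hyperparameters cell above." + ], + "id": "loss-formulas-note" + },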
\n", + "vertice_shift = torch.zeros((3,), dtype=torch.float, device='cuda',\n", + " requires_grad=True)\n", + "\n", + "def recenter_vertices(vertices, vertice_shift):\n", + " \"\"\"Recenter vertices on vertice_shift for better optimization\"\"\"\n", + " vertices_min = vertices.min(dim=1, keepdim=True)[0]\n", + " vertices_max = vertices.max(dim=1, keepdim=True)[0]\n", + " vertices_mid = (vertices_min + vertices_max) / 2\n", + " vertices = vertices - vertices_mid + vertice_shift\n", + " return vertices\n", + "\n", + "\n", + "nb_faces = faces.shape[0]\n", + "nb_vertices = vertices_init.shape[1]\n", + "face_size = 3\n", + "\n", + "## Set up auxiliary connectivity matrix of edges to faces indexes for the flat loss\n", + "edges = torch.cat([faces[:,i:i+2] for i in range(face_size - 1)] +\n", + " [faces[:,[-1,0]]], dim=0)\n", + "\n", + "edges = torch.sort(edges, dim=1)[0]\n", + "face_ids = torch.arange(nb_faces, device='cuda', dtype=torch.long).repeat(face_size)\n", + "edges, edges_ids = torch.unique(edges, sorted=True, return_inverse=True, dim=0)\n", + "nb_edges = edges.shape[0]\n", + "# edge to faces\n", + "sorted_edges_ids, order_edges_ids = torch.sort(edges_ids)\n", + "sorted_faces_ids = face_ids[order_edges_ids]\n", + "# indices of first occurences of each key\n", + "idx_first = torch.where(\n", + " torch.nn.functional.pad(sorted_edges_ids[1:] != sorted_edges_ids[:-1],\n", + " (1,0), value=1))[0]\n", + "nb_faces_per_edge = idx_first[1:] - idx_first[:-1]\n", + "# compute sub_idx (2nd axis indices to store the faces)\n", + "offsets = torch.zeros(sorted_edges_ids.shape[0], device='cuda', dtype=torch.long)\n", + "offsets[idx_first[1:]] = nb_faces_per_edge\n", + "sub_idx = (torch.arange(sorted_edges_ids.shape[0], device='cuda', dtype=torch.long) -\n", + " torch.cumsum(offsets, dim=0))\n", + "nb_faces_per_edge = torch.cat([nb_faces_per_edge,\n", + " sorted_edges_ids.shape[0] - idx_first[-1:]],\n", + " dim=0)\n", + "max_sub_idx = 2\n", + "edge2faces = torch.zeros((nb_edges, max_sub_idx), device='cuda', dtype=torch.long)\n", + "edge2faces[sorted_edges_ids, sub_idx] = sorted_faces_ids\n", + "\n", + "## Set up auxiliary laplacian matrix for the laplacian loss\n", + "vertices_laplacian_matrix = kal.ops.mesh.uniform_laplacian(\n", + " nb_vertices, faces)" + ], + "id": "considerable-impression", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "joined-cross" + }, + "source": [ + "# Setting up optimizer" + ], + "id": "joined-cross" + }, + { + "cell_type": "code", + "metadata": { + "id": "undefined-eleven" + }, + "source": [ + "optim = torch.optim.Adam(params=[vertices, texture_map, vertice_shift],\n", + " lr=lr)\n", + "scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=scheduler_step_size,\n", + " gamma=scheduler_gamma)\n" + ], + "id": "undefined-eleven", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ecological-suggestion" + }, + "source": [ + "# Training\n", + "\n", + "This toy tutorial optimizes geometry and texture of the mesh directly to demonstrate losses, rasterization and 3D checkpoints available in Kaolin.\n", + "\n", + "These components can be combined with a neural architecture of your choice to learn tasks like image to 3D mesh." 
+ { + "cell_type": "code", + "metadata": { + "scrolled": true, + "id": "immune-companion" + }, + "source": [ + "for epoch in range(num_epoch):\n", + "    for idx, data in enumerate(dataloader):\n", + "        optim.zero_grad()\n", + "        gt_image = data['rgb'].cuda()\n", + "        gt_mask = data['semantic'].cuda()\n", + "        cam_transform = data['metadata']['cam_transform'].cuda()\n", + "        cam_proj = data['metadata']['cam_proj'].cuda()\n", + "\n", + "        ### Prepare mesh data with projection with respect to the camera ###\n", + "        vertices_batch = recenter_vertices(vertices, vertice_shift)\n", + "\n", + "        face_vertices_camera, face_vertices_image, face_normals = \\\n", + "            kal.render.mesh.prepare_vertices(\n", + "                vertices_batch.repeat(batch_size, 1, 1),\n", + "                faces, cam_proj, camera_transform=cam_transform\n", + "            )\n", + "\n", + "        ### Perform Rasterization ###\n", + "        # Construct attributes that the DIB-R rasterizer will interpolate:\n", + "        # the first is the UVs associated with each face,\n", + "        # the second will make a hard segmentation mask\n", + "        face_attributes = [\n", + "            face_uvs.repeat(batch_size, 1, 1, 1),\n", + "            torch.ones((batch_size, nb_faces, 3, 1), device='cuda')\n", + "        ]\n", + "\n", + "        image_features, soft_mask, face_idx = kal.render.mesh.dibr_rasterization(\n", + "            gt_image.shape[1], gt_image.shape[2], face_vertices_camera[:, :, :, -1],\n", + "            face_vertices_image, face_attributes, face_normals[:, :, -1])\n", + "\n", + "        # image_features is a tuple composed of the interpolated attributes of face_attributes\n", + "        texture_coords, mask = image_features\n", + "        image = kal.render.mesh.texture_mapping(texture_coords,\n", + "                                                texture_map.repeat(batch_size, 1, 1, 1),\n", + "                                                mode='bilinear')\n", + "        image = torch.clamp(image * mask, 0., 1.)\n", + "\n", + "        ### Compute Losses ###\n", + "        image_loss = torch.mean(torch.abs(image - gt_image))\n", + "        mask_loss = kal.metrics.render.mask_iou(soft_mask,\n", + "                                                gt_mask.squeeze(-1))\n", + "        # laplacian loss\n", + "        vertices_mov = vertices - vertices_init\n", + "        vertices_mov_laplacian = torch.matmul(vertices_laplacian_matrix, vertices_mov)\n", + "        laplacian_loss = torch.mean(vertices_mov_laplacian ** 2) * nb_vertices * 3\n", + "        # flat loss\n", + "        mesh_normals_e1 = face_normals[:, edge2faces[:, 0]]\n", + "        mesh_normals_e2 = face_normals[:, edge2faces[:, 1]]\n", + "        faces_cos = torch.sum(mesh_normals_e1 * mesh_normals_e2, dim=2)\n", + "        flat_loss = torch.mean((faces_cos - 1) ** 2) * edge2faces.shape[0]\n", + "\n", + "        loss = (\n", + "            image_loss * image_weight +\n", + "            mask_loss * mask_weight +\n", + "            laplacian_loss * laplacian_weight +\n", + "            flat_loss * flat_weight\n", + "        )\n", + "        ### Update the mesh ###\n", + "        loss.backward()\n", + "        optim.step()\n", + "\n", + "    scheduler.step()\n", + "    print(f\"Epoch {epoch} - loss: {float(loss)}\")\n", + "\n", + "    ### Write 3D Checkpoints ###\n", + "    pbr_material = [\n", + "        {'rgb': kal.io.materials.PBRMaterial(diffuse_texture=torch.clamp(texture_map[0], 0., 1.))}\n", + "    ]\n", + "\n", + "    vertices_batch = recenter_vertices(vertices, vertice_shift)\n", + "\n", + "    # We are now adding a new state of the mesh to the timelapse;\n", + "    # we only record the texture and the vertex positions, since the topology is constant\n", + "    timelapse.add_mesh_batch(\n", + "        iteration=epoch,\n", + "        category='optimized_mesh',\n", + "        vertices_list=[vertices_batch[0]],\n", + "        materials_list=pbr_material\n", + "    )" + ], + "id": "immune-companion", + "execution_count": null, + "outputs": [] + },
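+ { + "cell_type": "markdown", + "metadata": { + "id": "save-texture-note" + }, + "source": [ + "Optionally, you can save the learned texture map to disk before visualizing the results. The cell below is a small sketch that only uses the `PIL` and `numpy` imports from the top of the notebook; the output path is arbitrary." + ], + "id": "save-texture-note" + },
+ { + "cell_type": "code", + "metadata": { + "id": "save-texture" + }, + "source": [ + "# Optional: save the learned texture map as a PNG (the output path is arbitrary)\n", + "texture_np = (torch.clamp(texture_map[0], 0., 1.).detach().cpu().permute(1, 2, 0).numpy() * 255).astype(np.uint8)\n", + "Image.fromarray(texture_np).save('/content/learned_texture.png')\n", + "print('Saved texture map to /content/learned_texture.png')" + ], + "id": "save-texture", + "execution_count": null, + "outputs": [] + },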
"markdown", + "metadata": { + "id": "future-intranet" + }, + "source": [ + "# Visualize training\n", + "\n", + "You can now use [the Omniverse app](https://docs.omniverse.nvidia.com/app_kaolin/app_kaolin) to visualize the mesh optimization over training by using the training visualizer on \"./logs/\", where we stored the checkpoints.\n", + "\n", + "You can also show the rendered image generated by DIB-R and the learned texture map with your 2d images libraries." + ], + "id": "future-intranet" + }, + { + "cell_type": "code", + "metadata": { + "id": "communist-skirt" + }, + "source": [ + "with torch.no_grad():\n", + " # This is similar to a training iteration (without the loss part)\n", + " data_batch = [train_data[idx] for idx in test_batch_ids]\n", + " cam_transform = torch.stack([data['metadata']['cam_transform'] for data in data_batch], dim=0).cuda()\n", + " cam_proj = torch.stack([data['metadata']['cam_proj'] for data in data_batch], dim=0).cuda()\n", + "\n", + " vertices_batch = recenter_vertices(vertices, vertice_shift)\n", + "\n", + " face_vertices_camera, face_vertices_image, face_normals = \\\n", + " kal.render.mesh.prepare_vertices(\n", + " vertices_batch.repeat(test_batch_size, 1, 1),\n", + " faces, cam_proj, camera_transform=cam_transform\n", + " )\n", + "\n", + " face_attributes = [\n", + " face_uvs.repeat(test_batch_size, 1, 1, 1),\n", + " torch.ones((test_batch_size, nb_faces, 3, 1), device='cuda')\n", + " ]\n", + "\n", + " image_features, soft_mask, face_idx = kal.render.mesh.dibr_rasterization(\n", + " 256, 256, face_vertices_camera[:, :, :, -1],\n", + " face_vertices_image, face_attributes, face_normals[:, :, -1])\n", + "\n", + " texture_coords, mask = image_features\n", + " image = kal.render.mesh.texture_mapping(texture_coords,\n", + " texture_map.repeat(test_batch_size, 1, 1, 1), \n", + " mode='bilinear')\n", + " image = torch.clamp(image * mask, 0., 1.)\n", + " \n", + " ## Display the rendered images\n", + " f, axarr = plt.subplots(1, test_batch_size, figsize=(7, 22))\n", + " f.subplots_adjust(top=0.99, bottom=0.79, left=0., right=1.4)\n", + " f.suptitle('DIB-R rendering', fontsize=30)\n", + " for i in range(test_batch_size):\n", + " axarr[i].imshow(image[i].cpu().detach())\n", + " \n", + "## Display the texture\n", + "plt.figure(figsize=(10, 10))\n", + "plt.title('2D Texture Map', fontsize=30)\n", + "plt.imshow(torch.clamp(texture_map[0], 0., 1.).cpu().detach().permute(1, 2, 0))" + ], + "id": "communist-skirt", + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file