Merge pull request #190 from Kracozebr/master
Add deformable conv to repo
haotian-liu authored Oct 5, 2022
2 parents 7e7e624 + 3d9dda3 commit 3f423ed
Showing 18 changed files with 1,739 additions and 101 deletions.
49 changes: 36 additions & 13 deletions INSTALL.md
@@ -1,36 +1,59 @@
## Installation
- Set up a Python3 environment.
- Install [Pytorch](http://pytorch.org/) 1.6.0 and TorchVision.
- Install [TensorRT](https://developer.nvidia.com/tensorrt) 7.1.3.4 and [torch2trt](https://github.com/NVIDIA-AI-IOT/torch2trt) 0.1.0 (*optional* for evaluating models without TensorRT, currently TensorRT optimization only supports devices with [Tensor Cores](https://www.nvidia.com/en-us/data-center/tensor-cores/), and already included in [JetPack SDK](https://developer.nvidia.com/embedded/Jetpack) if using Jetson devices):
1. Install CUDA 10.2/11.0 and cuDNN 8.0.0.
2. Download TensorRT 7.1.3.4 tar file [here](https://developer.nvidia.com/nvidia-tensorrt-7x-download) and install TensorRT (refer to [official documentation](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-713/install-guide/index.html#installing-tar) for more details).
- Install [PyTorch](http://pytorch.org/) 1.7.1 and TorchVision 0.8.2.
- Install [TensorRT](https://developer.nvidia.com/tensorrt) 8.2.1.8 and [torch2trt_dynamic](https://github.com/grimoire/torch2trt_dynamic) v0.5.0 (*optional*: only needed for evaluating models with TensorRT; TensorRT optimization currently supports only devices with [Tensor Cores](https://www.nvidia.com/en-us/data-center/tensor-cores/), and TensorRT is already included in the [JetPack SDK](https://developer.nvidia.com/embedded/Jetpack) on Jetson devices):
1. Install CUDA 10.2/11.4 and cuDNN 8.2.
2. Download TensorRT 8.2.1.8 tar file [here](https://developer.nvidia.com/nvidia-tensorrt-8x-download) and install TensorRT (refer to [official documentation](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-821/install-guide/index.html#installing-tar) for more details).
```Shell
tar xzvf TensorRT-${version}.${os}.${arch}-gnu.${cuda}.${cudnn}.tar.gz
version="8.x.x.x"
arch=$(uname -m)
cuda="cuda-x.x"
cudnn="cudnn8.x"
tar xzvf TensorRT-${version}.Linux.${arch}-gnu.${cuda}.${cudnn}.tar.gz
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<TensorRT-${version}/lib>

cd TensorRT-${version}/python
pip3 install tensorrt-*-cp3x-none-linux_x86_64.whl
python3 -m pip install tensorrt-*-cp3x-none-linux_x86_64.whl

cd TensorRT-${version}/uff
pip3 install uff-0.6.9-py2.py3-none-any.whl
python3 -m pip install uff-0.6.9-py2.py3-none-any.whl

cd TensorRT-${version}/graphsurgeon
pip3 install graphsurgeon-0.4.5-py2.py3-none-any.whl
python3 -m pip install graphsurgeon-0.4.5-py2.py3-none-any.whl
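# Optional sanity check (an extra step, not from the official install guide):
# the TensorRT Python bindings should import and report the expected version.
python3 -c "import tensorrt; print(tensorrt.__version__)"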
```
3. Install [torch2trt](https://github.com/NVIDIA-AI-IOT/torch2trt).
3. Install [torch2trt_dynamic](https://github.com/grimoire/torch2trt_dynamic).
```Shell
git clone https://github.com/NVIDIA-AI-IOT/torch2trt
cd torch2trt
sudo python setup.py install --plugins
git clone https://github.com/grimoire/torch2trt_dynamic.git torch2trt_dynamic
cd torch2trt_dynamic
python setup.py develop
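# Optional quick import check (an extra step, not part of the upstream instructions)
python3 -c "import torch2trt_dynamic"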
```
4. Install the deformable convolution module for PyTorch if you want to work with yolact_edge+ models. Go to ./external/mod_def_conv and run setup.py:
```Shell
cd external/mod_def_conv
python setup.py install
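# Optional: verify the extension built and is importable
# (the module name mod_dcn_op_v2 comes from external/mod_def_conv/setup.py;
# torch is imported first so the extension can resolve the libtorch symbols)
python3 -c "import torch, mod_dcn_op_v2"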
```
5. Install [amirstan_plugin](https://github.com/grimoire/amirstan_plugin), which contains the deformable convolution plugin with dynamic shapes for TensorRT 8.x. It is needed only if you want to work with yolact_edge+ models.
```Shell
apt install -y software-properties-common
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
apt update && apt install -y cmake
git clone --depth=1 --branch v0.5.0 https://github.com/grimoire/amirstan_plugin.git
cd amirstan_plugin
cmake -DTENSORRT_DIR=/usr/lib/x86_64-linux-gnu -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc
make -j$(nproc)
export AMIRSTAN_LIBRARY_PATH=<amirstan_plugin_root>/lib
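# Optional: confirm the plugin library was built into the lib directory
ls <amirstan_plugin_root>/lib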
```


- Install some other packages:
```Shell
# Cython needs to be installed before pycocotools
pip install cython
pip install opencv-python pillow matplotlib
pip install git+https://github.com/haotian-liu/cocoapi.git#"egg=pycocotools&subdirectory=PythonAPI"
pip install GitPython termcolor tensorboard
pip install GitPython termcolor tensorboard packaging
```
- Clone this repository and enter it:
```Shell
5 changes: 5 additions & 0 deletions README.md
@@ -183,6 +183,11 @@ python train.py --config=yolact_edge_vid_trainflow_config --resume=./weights/yol
python train.py --config=yolact_edge_vid_config --resume=./weights/yolact_edge_vid_trainflow_144_100000.pth
```

### Experimental
You can try training yolact_edge+ models with deformable convolutions. For installation instructions, see [INSTALL.md](INSTALL.md).
```Shell
python train.py --config=yolact_edge_plus_config
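# Evaluation after training presumably follows the same pattern as the other
# configs (an untested sketch; the checkpoint name below is a placeholder):
# python eval.py --config=yolact_edge_plus_config --trained_model=./weights/<checkpoint>.pth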
```

### Custom Datasets
You can also train on your own dataset by following these steps:
68 changes: 38 additions & 30 deletions docker/Dockerfile
@@ -1,43 +1,51 @@
FROM nvcr.io/nvidia/cuda:11.0.3-cudnn8-devel-ubuntu18.04
FROM nvcr.io/nvidia/cuda:11.4.0-cudnn8-devel-ubuntu18.04


ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y \
git wget sudo build-essential \
git wget build-essential \
python3 python3-setuptools python3-pip python3-dev python3-tk \
ffmpeg libsm6 libxext6
RUN ln -svf /usr/bin/python3 /usr/bin/python
RUN python -m pip install --upgrade --force pip

# TensorRT
ARG version="8.0.5.39-1+cuda11.0"
RUN apt-get update && apt-get install -y libcudnn8=${version} libcudnn8-dev=${version} && apt-mark hold libcudnn8 libcudnn8-dev
ARG version="7.2.3-1+cuda11.0"
# CUDNN
ARG version="8.2.2.26-1+cuda11.4"
RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages \
libcudnn8=${version} libcudnn8-dev=${version} && apt-mark hold libcudnn8 libcudnn8-dev

# Install Tensorrt 8.2.1.8
ARG version="8.2.1-1+cuda11.4"
RUN apt-get update && \
apt-get install -y libnvinfer7=${version} libnvonnxparsers7=${version} libnvparsers7=${version} libnvinfer-plugin7=${version} libnvinfer-dev=${version} libnvonnxparsers-dev=${version} libnvparsers-dev=${version} libnvinfer-plugin-dev=${version} python3-libnvinfer=${version} && \
apt-mark hold libnvinfer7 libnvonnxparsers7 libnvparsers7 libnvinfer-plugin7 libnvinfer-dev libnvonnxparsers-dev libnvparsers-dev libnvinfer-plugin-dev python3-libnvinfer

# create a non-root user
ARG USER_ID=1000
ARG USER=appuser
RUN useradd -m --no-log-init --system --uid ${USER_ID} ${USER} -g sudo
RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
USER ${USER}
WORKDIR /home/${USER}
ENV PATH="/home/${USER}/.local/bin:${PATH}"
apt-get install -y libnvinfer8=${version} libnvonnxparsers8=${version} libnvparsers8=${version} libnvinfer-plugin8=${version} libnvinfer-dev=${version} libnvonnxparsers-dev=${version} libnvparsers-dev=${version} libnvinfer-plugin-dev=${version} python3-libnvinfer=${version} && \
apt-mark hold libnvinfer8 libnvonnxparsers8 libnvparsers8 libnvinfer-plugin8 libnvinfer-dev libnvonnxparsers-dev libnvparsers-dev libnvinfer-plugin-dev python3-libnvinfer


# Install dependencies
RUN pip install --user cython opencv-python pillow matplotlib GitPython termcolor tensorboard
RUN pip install --user git+https://github.com/haotian-liu/cocoapi.git#"egg=pycocotools&subdirectory=PythonAPI"
RUN pip install --user torch==1.7.1+cu110 torchvision==0.8.2+cu110 -f https://download.pytorch.org/whl/torch_stable.html

# torch2trt
RUN git clone https://github.com/NVIDIA-AI-IOT/torch2trt
WORKDIR /home/${USER}/torch2trt
RUN python setup.py install --plugins --user

WORKDIR /home/${USER}
RUN ln -s /yolact_edge
RUN ln -s /datasets
WORKDIR /home/${USER}/yolact_edge
RUN pip install cython opencv-python pillow matplotlib GitPython termcolor tensorboard packaging
RUN pip install git+https://github.com/haotian-liu/cocoapi.git#"egg=pycocotools&subdirectory=PythonAPI"
RUN pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 -f https://download.pytorch.org/whl/torch_stable.html

# torch2trt_dynamic
WORKDIR /root
RUN git clone https://github.com/grimoire/torch2trt_dynamic.git torch2trt_dynamic && \
cd torch2trt_dynamic && \
python setup.py develop

# installing plugins for torch2trt_dynamic
WORKDIR /root

RUN apt install -y software-properties-common && \
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null && \
apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main' && \
apt update && apt install -y cmake && \
git clone --depth=1 --branch v0.5.0 https://github.com/grimoire/amirstan_plugin.git && \
cd amirstan_plugin && \
cmake -DTENSORRT_DIR=/usr/lib/x86_64-linux-gnu -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc && \
make -j$(nproc)

ENV AMIRSTAN_LIBRARY_PATH=/root/amirstan_plugin/lib

WORKDIR /root/yolact_edge

ENV LANG C.UTF-8
30 changes: 21 additions & 9 deletions docker/Dockerfile.xavier
@@ -15,7 +15,7 @@
# (ex. In this case, use AGX Xavier)
# $ docker build --build-arg L4T_IMAGE=nvcr.io/nvidia/l4t-pytorch:r32.5.0-pth1.6-py3 --build-arg JETSON_PLATFORM=t194

ARG L4T_IMAGE=nvcr.io/nvidia/l4t-pytorch:r32.4.4-pth1.6-py3
ARG L4T_IMAGE=nvcr.io/nvidia/l4t-pytorch:r32.7.1-pth1.10-py3
ARG JETSON_PLATFORM=t194

FROM ${L4T_IMAGE}
@@ -42,19 +42,31 @@ RUN L4T_REPO_VERSION=`python3 -c 'import sys; print(".".join((sys.argv[1].split(
RUN ln -svf /usr/bin/python3 /usr/bin/python
RUN python -m pip install --upgrade --force pip

# # Install dependencies
# Install dependencies
RUN pip install cython pillow matplotlib GitPython termcolor tensorboard
RUN pip install git+https://github.com/haotian-liu/cocoapi.git#"egg=pycocotools&subdirectory=PythonAPI"

# torch2trt
# torch2trt_dynamic
WORKDIR /root
RUN git clone https://github.com/NVIDIA-AI-IOT/torch2trt &&\
cd torch2trt &&\
python setup.py install --plugins
RUN git clone https://github.com/grimoire/torch2trt_dynamic.git torch2trt_dynamic
WORKDIR /root/torch2trt_dynamic
RUN python setup.py develop


# installing plugins
WORKDIR /root
RUN ln -s /yolact_edge
RUN ln -s /datasets
WORKDIR /root/yolact_edge
RUN apt update && apt install -y software-properties-common && \
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null && \
apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main' && \
apt update && apt install -y cmake && \
git clone --depth=1 https://github.com/grimoire/amirstan_plugin.git && \
cd amirstan_plugin && \
git submodule update --init --progress --depth=1 && \
cmake -DTENSORRT_DIR=/usr/include/aarch64-linux-gnu && \
make -j$(nproc)

ENV AMIRSTAN_LIBRARY_PATH=/root/amirstan_plugin/lib

WORKDIR /workspace

ENV LANG C.UTF-8
4 changes: 2 additions & 2 deletions docker/run_xavier.sh
@@ -2,7 +2,7 @@ docker build -t yolact_edge -f Dockerfile.xavier .
docker run -it --rm --net=host --privileged \
--runtime nvidia -e DISPLAY=$DISPLAY \
-v /tmp/.X11-unix/:/tmp/.X11-unix \
-v $PWD/../:/yolact_edge/:rw \
-v $PWD/../:/workspace/yolact_edge/:rw \
--device /dev/video0:/dev/video0 \
yolact_edge \
python3 eval.py --trained_model=./weights/yolact_edge_resnet50_54_800000.pth \
@@ -11,4 +11,4 @@ docker run -it --rm --net=host --privileged \
--video_multiframe=2 \
--trt_batch_size 2 \
--video=0 \
--calib_images ./data/coco/images
--calib_images ./data/coco/images
8 changes: 5 additions & 3 deletions docker/start.sh
@@ -3,8 +3,10 @@
SOURCE_CODE=$1
DATASETS=$2

docker build -t yolact_edge:11.4_cuda8.2 -f Dockerfile .

docker run --gpus all -it --name=yolact_edge \
--shm-size=8gb --env="DISPLAY" --volume="/tmp/.X11-unix:/tmp/.X11-unix:rw" \
-v $SOURCE_CODE:/yolact_edge/:rw \
--shm-size=64gb --env="DISPLAY" --volume="/tmp/.X11-unix:/tmp/.X11-unix:rw" \
-v $SOURCE_CODE:/root/yolact_edge/:rw \
-v $DATASETS:/datasets/:ro \
yolact_edge_image
yolact_edge:11.4_cuda8.2
16 changes: 16 additions & 0 deletions external/mod_def_conv/setup.py
@@ -0,0 +1,16 @@
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

if __name__ == '__main__':
setup(
name='mod_dcn_op_v2',
ext_modules=[
CUDAExtension(
'mod_dcn_op_v2',
sources=['src/modulated_deform_conv.cpp', 'src/modulated_deform_conv_cuda.cu'],
)
],
cmdclass={
'build_ext': BuildExtension
}
)
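
Once this extension is built (step 4 of INSTALL.md above), a minimal import check could look like the sketch below; the only assumption carried over is the module name `mod_dcn_op_v2` from the `setup()` call.
```Python
# Illustrative sanity check (not part of the commit).
# torch is imported first so the extension can resolve the libtorch symbols it links against.
import torch
import mod_dcn_op_v2

print("loaded deformable conv extension from:", mod_dcn_op_v2.__file__)
```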
112 changes: 112 additions & 0 deletions external/mod_def_conv/src/common_cuda_helper.hpp
@@ -0,0 +1,112 @@
#ifndef COMMON_CUDA_HELPER
#define COMMON_CUDA_HELPER

#include <cuda.h>

#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)

#define THREADS_PER_BLOCK 512

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

inline int GET_BLOCKS(const int N) {
int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
int max_block_num = 4096;
return min(optimal_block_num, max_block_num);
}

template <typename T>
__device__ T bilinear_interpolate(const T* input, const int height,
const int width, T y, T x,
const int index /* index for debug only*/) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;

if (y <= 0) y = 0;
if (x <= 0) x = 0;

int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;

if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}

if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}

T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = input[y_low * width + x_low];
T v2 = input[y_low * width + x_high];
T v3 = input[y_high * width + x_low];
T v4 = input[y_high * width + x_high];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);

return val;
}

template <typename T>
__device__ void bilinear_interpolate_gradient(
const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4,
int& x_low, int& x_high, int& y_low, int& y_high,
const int index /* index for debug only*/) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}

if (y <= 0) y = 0;
if (x <= 0) x = 0;

y_low = (int)y;
x_low = (int)x;

if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}

if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}

T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;

// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);

w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

return;
}
#endif // COMMON_CUDA_HELPER
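
For readers skimming the kernel, the sampling math above boils down to a weighted average of the four neighbouring pixels. Below is a short NumPy sketch of `bilinear_interpolate` written for this summary (illustrative only; it is not part of the committed file).
```Python
import numpy as np

def bilinear_interpolate(data, y, x):
    """Sample data at fractional location (y, x), mirroring the CUDA helper."""
    height, width = data.shape
    # samples falling outside the feature map contribute zero
    if y < -1.0 or y > height or x < -1.0 or x > width:
        return 0.0
    y = max(y, 0.0)
    x = max(x, 0.0)
    y_low, x_low = int(y), int(x)
    if y_low >= height - 1:
        y_high = y_low = height - 1
        y = float(y_low)
    else:
        y_high = y_low + 1
    if x_low >= width - 1:
        x_high = x_low = width - 1
        x = float(x_low)
    else:
        x_high = x_low + 1
    ly, lx = y - y_low, x - x_low
    hy, hx = 1.0 - ly, 1.0 - lx
    # weights of the four neighbouring pixels
    w1, w2, w3, w4 = hy * hx, hy * lx, ly * hx, ly * lx
    return (w1 * data[y_low, x_low] + w2 * data[y_low, x_high]
            + w3 * data[y_high, x_low] + w4 * data[y_high, x_high])

# example: a fractional sample is the weighted average of its 4 neighbours
feat = np.arange(16, dtype=np.float32).reshape(4, 4)
print(bilinear_interpolate(feat, 1.5, 2.5))  # (6 + 7 + 10 + 11) / 4 = 8.5
```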