
Commit 5d6b09c

update rcnn (wang-xinyu#495)
- add pixel_std in preprocess
- fix synchronization error in roialign and cudaMemcpyAsync
- improve coding style, limit line length to less than 120, and so on
- update README.md
- upgrade TensorRT to 7.2
1 parent 8cfc8ee commit 5d6b09c

17 files changed: +363 -197 lines changed

rcnn/BatchedNms.cu: +20 -16

```diff
@@ -1,19 +1,17 @@
-#include "BatchedNmsPlugin.h"
-#include "cuda_utils.h"
-
-#include <algorithm>
-#include <iostream>
-#include <stdexcept>
-#include <cstdint>
-#include <vector>
-#include <cmath>
-
 #include <cuda.h>
 #include <thrust/device_ptr.h>
 #include <thrust/sequence.h>
 #include <thrust/execution_policy.h>
 #include <thrust/gather.h>
 #include <thrust/system/cuda/detail/cub/device/device_radix_sort.cuh>
+#include <cmath>
+#include <algorithm>
+#include <iostream>
+#include <stdexcept>
+#include <cstdint>
+#include <vector>
+#include "BatchedNmsPlugin.h"
+#include "./cuda_utils.h"
 
 namespace nvinfer1 {
 
@@ -64,11 +62,15 @@ int batchedNms(int batch_size,
     // Return required scratch space size cub style
     workspace_size += get_size_aligned<int>(count);  // indices
     workspace_size += get_size_aligned<int>(count);  // indices_sorted
-    workspace_size += get_size_aligned<float>(count); // scores_sorted
+    workspace_size += get_size_aligned<float>(count);  // scores_sorted
 
     size_t temp_size_sort = 0;
-    thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending((void *)nullptr, temp_size_sort,
-      (float *)nullptr, (float *)nullptr, (int *)nullptr, (int *)nullptr, count);
+    thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending(
+      static_cast<void*>(nullptr), temp_size_sort,
+      static_cast<float*>(nullptr),
+      static_cast<float*>(nullptr),
+      static_cast<int*>(nullptr),
+      static_cast<int*>(nullptr), count);
     workspace_size += temp_size_sort;
 
     return workspace_size;
@@ -101,17 +103,19 @@ int batchedNms(int batch_size,
     // Launch actual NMS kernel - 1 block with each thread handling n detections
     // TODO: different device has differnet max threads
     const int max_threads = 1024;
-    int num_per_thread = ceil((float)num_detections / max_threads);
+    int num_per_thread = ceil(static_cast<float>(num_detections) / max_threads);
     batched_nms_kernel << <1, max_threads, 0, stream >> > (num_per_thread, nms_thresh, num_detections,
       indices_sorted, scores_sorted, in_classes, in_boxes);
 
     // Re-sort with updated scores
     thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
-      scores_sorted, scores_sorted, indices_sorted, indices, num_detections, 0, sizeof(*scores_sorted) * 8, stream);
+      scores_sorted, scores_sorted, indices_sorted, indices,
+      num_detections, 0, sizeof(*scores_sorted) * 8, stream);
 
     // Gather filtered scores, boxes, classes
     num_detections = min(detections_per_im, num_detections);
-    cudaMemcpyAsync(out_scores, scores_sorted, num_detections * sizeof *scores_sorted, cudaMemcpyDeviceToDevice, stream);
+    cudaMemcpyAsync(out_scores, scores_sorted, num_detections * sizeof *scores_sorted,
+      cudaMemcpyDeviceToDevice, stream);
     if (num_detections < detections_per_im) {
       thrust::fill_n(on_stream, out_scores + num_detections, detections_per_im - num_detections, 0);
     }
```
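The null-pointer call in `batchedNms` is deliberate: CUB's radix sort is a two-pass API, where a first call with a null temp buffer only reports the required scratch size, and a second call with real storage does the sort. A minimal standalone sketch of that pattern, assuming the same thrust-bundled cub headers the plugin uses (the helper name is hypothetical):

```cpp
#include <cuda_runtime.h>
#include <thrust/system/cuda/detail/cub/device/device_radix_sort.cuh>

// Sort (score, index) pairs in descending score order on a stream.
void sortScoresDescending(const float* scores_in, float* scores_out,
                          const int* idx_in, int* idx_out,
                          int count, cudaStream_t stream) {
    // Pass 1: null temp storage -> CUB only writes the required byte count.
    size_t temp_bytes = 0;
    thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending(
        static_cast<void*>(nullptr), temp_bytes,
        scores_in, scores_out, idx_in, idx_out, count,
        0, sizeof(float) * 8, stream);

    void* temp = nullptr;
    cudaMalloc(&temp, temp_bytes);  // the plugin carves this out of its TensorRT workspace instead

    // Pass 2: real temp storage -> the sort is enqueued on `stream`.
    thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending(
        temp, temp_bytes,
        scores_in, scores_out, idx_in, idx_out, count,
        0, sizeof(float) * 8, stream);

    cudaStreamSynchronize(stream);  // ensure the sort finished before freeing temp
    cudaFree(temp);
}
```

The "Return required scratch space size cub style" branch above uses pass 1 alone, which is why it can add `temp_size_sort` into `workspace_size` without ever touching the GPU.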

rcnn/BatchedNmsPlugin.h: +4 -4

```diff
@@ -32,7 +32,7 @@ class BatchedNmsPlugin : public IPluginV2Ext {
 
     size_t _count;
 
-protected:
+ protected:
     void deserialize(void const* data, size_t length) {
         const char* d = static_cast<const char*>(data);
         read(d, _nms_thresh);
@@ -52,7 +52,7 @@ class BatchedNmsPlugin : public IPluginV2Ext {
         write(d, _count);
     }
 
-public:
+ public:
     BatchedNmsPlugin(float nms_thresh, int detections_per_im)
         : _nms_thresh(nms_thresh), _detections_per_im(detections_per_im) {
         assert(nms_thresh > 0);
@@ -154,7 +154,7 @@ class BatchedNmsPlugin : public IPluginV2Ext {
         return new BatchedNmsPlugin(_nms_thresh, _detections_per_im, _count);
     }
 
-private:
+ private:
     template<typename T> void write(char*& buffer, const T& val) const {
         *reinterpret_cast<T*>(buffer) = val;
         buffer += sizeof(T);
@@ -167,7 +167,7 @@ class BatchedNmsPlugin : public IPluginV2Ext {
 };
 
 class BatchedNmsPluginCreator : public IPluginCreator {
-public:
+ public:
     BatchedNmsPluginCreator() {}
 
     const char *getPluginNamespace() const override {
```
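The `write`/`read` templates being re-indented here implement the cursor-based serialization these plugins share: each POD field is copied at a moving buffer cursor, the cursor advances by `sizeof(T)`, and `getSerializationSize()` must return exactly the number of bytes the cursor moves. A self-contained sketch of that scheme; the `NmsParams` struct is a hypothetical stand-in for the plugin's members, and it uses `memcpy` where the plugin uses `reinterpret_cast` assignment:

```cpp
#include <cstddef>
#include <cstring>

// Copy a POD value at the cursor, then advance the cursor.
template <typename T>
void write(char*& buffer, const T& val) {
    std::memcpy(buffer, &val, sizeof(T));
    buffer += sizeof(T);
}

template <typename T>
void read(const char*& buffer, T& val) {
    std::memcpy(&val, buffer, sizeof(T));
    buffer += sizeof(T);
}

struct NmsParams {  // hypothetical stand-in for the plugin's fields
    float nms_thresh;
    int detections_per_im;

    size_t serializationSize() const {
        return sizeof(nms_thresh) + sizeof(detections_per_im);
    }
    void serialize(void* data) const {
        char* d = static_cast<char*>(data);
        write(d, nms_thresh);
        write(d, detections_per_im);
    }
    void deserialize(const void* data) {
        const char* d = static_cast<const char*>(data);
        read(d, nms_thresh);
        read(d, detections_per_im);
    }
};
```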

rcnn/CMakeLists.txt: +2 -2

```diff
@@ -18,8 +18,8 @@ include_directories(${PROJECT_SOURCE_DIR}/include)
 include_directories(/usr/local/cuda-10.2/include)
 link_directories(/usr/local/cuda-10.2/lib64)
 # tensorrt
-include_directories(/home/jushi/TensorRT-7.0.0.11/include)
-link_directories(/home/jushi/TensorRT-7.0.0.11/lib)
+include_directories(/home/jushi/TensorRT-7.2.1.6/include)
+link_directories(/home/jushi/TensorRT-7.2.1.6/lib)
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
```

rcnn/PredictorDecode.cu: +23 -15

```diff
@@ -1,22 +1,23 @@
-#include "PredictorDecodePlugin.h"
-#include "cuda_utils.h"
-
-#include <algorithm>
-#include <cstdint>
-
 #include <thrust/device_ptr.h>
 #include <thrust/sequence.h>
 #include <thrust/execution_policy.h>
 #include <thrust/gather.h>
 #include <thrust/system/cuda/detail/cub/device/device_radix_sort.cuh>
 #include <thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh>
 
-namespace nvinfer1 {
+#include <algorithm>
+#include <cstdint>
 
-int predictorDecode(int batchSize, const void *const *inputs, void **outputs, unsigned int num_boxes, unsigned int num_classes,
-    unsigned int image_height, unsigned int image_width, const std::vector<float>& bbox_reg_weights, void *workspace, size_t workspace_size,
-    cudaStream_t stream) {
+#include "PredictorDecodePlugin.h"
+#include "./cuda_utils.h"
+
+namespace nvinfer1 {
 
+int predictorDecode(int batchSize, const void *const *inputs,
+    void **outputs, unsigned int num_boxes, unsigned int num_classes,
+    unsigned int image_height, unsigned int image_width,
+    const std::vector<float>& bbox_reg_weights, void *workspace,
+    size_t workspace_size, cudaStream_t stream) {
     int scores_size = num_boxes * num_classes;
 
     if (!workspace || !workspace_size) {
@@ -27,15 +28,22 @@ int predictorDecode(int batchSize, const void *const *inputs, void **outputs, un
         workspace_size += get_size_aligned<float>(scores_size);  // scores_sorted
 
         size_t temp_size_sort = 0;
-        thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending((void *)nullptr, temp_size_sort,
-          (float *)nullptr, (float *)nullptr, (int *)nullptr, (int *)nullptr, scores_size);
+        thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending(
+          static_cast<void*>(nullptr), temp_size_sort,
+          static_cast<float*>(nullptr),
+          static_cast<float*>(nullptr),
+          static_cast<int*>(nullptr),
+          static_cast<int*>(nullptr),
+          scores_size);
         workspace_size += temp_size_sort;
 
         return workspace_size;
     }
 
     auto bbox_reg_weights_d = get_next_ptr<float>(bbox_reg_weights.size(), workspace, workspace_size);
-    cudaMemcpyAsync(bbox_reg_weights_d, bbox_reg_weights.data(), bbox_reg_weights.size() * sizeof *bbox_reg_weights_d, cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(bbox_reg_weights_d, bbox_reg_weights.data(),
+        bbox_reg_weights.size() * sizeof *bbox_reg_weights_d,
+        cudaMemcpyHostToDevice, stream);
 
     auto on_stream = thrust::cuda::par.on(stream);
 
@@ -79,8 +87,8 @@ int predictorDecode(int batchSize, const void *const *inputs, void **outputs, un
             boxes = float4{
                 max(0.0f, pred_ctr_x - 0.5f * pred_w),
                 max(0.0f, pred_ctr_y - 0.5f * pred_h),
-                min(pred_ctr_x + 0.5f * pred_w, (float)image_width),
-                min(pred_ctr_y + 0.5f * pred_h, (float)image_width)
+                min(pred_ctr_x + 0.5f * pred_w, static_cast<float>(image_width)),
+                min(pred_ctr_y + 0.5f * pred_h, static_cast<float>(image_width))
             };
 
             // filter empty boxes
```
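The clamping in the last hunk is the tail end of the standard Faster R-CNN box regression that `predictorDecode` performs: deltas are divided by `bbox_reg_weights`, applied to the proposal's center and size, and the corners are clipped to the image. A hedged scalar sketch of that decode; note that the sketch clamps y against `image_height`, whereas the kernel above uses `image_width` for both coordinates:

```cpp
#include <algorithm>
#include <cmath>

struct Box { float x1, y1, x2, y2; };

// Standard Faster R-CNN delta-to-box decode; weights = {wx, wy, ww, wh}.
Box decodeBox(const Box& proposal, float dx, float dy, float dw, float dh,
              const float weights[4], float image_width, float image_height) {
    const float w = proposal.x2 - proposal.x1;
    const float h = proposal.y2 - proposal.y1;
    const float ctr_x = proposal.x1 + 0.5f * w;
    const float ctr_y = proposal.y1 + 0.5f * h;

    // Apply the regression deltas, scaled by the configured weights.
    const float pred_ctr_x = (dx / weights[0]) * w + ctr_x;
    const float pred_ctr_y = (dy / weights[1]) * h + ctr_y;
    const float pred_w = std::exp(dw / weights[2]) * w;
    const float pred_h = std::exp(dh / weights[3]) * h;

    // Convert back to corners and clip to the image.
    return Box{
        std::max(0.0f, pred_ctr_x - 0.5f * pred_w),
        std::max(0.0f, pred_ctr_y - 0.5f * pred_h),
        std::min(pred_ctr_x + 0.5f * pred_w, image_width),
        std::min(pred_ctr_y + 0.5f * pred_h, image_height)};
}
```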

rcnn/PredictorDecodePlugin.h: +27 -17

```diff
@@ -14,8 +14,10 @@ using namespace nvinfer1;
 namespace nvinfer1 {
 
 int predictorDecode(int batchSize,
-    const void *const *inputs, void **outputs, unsigned int num_boxes, unsigned int num_classes, unsigned int image_height,
-    unsigned int image_width, const std::vector<float>& bbox_reg_weights, void *workspace, size_t workspace_size, cudaStream_t stream);
+    const void *const *inputs, void **outputs, unsigned int num_boxes,
+    unsigned int num_classes, unsigned int image_height,
+    unsigned int image_width, const std::vector<float>& bbox_reg_weights,
+    void *workspace, size_t workspace_size, cudaStream_t stream);
 
 /*
 input1: scores{N,C,1,1} N->nums C->num of classes
@@ -34,7 +36,7 @@ class PredictorDecodePlugin : public IPluginV2Ext {
     std::vector<float> _bbox_reg_weights;
     mutable int size = -1;
 
-protected:
+ protected:
     void deserialize(void const* data, size_t length) {
         const char* d = static_cast<const char*>(data);
         read(d, _num_boxes);
@@ -51,7 +53,9 @@ class PredictorDecodePlugin : public IPluginV2Ext {
     }
 
     size_t getSerializationSize() const override {
-        return sizeof(_num_boxes) + sizeof(_num_classes) + sizeof(_image_height) + sizeof(_image_width) + sizeof(size_t) + sizeof(float)*_bbox_reg_weights.size();
+        return sizeof(_num_boxes) + sizeof(_num_classes) +
+            sizeof(_image_height) + sizeof(_image_width) + sizeof(size_t) +
+            sizeof(float)*_bbox_reg_weights.size();
     }
 
     void serialize(void *buffer) const override {
@@ -66,14 +70,18 @@ class PredictorDecodePlugin : public IPluginV2Ext {
         }
     }
 
-public:
-    PredictorDecodePlugin(unsigned int num_boxes, unsigned int image_height, unsigned int image_width, std::vector<float> const& bbox_reg_weights)
-        : _num_boxes(num_boxes), _image_height(image_height), _image_width(image_width), _bbox_reg_weights(bbox_reg_weights) {}
+ public:
+    PredictorDecodePlugin(unsigned int num_boxes, unsigned int image_height,
+        unsigned int image_width, std::vector<float> const& bbox_reg_weights)
+        : _num_boxes(num_boxes), _image_height(image_height),
+          _image_width(image_width), _bbox_reg_weights(bbox_reg_weights) {}
 
-    PredictorDecodePlugin(unsigned int num_boxes, unsigned int num_classes, unsigned int image_height, unsigned int image_width,
-        std::vector<float> const& bbox_reg_weights)
-        : _num_boxes(num_boxes), _num_classes(num_classes), _image_height(image_height), _image_width(image_width),
-          _bbox_reg_weights(bbox_reg_weights) {}
+    PredictorDecodePlugin(unsigned int num_boxes, unsigned int num_classes,
+        unsigned int image_height, unsigned int image_width,
+        std::vector<float> const& bbox_reg_weights)
+        : _num_boxes(num_boxes), _num_classes(num_classes),
+          _image_height(image_height), _image_width(image_width),
+          _bbox_reg_weights(bbox_reg_weights) {}
 
     PredictorDecodePlugin(void const* data, size_t length) {
         this->deserialize(data, length);
@@ -108,17 +116,19 @@ class PredictorDecodePlugin : public IPluginV2Ext {
 
     size_t getWorkspaceSize(int maxBatchSize) const override {
         if (size < 0) {
-            size = predictorDecode(maxBatchSize, nullptr, nullptr, _num_boxes, _num_classes, _image_height, _image_width, _bbox_reg_weights,
-                nullptr, 0, nullptr);
+            size = predictorDecode(maxBatchSize, nullptr, nullptr,
+                _num_boxes, _num_classes, _image_height, _image_width,
+                _bbox_reg_weights, nullptr, 0, nullptr);
         }
         return size;
     }
 
     int enqueue(int batchSize,
         const void *const *inputs, void **outputs,
         void *workspace, cudaStream_t stream) override {
-        return predictorDecode(batchSize, inputs, outputs, _num_boxes, _num_classes, _image_height, _image_width, _bbox_reg_weights,
-            workspace, getWorkspaceSize(batchSize), stream);
+        return predictorDecode(batchSize, inputs, outputs, _num_boxes,
+            _num_classes, _image_height, _image_width, _bbox_reg_weights,
+            workspace, getWorkspaceSize(batchSize), stream);
     }
 
     void destroy() override {
@@ -166,7 +176,7 @@ class PredictorDecodePlugin : public IPluginV2Ext {
         return new PredictorDecodePlugin(_num_boxes, _num_classes, _image_height, _image_width, _bbox_reg_weights);
     }
 
-private:
+ private:
     template<typename T> void write(char*& buffer, const T& val) const {
         *reinterpret_cast<T*>(buffer) = val;
         buffer += sizeof(T);
@@ -179,7 +189,7 @@ class PredictorDecodePlugin : public IPluginV2Ext {
 };
 
 class PredictorDecodePluginCreator : public IPluginCreator {
-public:
+ public:
     PredictorDecodePluginCreator() {}
 
     const char *getPluginName() const override {
```

rcnn/README.md: +43 -3

````diff
@@ -4,15 +4,19 @@ The Pytorch implementation is [facebookresearch/detectron2](https://github.com/f
 
 ## Models
 
-- [x] Faster R-CNN(R50-C4)
+- [x] Faster R-CNN(C4)
 
 - [ ] Mask R-CNN(R50-C4)
 
 ## Test Environment
 
-- GTX2080Ti / Ubuntu16.04 / cuda10.2 / cudnn8.0.4 / TensorRT7.0.0 / OpenCV4.2
+- GTX2080Ti / Ubuntu16.04 / cuda10.2 / cudnn8.0.4 / TensorRT7.2.1 / OpenCV4.2
 - GTX2080Ti / win10 / cuda10.2 / cudnn8.0.4 / TensorRT7.2.1 / OpenCV4.2 / VS2017 (need to replace function corresponding to the dirent.h and add "--extended-lambda" in CUDA C/C++ -> Command Line -> Other options)
 
+TensorRT7.2 is recommended because the Resize layer in 7.0 with kLINEAR mode is slightly different from OpenCV. You can also implement the data preprocessing outside TensorRT if you want to use TensorRT 7.0 or an earlier version.
+
+**The fp32 result matches pytorch to about 4 decimal places!**
+
 ## How to Run
 
 1. generate .wts from pytorch with .pkl or .pth
@@ -48,6 +52,40 @@ sudo ./rcnn -d faster.engine ../samples
 
 3. check the images generated, as follows. _zidane.jpg and _bus.jpg
 
+## Backbone
+
+#### R18, R34, R152
+
+```
+1. download pretrained model
+   R18: https://download.pytorch.org/models/resnet18-f37072fd.pth
+   R34: https://download.pytorch.org/models/resnet34-b627a593.pth
+   R152: https://download.pytorch.org/models/resnet152-394f9c45.pth
+2. convert pth to pkl by facebookresearch/detectron2/tools/convert-torchvision-to-d2.py
+3. set cfg.MODEL.RESNETS.DEPTH = 18(34,152),
+   cfg.MODEL.RESNETS.STRIDE_IN_1X1 = False,
+   cfg.MODEL.RESNETS.RES2_OUT_CHANNELS = 64, // for R18, R34
+   cfg.MODEL.PIXEL_MEAN = [123.675, 116.280, 103.530],
+   cfg.MODEL.PIXEL_STD = [58.395, 57.120, 57.375],
+   cfg.INPUT.FORMAT = "RGB"
+   and then train your own model
+4. set BACKBONE_RESNETTYPE = R18(R34, R152) in rcnn.cpp line 13
+5. modify PIXEL_MEAN and PIXEL_STD in rcnn.cpp
+6. set res2_out_channels=64 in BuildResNet in rcnn.cpp line 239 // for R18, R34
+7. generate the wts file from your own model and build your engine, refer to How to Run
+8. convert your image to RGB before inference
+```
+
+#### R50, R101
+
+```
+1. download pretrained model
+   R50: https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/model_final_721ade.pkl
+   R101: https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_C4_3x/138204752/model_final_298dad.pkl
+2. set BACKBONE_RESNETTYPE = R50(R101) in rcnn.cpp line 13
+3. follow How to Run
+```
+
 ## NOTE
 
 - if you meet the error below, just try to make again. The flag has been added in CMakeLists.txt
@@ -66,10 +104,12 @@ sudo ./rcnn -d faster.engine ../samples
 
 1. quantizationType:fp32,fp16,int8. see BuildRcnnModel(rcnn.cpp line 276) for detail.
 
-2. the using of int8 is same with [tensorrtx/yolov5](../yolov5/README.md), but it has no improvement comparing to fp16.
+2. the usage of int8 is the same as with [tensorrtx/yolov5](../yolov5/README.md), but it shows no improvement compared to fp16.
 
 ## Plugins
 
+The decode and nms plugins are modified from [retinanet-examples](https://github.com/NVIDIA/retinanet-examples/tree/master/csrc/plugins)
+
 - RpnDecodePlugin: calculate coordinates of proposals which is the first n
 
 ```
````
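The new README note recommends TensorRT 7.2 because 7.0's kLINEAR Resize differs slightly from OpenCV, and suggests moving preprocessing to the host for older versions. A minimal OpenCV sketch of that resize-and-normalize step, assuming RGB channel order, a CHW float output layout, and the PIXEL_MEAN/PIXEL_STD values quoted in the backbone instructions; the function name and layout are illustrative, not the repo's actual API:

```cpp
#include <opencv2/opencv.hpp>
#include <vector>

static const float PIXEL_MEAN[3] = {123.675f, 116.280f, 103.530f};  // RGB, from the backbone section
static const float PIXEL_STD[3]  = {58.395f, 57.120f, 57.375f};

// BGR frame in, normalized CHW float tensor out.
std::vector<float> preprocess(const cv::Mat& bgr, int input_w, int input_h) {
    cv::Mat rgb, resized;
    cv::cvtColor(bgr, rgb, cv::COLOR_BGR2RGB);  // README step 8: feed RGB
    cv::resize(rgb, resized, cv::Size(input_w, input_h), 0, 0,
               cv::INTER_LINEAR);  // bilinear, the behavior kLINEAR approximates

    // HWC uint8 -> CHW float, normalized per channel: (x - mean) / std.
    std::vector<float> chw(3 * input_h * input_w);
    for (int c = 0; c < 3; ++c) {
        for (int y = 0; y < input_h; ++y) {
            for (int x = 0; x < input_w; ++x) {
                chw[(c * input_h + y) * input_w + x] =
                    (resized.at<cv::Vec3b>(y, x)[c] - PIXEL_MEAN[c]) / PIXEL_STD[c];
            }
        }
    }
    return chw;
}
```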
