
Commit 5d6b09c

update rcnn (wang-xinyu#495)
- add pixel_std in preprocess
- fix synchronization error in roialign and cudaMemcpyAsync
- improve coding style, limit line length to less than 120, and so on
- update README.md
- upgrade TensorRT to 7.2
1 parent 8cfc8ee commit 5d6b09c

17 files changed: +363 -197 lines changed

rcnn/BatchedNms.cu: +20 -16

```diff
@@ -1,19 +1,17 @@
-#include "BatchedNmsPlugin.h"
-#include "cuda_utils.h"
-
-#include <algorithm>
-#include <iostream>
-#include <stdexcept>
-#include <cstdint>
-#include <vector>
-#include <cmath>
-
 #include <cuda.h>
 #include <thrust/device_ptr.h>
 #include <thrust/sequence.h>
 #include <thrust/execution_policy.h>
 #include <thrust/gather.h>
 #include <thrust/system/cuda/detail/cub/device/device_radix_sort.cuh>
+#include <cmath>
+#include <algorithm>
+#include <iostream>
+#include <stdexcept>
+#include <cstdint>
+#include <vector>
+#include "BatchedNmsPlugin.h"
+#include "./cuda_utils.h"
 
 namespace nvinfer1 {
 
@@ -64,11 +62,15 @@ int batchedNms(int batch_size,
     // Return required scratch space size cub style
     workspace_size += get_size_aligned<int>(count);  // indices
     workspace_size += get_size_aligned<int>(count);  // indices_sorted
-    workspace_size += get_size_aligned<float>(count); // scores_sorted
+    workspace_size += get_size_aligned<float>(count);  // scores_sorted
 
     size_t temp_size_sort = 0;
-    thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending((void *)nullptr, temp_size_sort,
-      (float *)nullptr, (float *)nullptr, (int *)nullptr, (int *)nullptr, count);
+    thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending(
+      static_cast<void*>(nullptr), temp_size_sort,
+      static_cast<float*>(nullptr),
+      static_cast<float*>(nullptr),
+      static_cast<int*>(nullptr),
+      static_cast<int*>(nullptr), count);
     workspace_size += temp_size_sort;
 
     return workspace_size;
@@ -101,17 +103,19 @@ int batchedNms(int batch_size,
     // Launch actual NMS kernel - 1 block with each thread handling n detections
     // TODO: different device has differnet max threads
     const int max_threads = 1024;
-    int num_per_thread = ceil((float)num_detections / max_threads);
+    int num_per_thread = ceil(static_cast<float>(num_detections) / max_threads);
     batched_nms_kernel << <1, max_threads, 0, stream >> > (num_per_thread, nms_thresh, num_detections,
       indices_sorted, scores_sorted, in_classes, in_boxes);
 
     // Re-sort with updated scores
     thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
-      scores_sorted, scores_sorted, indices_sorted, indices, num_detections, 0, sizeof(*scores_sorted) * 8, stream);
+      scores_sorted, scores_sorted, indices_sorted, indices,
+      num_detections, 0, sizeof(*scores_sorted) * 8, stream);
 
     // Gather filtered scores, boxes, classes
     num_detections = min(detections_per_im, num_detections);
-    cudaMemcpyAsync(out_scores, scores_sorted, num_detections * sizeof *scores_sorted, cudaMemcpyDeviceToDevice, stream);
+    cudaMemcpyAsync(out_scores, scores_sorted, num_detections * sizeof *scores_sorted,
+      cudaMemcpyDeviceToDevice, stream);
     if (num_detections < detections_per_im) {
       thrust::fill_n(on_stream, out_scores + num_detections, detections_per_im - num_detections, 0);
     }
```
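The null-pointer call in `batchedNms` is deliberate: CUB's radix sort is a two-pass API, where a first call with a null temp buffer only reports the required scratch size, and a second call with real storage does the sort. A minimal standalone sketch of that pattern, assuming the same thrust-bundled cub headers the plugin uses (the helper name is hypothetical):

```cpp
#include <cuda_runtime.h>
#include <thrust/system/cuda/detail/cub/device/device_radix_sort.cuh>

// Sort (score, index) pairs in descending score order on a stream.
void sortScoresDescending(const float* scores_in, float* scores_out,
                          const int* idx_in, int* idx_out,
                          int count, cudaStream_t stream) {
    // Pass 1: null temp storage -> CUB only writes the required byte count.
    size_t temp_bytes = 0;
    thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending(
        static_cast<void*>(nullptr), temp_bytes,
        scores_in, scores_out, idx_in, idx_out, count,
        0, sizeof(float) * 8, stream);

    void* temp = nullptr;
    cudaMalloc(&temp, temp_bytes);  // the plugin carves this out of its TensorRT workspace instead

    // Pass 2: real temp storage -> the sort is enqueued on `stream`.
    thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending(
        temp, temp_bytes,
        scores_in, scores_out, idx_in, idx_out, count,
        0, sizeof(float) * 8, stream);

    cudaStreamSynchronize(stream);  // ensure the sort finished before freeing temp
    cudaFree(temp);
}
```

The "Return required scratch space size cub style" branch above uses pass 1 alone, which is why it can add `temp_size_sort` into `workspace_size` without ever touching the GPU.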

rcnn/BatchedNmsPlugin.h: +4 -4

```diff
@@ -32,7 +32,7 @@ class BatchedNmsPlugin : public IPluginV2Ext {
 
     size_t _count;
 
-protected:
+ protected:
     void deserialize(void const* data, size_t length) {
         const char* d = static_cast<const char*>(data);
         read(d, _nms_thresh);
@@ -52,7 +52,7 @@ class BatchedNmsPlugin : public IPluginV2Ext {
         write(d, _count);
     }
 
-public:
+ public:
     BatchedNmsPlugin(float nms_thresh, int detections_per_im)
         : _nms_thresh(nms_thresh), _detections_per_im(detections_per_im) {
         assert(nms_thresh > 0);
@@ -154,7 +154,7 @@ class BatchedNmsPlugin : public IPluginV2Ext {
         return new BatchedNmsPlugin(_nms_thresh, _detections_per_im, _count);
     }
 
-private:
+ private:
     template<typename T> void write(char*& buffer, const T& val) const {
         *reinterpret_cast<T*>(buffer) = val;
         buffer += sizeof(T);
@@ -167,7 +167,7 @@ class BatchedNmsPlugin : public IPluginV2Ext {
 };
 
 class BatchedNmsPluginCreator : public IPluginCreator {
-public:
+ public:
     BatchedNmsPluginCreator() {}
 
     const char *getPluginNamespace() const override {
```
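The `write`/`read` templates being re-indented here implement the cursor-based serialization these plugins share: each POD field is copied at a moving buffer cursor, the cursor advances by `sizeof(T)`, and `getSerializationSize()` must return exactly the number of bytes the cursor moves. A self-contained sketch of that scheme; the `NmsParams` struct is a hypothetical stand-in for the plugin's members, and it uses `memcpy` where the plugin uses `reinterpret_cast` assignment:

```cpp
#include <cstddef>
#include <cstring>

// Copy a POD value at the cursor, then advance the cursor.
template <typename T>
void write(char*& buffer, const T& val) {
    std::memcpy(buffer, &val, sizeof(T));
    buffer += sizeof(T);
}

template <typename T>
void read(const char*& buffer, T& val) {
    std::memcpy(&val, buffer, sizeof(T));
    buffer += sizeof(T);
}

struct NmsParams {  // hypothetical stand-in for the plugin's fields
    float nms_thresh;
    int detections_per_im;

    size_t serializationSize() const {
        return sizeof(nms_thresh) + sizeof(detections_per_im);
    }
    void serialize(void* data) const {
        char* d = static_cast<char*>(data);
        write(d, nms_thresh);
        write(d, detections_per_im);
    }
    void deserialize(const void* data) {
        const char* d = static_cast<const char*>(data);
        read(d, nms_thresh);
        read(d, detections_per_im);
    }
};
```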

rcnn/CMakeLists.txt: +2 -2

```diff
@@ -18,8 +18,8 @@ include_directories(${PROJECT_SOURCE_DIR}/include)
 include_directories(/usr/local/cuda-10.2/include)
 link_directories(/usr/local/cuda-10.2/lib64)
 # tensorrt
-include_directories(/home/jushi/TensorRT-7.0.0.11/include)
-link_directories(/home/jushi/TensorRT-7.0.0.11/lib)
+include_directories(/home/jushi/TensorRT-7.2.1.6/include)
+link_directories(/home/jushi/TensorRT-7.2.1.6/lib)
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
```

rcnn/PredictorDecode.cu: +23 -15

```diff
@@ -1,22 +1,23 @@
-#include "PredictorDecodePlugin.h"
-#include "cuda_utils.h"
-
-#include <algorithm>
-#include <cstdint>
-
 #include <thrust/device_ptr.h>
 #include <thrust/sequence.h>
 #include <thrust/execution_policy.h>
 #include <thrust/gather.h>
 #include <thrust/system/cuda/detail/cub/device/device_radix_sort.cuh>
 #include <thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh>
 
-namespace nvinfer1 {
+#include <algorithm>
+#include <cstdint>
 
-int predictorDecode(int batchSize, const void *const *inputs, void **outputs, unsigned int num_boxes, unsigned int num_classes,
-    unsigned int image_height, unsigned int image_width, const std::vector<float>& bbox_reg_weights, void *workspace, size_t workspace_size,
-    cudaStream_t stream) {
+#include "PredictorDecodePlugin.h"
+#include "./cuda_utils.h"
+
+namespace nvinfer1 {
 
+int predictorDecode(int batchSize, const void *const *inputs,
+    void **outputs, unsigned int num_boxes, unsigned int num_classes,
+    unsigned int image_height, unsigned int image_width,
+    const std::vector<float>& bbox_reg_weights, void *workspace,
+    size_t workspace_size, cudaStream_t stream) {
     int scores_size = num_boxes * num_classes;
 
     if (!workspace || !workspace_size) {
@@ -27,15 +28,22 @@ int predictorDecode(int batchSize, const void *const *inputs, void **outputs, un
         workspace_size += get_size_aligned<float>(scores_size);  // scores_sorted
 
         size_t temp_size_sort = 0;
-        thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending((void *)nullptr, temp_size_sort,
-          (float *)nullptr, (float *)nullptr, (int *)nullptr, (int *)nullptr, scores_size);
+        thrust::cuda_cub::cub::DeviceRadixSort::SortPairsDescending(
+          static_cast<void*>(nullptr), temp_size_sort,
+          static_cast<float*>(nullptr),
+          static_cast<float*>(nullptr),
+          static_cast<int*>(nullptr),
+          static_cast<int*>(nullptr),
+          scores_size);
         workspace_size += temp_size_sort;
 
         return workspace_size;
     }
 
     auto bbox_reg_weights_d = get_next_ptr<float>(bbox_reg_weights.size(), workspace, workspace_size);
-    cudaMemcpyAsync(bbox_reg_weights_d, bbox_reg_weights.data(), bbox_reg_weights.size() * sizeof *bbox_reg_weights_d, cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(bbox_reg_weights_d, bbox_reg_weights.data(),
+        bbox_reg_weights.size() * sizeof *bbox_reg_weights_d,
+        cudaMemcpyHostToDevice, stream);
 
     auto on_stream = thrust::cuda::par.on(stream);
 
@@ -79,8 +87,8 @@ int predictorDecode(int batchSize, const void *const *inputs, void **outputs, un
             boxes = float4{
                 max(0.0f, pred_ctr_x - 0.5f * pred_w),
                 max(0.0f, pred_ctr_y - 0.5f * pred_h),
-                min(pred_ctr_x + 0.5f * pred_w, (float)image_width),
-                min(pred_ctr_y + 0.5f * pred_h, (float)image_width)
+                min(pred_ctr_x + 0.5f * pred_w, static_cast<float>(image_width)),
+                min(pred_ctr_y + 0.5f * pred_h, static_cast<float>(image_width))
             };
 
             // filter empty boxes
```
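The clamping in the last hunk is the tail end of the standard Faster R-CNN box regression that `predictorDecode` performs: deltas are divided by `bbox_reg_weights`, applied to the proposal's center and size, and the corners are clipped to the image. A hedged scalar sketch of that decode; note that the sketch clamps y against `image_height`, whereas the kernel above uses `image_width` for both coordinates:

```cpp
#include <algorithm>
#include <cmath>

struct Box { float x1, y1, x2, y2; };

// Standard Faster R-CNN delta-to-box decode; weights = {wx, wy, ww, wh}.
Box decodeBox(const Box& proposal, float dx, float dy, float dw, float dh,
              const float weights[4], float image_width, float image_height) {
    const float w = proposal.x2 - proposal.x1;
    const float h = proposal.y2 - proposal.y1;
    const float ctr_x = proposal.x1 + 0.5f * w;
    const float ctr_y = proposal.y1 + 0.5f * h;

    // Apply the regression deltas, scaled by the configured weights.
    const float pred_ctr_x = (dx / weights[0]) * w + ctr_x;
    const float pred_ctr_y = (dy / weights[1]) * h + ctr_y;
    const float pred_w = std::exp(dw / weights[2]) * w;
    const float pred_h = std::exp(dh / weights[3]) * h;

    // Convert back to corners and clip to the image.
    return Box{
        std::max(0.0f, pred_ctr_x - 0.5f * pred_w),
        std::max(0.0f, pred_ctr_y - 0.5f * pred_h),
        std::min(pred_ctr_x + 0.5f * pred_w, image_width),
        std::min(pred_ctr_y + 0.5f * pred_h, image_height)};
}
```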

rcnn/PredictorDecodePlugin.h: +27 -17

```diff
@@ -14,8 +14,10 @@ using namespace nvinfer1;
 namespace nvinfer1 {
 
 int predictorDecode(int batchSize,
-    const void *const *inputs, void **outputs, unsigned int num_boxes, unsigned int num_classes, unsigned int image_height,
-    unsigned int image_width, const std::vector<float>& bbox_reg_weights, void *workspace, size_t workspace_size, cudaStream_t stream);
+    const void *const *inputs, void **outputs, unsigned int num_boxes,
+    unsigned int num_classes, unsigned int image_height,
+    unsigned int image_width, const std::vector<float>& bbox_reg_weights,
+    void *workspace, size_t workspace_size, cudaStream_t stream);
 
 /*
 input1: scores{N,C,1,1} N->nums C->num of classes
@@ -34,7 +36,7 @@ class PredictorDecodePlugin : public IPluginV2Ext {
     std::vector<float> _bbox_reg_weights;
     mutable int size = -1;
 
-protected:
+ protected:
     void deserialize(void const* data, size_t length) {
         const char* d = static_cast<const char*>(data);
         read(d, _num_boxes);
@@ -51,7 +53,9 @@ class PredictorDecodePlugin : public IPluginV2Ext {
     }
 
     size_t getSerializationSize() const override {
-        return sizeof(_num_boxes) + sizeof(_num_classes) + sizeof(_image_height) + sizeof(_image_width) + sizeof(size_t) + sizeof(float)*_bbox_reg_weights.size();
+        return sizeof(_num_boxes) + sizeof(_num_classes) +
+            sizeof(_image_height) + sizeof(_image_width) + sizeof(size_t) +
+            sizeof(float)*_bbox_reg_weights.size();
     }
 
     void serialize(void *buffer) const override {
@@ -66,14 +70,18 @@ class PredictorDecodePlugin : public IPluginV2Ext {
         }
     }
 
-public:
-    PredictorDecodePlugin(unsigned int num_boxes, unsigned int image_height, unsigned int image_width, std::vector<float> const& bbox_reg_weights)
-        : _num_boxes(num_boxes), _image_height(image_height), _image_width(image_width), _bbox_reg_weights(bbox_reg_weights) {}
+ public:
+    PredictorDecodePlugin(unsigned int num_boxes, unsigned int image_height,
+        unsigned int image_width, std::vector<float> const& bbox_reg_weights)
+        : _num_boxes(num_boxes), _image_height(image_height),
+          _image_width(image_width), _bbox_reg_weights(bbox_reg_weights) {}
 
-    PredictorDecodePlugin(unsigned int num_boxes, unsigned int num_classes, unsigned int image_height, unsigned int image_width,
-        std::vector<float> const& bbox_reg_weights)
-        : _num_boxes(num_boxes), _num_classes(num_classes), _image_height(image_height), _image_width(image_width),
-          _bbox_reg_weights(bbox_reg_weights) {}
+    PredictorDecodePlugin(unsigned int num_boxes, unsigned int num_classes,
+        unsigned int image_height, unsigned int image_width,
+        std::vector<float> const& bbox_reg_weights)
+        : _num_boxes(num_boxes), _num_classes(num_classes),
+          _image_height(image_height), _image_width(image_width),
+          _bbox_reg_weights(bbox_reg_weights) {}
 
     PredictorDecodePlugin(void const* data, size_t length) {
         this->deserialize(data, length);
@@ -108,17 +116,19 @@ class PredictorDecodePlugin : public IPluginV2Ext {
 
     size_t getWorkspaceSize(int maxBatchSize) const override {
         if (size < 0) {
-            size = predictorDecode(maxBatchSize, nullptr, nullptr, _num_boxes, _num_classes, _image_height, _image_width, _bbox_reg_weights,
-                nullptr, 0, nullptr);
+            size = predictorDecode(maxBatchSize, nullptr, nullptr,
+                _num_boxes, _num_classes, _image_height, _image_width,
+                _bbox_reg_weights, nullptr, 0, nullptr);
         }
         return size;
     }
 
     int enqueue(int batchSize,
         const void *const *inputs, void **outputs,
         void *workspace, cudaStream_t stream) override {
-        return predictorDecode(batchSize, inputs, outputs, _num_boxes, _num_classes, _image_height, _image_width, _bbox_reg_weights,
-            workspace, getWorkspaceSize(batchSize), stream);
+        return predictorDecode(batchSize, inputs, outputs, _num_boxes,
+            _num_classes, _image_height, _image_width, _bbox_reg_weights,
+            workspace, getWorkspaceSize(batchSize), stream);
     }
 
     void destroy() override {
@@ -166,7 +176,7 @@ class PredictorDecodePlugin : public IPluginV2Ext {
         return new PredictorDecodePlugin(_num_boxes, _num_classes, _image_height, _image_width, _bbox_reg_weights);
     }
 
-private:
+ private:
     template<typename T> void write(char*& buffer, const T& val) const {
         *reinterpret_cast<T*>(buffer) = val;
         buffer += sizeof(T);
@@ -179,7 +189,7 @@ class PredictorDecodePlugin : public IPluginV2Ext {
 };
 
 class PredictorDecodePluginCreator : public IPluginCreator {
-public:
+ public:
     PredictorDecodePluginCreator() {}
 
     const char *getPluginName() const override {
```

rcnn/README.md: +43 -3

````diff
@@ -4,15 +4,19 @@ The Pytorch implementation is [facebookresearch/detectron2](https://github.com/f
 
 ## Models
 
-- [x] Faster R-CNN(R50-C4)
+- [x] Faster R-CNN(C4)
 
 - [ ] Mask R-CNN(R50-C4)
 
 ## Test Environment
 
-- GTX2080Ti / Ubuntu16.04 / cuda10.2 / cudnn8.0.4 / TensorRT7.0.0 / OpenCV4.2
+- GTX2080Ti / Ubuntu16.04 / cuda10.2 / cudnn8.0.4 / TensorRT7.2.1 / OpenCV4.2
 - GTX2080Ti / win10 / cuda10.2 / cudnn8.0.4 / TensorRT7.2.1 / OpenCV4.2 / VS2017 (need to replace function corresponding to the dirent.h and add "--extended-lambda" in CUDA C/C++ -> Command Line -> Other options)
 
+TensorRT7.2 is recommended because the Resize layer in 7.0 with kLINEAR mode is slightly different from OpenCV. You can also implement the data preprocessing outside TensorRT if you want to use TensorRT 7.0 or an earlier version.
+
+**The fp32 result matches pytorch to about 4 decimal places!**
+
 ## How to Run
 
 1. generate .wts from pytorch with .pkl or .pth
@@ -48,6 +52,40 @@ sudo ./rcnn -d faster.engine ../samples
 
 3. check the images generated, as follows. _zidane.jpg and _bus.jpg
 
+## Backbone
+
+#### R18, R34, R152
+
+```
+1. download pretrained model
+   R18: https://download.pytorch.org/models/resnet18-f37072fd.pth
+   R34: https://download.pytorch.org/models/resnet34-b627a593.pth
+   R152: https://download.pytorch.org/models/resnet152-394f9c45.pth
+2. convert pth to pkl by facebookresearch/detectron2/tools/convert-torchvision-to-d2.py
+3. set cfg.MODEL.RESNETS.DEPTH = 18(34,152),
+   cfg.MODEL.RESNETS.STRIDE_IN_1X1 = False,
+   cfg.MODEL.RESNETS.RES2_OUT_CHANNELS = 64, // for R18, R34
+   cfg.MODEL.PIXEL_MEAN = [123.675, 116.280, 103.530],
+   cfg.MODEL.PIXEL_STD = [58.395, 57.120, 57.375],
+   cfg.INPUT.FORMAT = "RGB"
+   and then train your own model
+4. set BACKBONE_RESNETTYPE = R18(R34, R152) in rcnn.cpp line 13
+5. modify PIXEL_MEAN and PIXEL_STD in rcnn.cpp
+6. set res2_out_channels=64 in BuildResNet in rcnn.cpp line 239 // for R18, R34
+7. generate the wts file from your own model and build your engine, refer to How to Run
+8. convert your image to RGB before inference
+```
+
+#### R50, R101
+
+```
+1. download pretrained model
+   R50: https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/model_final_721ade.pkl
+   R101: https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_C4_3x/138204752/model_final_298dad.pkl
+2. set BACKBONE_RESNETTYPE = R50(R101) in rcnn.cpp line 13
+3. follow How to Run
+```
+
 ## NOTE
 
 - if you meet the error below, just try to make again. The flag has been added in CMakeLists.txt
@@ -66,10 +104,12 @@ sudo ./rcnn -d faster.engine ../samples
 
 1. quantizationType:fp32,fp16,int8. see BuildRcnnModel(rcnn.cpp line 276) for detail.
 
-2. the using of int8 is same with [tensorrtx/yolov5](../yolov5/README.md), but it has no improvement comparing to fp16.
+2. the usage of int8 is the same as with [tensorrtx/yolov5](../yolov5/README.md), but it shows no improvement compared to fp16.
 
 ## Plugins
 
+The decode and nms plugins are modified from [retinanet-examples](https://github.com/NVIDIA/retinanet-examples/tree/master/csrc/plugins)
+
 - RpnDecodePlugin: calculate coordinates of proposals which is the first n
 
 ```
````
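The new README note recommends TensorRT 7.2 because 7.0's kLINEAR Resize differs slightly from OpenCV, and suggests moving preprocessing to the host for older versions. A minimal OpenCV sketch of that resize-and-normalize step, assuming RGB channel order, a CHW float output layout, and the PIXEL_MEAN/PIXEL_STD values quoted in the backbone instructions; the function name and layout are illustrative, not the repo's actual API:

```cpp
#include <opencv2/opencv.hpp>
#include <vector>

static const float PIXEL_MEAN[3] = {123.675f, 116.280f, 103.530f};  // RGB, from the backbone section
static const float PIXEL_STD[3]  = {58.395f, 57.120f, 57.375f};

// BGR frame in, normalized CHW float tensor out.
std::vector<float> preprocess(const cv::Mat& bgr, int input_w, int input_h) {
    cv::Mat rgb, resized;
    cv::cvtColor(bgr, rgb, cv::COLOR_BGR2RGB);  // README step 8: feed RGB
    cv::resize(rgb, resized, cv::Size(input_w, input_h), 0, 0,
               cv::INTER_LINEAR);  // bilinear, the behavior kLINEAR approximates

    // HWC uint8 -> CHW float, normalized per channel: (x - mean) / std.
    std::vector<float> chw(3 * input_h * input_w);
    for (int c = 0; c < 3; ++c) {
        for (int y = 0; y < input_h; ++y) {
            for (int x = 0; x < input_w; ++x) {
                chw[(c * input_h + y) * input_w + x] =
                    (resized.at<cv::Vec3b>(y, x)[c] - PIXEL_MEAN[c]) / PIXEL_STD[c];
            }
        }
    }
    return chw;
}
```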
