Commit 5b94c93

yolov4 code cleanup
1 parent 4089c64 commit 5b94c93

7 files changed: +82 −159 lines

yolov4/CMakeLists.txt (+7 −10)

@@ -13,16 +13,13 @@ find_package(CUDA REQUIRED)
 set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30)
 
 include_directories(${PROJECT_SOURCE_DIR}/include)
-if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
-    message("embed_platform on")
-    include_directories(/usr/local/cuda/targets/aarch64-linux/include)
-    link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
-else()
-    message("embed_platform off")
-    include_directories(/usr/local/cuda/include)
-    link_directories(/usr/local/cuda/lib64)
-endif()
-
+# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
+# cuda
+include_directories(/usr/local/cuda/include)
+link_directories(/usr/local/cuda/lib64)
+# tensorrt
+include_directories(/usr/include/x86_64-linux-gnu/)
+link_directories(/usr/lib/x86_64-linux-gnu/)
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

yolov4/README.md (+20 −24)

@@ -2,39 +2,45 @@
 
 The Pytorch implementation is from [ultralytics/yolov3](https://github.com/ultralytics/yolov3). It can load yolov4.cfg and yolov4.weights(from AlexeyAB/darknet).
 
-Following tricks are used in this yolov4:
+## Config
 
-- Three yololayer are implemented in one plugin to improve speed, codes derived from [lewes6369/TensorRT-Yolov3](https://github.com/lewes6369/TensorRT-Yolov3)
-- Mish activation, implemented in a plugin.
-- Batchnorm layer, implemented by scale layer.
+- Input shape `INPUT_H`, `INPUT_W` defined in yololayer.h
+- Number of classes `CLASS_NUM` defined in yololayer.h
+- FP16/FP32 can be selected by the macro `USE_FP16` in yolov4.cpp
+- GPU id can be selected by the macro `DEVICE` in yolov4.cpp
+- NMS thresh `NMS_THRESH` in yolov4.cpp
+- bbox confidence threshold `BBOX_CONF_THRESH` in yolov4.cpp
+- `BATCH_SIZE` in yolov4.cpp
 
-## Excute:
+## How to run
 
-```
 1. generate yolov4.wts from pytorch implementation with yolov4.cfg and yolov4.weights, or download .wts from model zoo
 
+```
 git clone https://github.com/wang-xinyu/tensorrtx.git
 git clone https://github.com/ultralytics/yolov3.git
 // download yolov4.weights from https://github.com/AlexeyAB/darknet#pre-trained-models
-cd yolov3
-cp ../tensorrtx/yolov4/gen_wts.py .
+cp {tensorrtx}/yolov4/gen_wts.py {ultralytics/yolov3/}
+cd {ultralytics/yolov3/}
 python gen_wts.py yolov4.weights
 // a file 'yolov4.wts' will be generated.
 // the master branch of yolov3 should work, if not, you can checkout be87b41aa2fe59be8e62f4b488052b24ad0bd450
+```
 
-2. put yolov4.wts into ./yolov4, build and run
+2. put yolov4.wts into {tensorrtx}/yolov4, build and run
 
-mv yolov4.wts ../tensorrtx/yolov4/
-cd ../tensorrtx/yolov4
+```
+mv yolov4.wts {tensorrtx}/yolov4/
+cd {tensorrtx}/yolov4
 mkdir build
 cd build
 cmake ..
 make
-sudo ./yolov4 -s // serialize model to plan file i.e. 'yolov4.engine'
-sudo ./yolov4 -d ../../yolov3-spp/samples // deserialize plan file and run inference, the images in samples will be processed.
+sudo ./yolov4 -s  // serialize model to plan file i.e. 'yolov4.engine'
+sudo ./yolov4 -d ../../yolov3-spp/samples  // deserialize plan file and run inference, the images in samples will be processed.
+```
 
 3. check the images generated, as follows. _zidane.jpg and _bus.jpg
-```
 
 <p align="center">
 <img src="https://user-images.githubusercontent.com/15235574/80863728-cbd3a780-8cb0-11ea-8640-7983bb41c354.jpg">
@@ -44,16 +50,6 @@ sudo ./yolov4 -d ../../yolov3-spp/samples // deserialize plan file and run infe
 <img src="https://user-images.githubusercontent.com/15235574/80863730-cfffc500-8cb0-11ea-810e-94d693e71d80.jpg">
 </p>
 
-## Config
-
-- Input shape `INPUT_H`, `INPUT_W` defined in yololayer.h
-- Number of classes `CLASS_NUM` defined in yololayer.h
-- FP16/FP32 can be selected by the macro `USE_FP16` in yolov4.cpp
-- GPU id can be selected by the macro `DEVICE` in yolov4.cpp
-- NMS thresh `NMS_THRESH` in yolov4.cpp
-- bbox confidence threshold `BBOX_CONF_THRESH` in yolov4.cpp
-- `BATCH_SIZE` in yolov4.cpp
-
 ## More Information
 
 See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)
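The Config items in the README map to compile-time settings. As a minimal sketch: `USE_FP16`, `DEVICE` and `NMS_THRESH` appear verbatim in the yolov4.cpp diff below, while the `BBOX_CONF_THRESH` and `BATCH_SIZE` values shown here are assumptions for illustration only.

```
// Compile-time configuration described in the README's Config section.
// USE_FP16, DEVICE and NMS_THRESH match the yolov4.cpp hunks in this commit;
// the BBOX_CONF_THRESH and BATCH_SIZE values are illustrative assumptions.
#define USE_FP16              // comment out to build the engine in FP32
#define DEVICE 0              // GPU id used for engine build and inference
#define NMS_THRESH 0.4        // IoU threshold for non-maximum suppression
#define BBOX_CONF_THRESH 0.5  // assumed: boxes below this confidence are dropped
#define BATCH_SIZE 1          // assumed: batch size baked into the serialized engine
```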

yolov4/Utils.h (−94)

This file was deleted.

yolov4/utils.h (+39)

@@ -0,0 +1,39 @@
+#ifndef __TRT_UTILS_H_
+#define __TRT_UTILS_H_
+
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include <cudnn.h>
+
+#ifndef CUDA_CHECK
+
+#define CUDA_CHECK(callstr) \
+    { \
+        cudaError_t error_code = callstr; \
+        if (error_code != cudaSuccess) { \
+            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
+            assert(0); \
+        } \
+    }
+
+#endif
+
+namespace Tn
+{
+    template<typename T>
+    void write(char*& buffer, const T& val)
+    {
+        *reinterpret_cast<T*>(buffer) = val;
+        buffer += sizeof(T);
+    }
+
+    template<typename T>
+    void read(const char*& buffer, T& val)
+    {
+        val = *reinterpret_cast<const T*>(buffer);
+        buffer += sizeof(T);
+    }
+}
+
+#endif
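The `Tn::write`/`Tn::read` templates above are byte-cursor helpers of the kind a TensorRT plugin's serialize/deserialize path uses. A minimal usage sketch, assuming the CUDA/cuDNN include paths configured in CMakeLists.txt are available; the field names and values here are hypothetical, not taken from the yololayer plugin:

```
// Hypothetical round-trip through a raw byte buffer using the helpers above.
#include "utils.h"

int main() {
    char storage[sizeof(int) + sizeof(float)];

    // Serialize: the char* cursor advances past each field written.
    char* w = storage;
    Tn::write(w, 80);    // hypothetical class count
    Tn::write(w, 0.5f);  // hypothetical confidence threshold

    // Deserialize: a const char* cursor reads the fields back in order.
    const char* r = storage;
    int classes; float thresh;
    Tn::read(r, classes);
    Tn::read(r, thresh);
    return 0;
}
```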

yolov4/yololayer.cu (+2)

@@ -1,4 +1,6 @@
+#include <assert.h>
 #include "yololayer.h"
+#include "utils.h"
 
 using namespace Yolo;
 

yolov4/yololayer.h (+2 −6)

@@ -1,13 +1,9 @@
 #ifndef _YOLO_LAYER_H
 #define _YOLO_LAYER_H
 
-#include <assert.h>
-#include <cmath>
-#include <string.h>
-#include <cublas_v2.h>
-#include "NvInfer.h"
-#include "Utils.h"
 #include <iostream>
+#include <vector>
+#include "NvInfer.h"
 
 namespace Yolo
 {
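Per the README's Config section, the input shape and class count live in this header. A hedged sketch of what the `Yolo` namespace declares; the actual values are not visible in this diff, so those shown are assumptions:

```
// Hedged sketch of the constants the README says are defined in yololayer.h.
// 608x608 input and 80 classes are illustrative assumptions, not taken from
// this commit's diff.
namespace Yolo
{
    static constexpr int CLASS_NUM = 80;  // assumed: class count
    static constexpr int INPUT_H = 608;   // assumed: network input height
    static constexpr int INPUT_W = 608;   // assumed: network input width
}
```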

yolov4/yolov4.cpp (+12 −25)

@@ -7,23 +7,12 @@
 #include <opencv2/opencv.hpp>
 #include <dirent.h>
 #include "NvInfer.h"
-#include "NvInferPlugin.h"
+#include "utils.h"
 #include "cuda_runtime_api.h"
 #include "logging.h"
 #include "yololayer.h"
 #include "mish.h"
 
-#define CHECK(status) \
-    do\
-    {\
-        auto ret = (status);\
-        if (ret != 0)\
-        {\
-            std::cerr << "Cuda failure: " << ret << std::endl;\
-            abort();\
-        }\
-    } while (0)
-
 #define USE_FP16  // comment out this if want to use FP32
 #define DEVICE 0  // GPU id
 #define NMS_THRESH 0.4
@@ -57,7 +46,7 @@ cv::Mat preprocess_img(cv::Mat& img) {
         y = 0;
     }
     cv::Mat re(h, w, CV_8UC3);
-    cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC);
+    cv::resize(img, re, re.size());
     cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128));
     re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
     return out;
@@ -180,7 +169,6 @@ IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, W
     float *mean = (float*)weightMap[lname + ".running_mean"].values;
     float *var = (float*)weightMap[lname + ".running_var"].values;
     int len = weightMap[lname + ".running_var"].count;
-    std::cout << "len " << len << std::endl;
 
     float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
     for (int i = 0; i < len; i++) {
@@ -209,7 +197,6 @@ IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, W
 }
 
 ILayer* convBnMish(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) {
-    std::cout << linx << std::endl;
     Weights emptywts{DataType::kFLOAT, nullptr, 0};
     IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap["module_list." + std::to_string(linx) + ".Conv2d.weight"], emptywts);
     assert(conv1);
@@ -227,7 +214,6 @@ ILayer* convBnMish(INetworkDefinition *network, std::map<std::string, Weights>&
 }
 
 ILayer* convBnLeaky(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) {
-    std::cout << linx << std::endl;
     Weights emptywts{DataType::kFLOAT, nullptr, 0};
     IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap["module_list." + std::to_string(linx) + ".Conv2d.weight"], emptywts);
     assert(conv1);
@@ -489,7 +475,6 @@ ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilder
     auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj);
 
     yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
-    std::cout << "set name out" << std::endl;
     network->markOutput(*yolo->getOutput(0));
 
     // Build engine
@@ -498,8 +483,9 @@
 #ifdef USE_FP16
     config->setFlag(BuilderFlag::kFP16);
 #endif
+    std::cout << "Building tensorrt engine, please wait for a while..." << std::endl;
     ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
-    std::cout << "build out" << std::endl;
+    std::cout << "Build engine successfully!" << std::endl;
 
     // Don't need the network any more
     network->destroy();
@@ -528,6 +514,7 @@ void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
     // Close everything down
     engine->destroy();
     builder->destroy();
+    config->destroy();
 }
 
 void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
@@ -544,23 +531,23 @@ void doInference(IExecutionContext& context, float* input, float* output, int ba
     const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
 
     // Create GPU buffers on device
-    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
-    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
+    CUDA_CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
+    CUDA_CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
 
     // Create stream
     cudaStream_t stream;
-    CHECK(cudaStreamCreate(&stream));
+    CUDA_CHECK(cudaStreamCreate(&stream));
 
     // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
-    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
+    CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
     context.enqueue(batchSize, buffers, stream, nullptr);
-    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
     cudaStreamSynchronize(stream);
 
     // Release stream and buffers
     cudaStreamDestroy(stream);
-    CHECK(cudaFree(buffers[inputIndex]));
-    CHECK(cudaFree(buffers[outputIndex]));
+    CUDA_CHECK(cudaFree(buffers[inputIndex]));
+    CUDA_CHECK(cudaFree(buffers[outputIndex]));
 }
 
 int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
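The addBatchNorm2d hunks above fold batchnorm weights into a TensorRT scale layer (a trick the README previously noted). A minimal sketch of that folding, assuming the standard batchnorm formula; the eps value here is an assumption, since the one actually used in yolov4.cpp is not visible in this diff:

```
// Hedged sketch: fold batchnorm parameters (gamma, beta, mean, var) into
// per-channel scale/shift coefficients, the form consumed by an IScaleLayer.
#include <cmath>
#include <cstddef>
#include <vector>

void foldBatchNorm(const std::vector<float>& gamma, const std::vector<float>& beta,
                   const std::vector<float>& mean, const std::vector<float>& var,
                   std::vector<float>& scale, std::vector<float>& shift,
                   float eps = 1e-5f) {  // assumed eps value
    std::size_t len = var.size();
    scale.resize(len);
    shift.resize(len);
    for (std::size_t i = 0; i < len; i++) {
        // BN(x) = gamma * (x - mean) / sqrt(var + eps) + beta
        //       = scale * x + shift, with:
        scale[i] = gamma[i] / std::sqrt(var[i] + eps);
        shift[i] = beta[i] - mean[i] * scale[i];
    }
}
```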
