Commit 5b94c93

yolov4 code cleanup
1 parent 4089c64 commit 5b94c93

7 files changed: +82 −159 lines

yolov4/CMakeLists.txt (+7 −10)

@@ -13,16 +13,13 @@ find_package(CUDA REQUIRED)
 set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30)
 
 include_directories(${PROJECT_SOURCE_DIR}/include)
-if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
-    message("embed_platform on")
-    include_directories(/usr/local/cuda/targets/aarch64-linux/include)
-    link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
-else()
-    message("embed_platform off")
-    include_directories(/usr/local/cuda/include)
-    link_directories(/usr/local/cuda/lib64)
-endif()
-
+# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
+# cuda
+include_directories(/usr/local/cuda/include)
+link_directories(/usr/local/cuda/lib64)
+# tensorrt
+include_directories(/usr/include/x86_64-linux-gnu/)
+link_directories(/usr/lib/x86_64-linux-gnu/)
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

yolov4/README.md (+20 −24)

@@ -2,39 +2,45 @@
 
 The Pytorch implementation is from [ultralytics/yolov3](https://github.com/ultralytics/yolov3). It can load yolov4.cfg and yolov4.weights(from AlexeyAB/darknet).
 
-Following tricks are used in this yolov4:
+## Config
 
-- Three yololayer are implemented in one plugin to improve speed, codes derived from [lewes6369/TensorRT-Yolov3](https://github.com/lewes6369/TensorRT-Yolov3)
-- Mish activation, implemented in a plugin.
-- Batchnorm layer, implemented by scale layer.
+- Input shape `INPUT_H`, `INPUT_W` defined in yololayer.h
+- Number of classes `CLASS_NUM` defined in yololayer.h
+- FP16/FP32 can be selected by the macro `USE_FP16` in yolov4.cpp
+- GPU id can be selected by the macro `DEVICE` in yolov4.cpp
+- NMS thresh `NMS_THRESH` in yolov4.cpp
+- bbox confidence threshold `BBOX_CONF_THRESH` in yolov4.cpp
+- `BATCH_SIZE` in yolov4.cpp
 
-## Excute:
+## How to run
 
-```
 1. generate yolov4.wts from pytorch implementation with yolov4.cfg and yolov4.weights, or download .wts from model zoo
 
+```
 git clone https://github.com/wang-xinyu/tensorrtx.git
 git clone https://github.com/ultralytics/yolov3.git
 // download yolov4.weights from https://github.com/AlexeyAB/darknet#pre-trained-models
-cd yolov3
-cp ../tensorrtx/yolov4/gen_wts.py .
+cp {tensorrtx}/yolov4/gen_wts.py {ultralytics/yolov3/}
+cd {ultralytics/yolov3/}
 python gen_wts.py yolov4.weights
 // a file 'yolov4.wts' will be generated.
 // the master branch of yolov3 should work, if not, you can checkout be87b41aa2fe59be8e62f4b488052b24ad0bd450
+```
 
-2. put yolov4.wts into ./yolov4, build and run
+2. put yolov4.wts into {tensorrtx}/yolov4, build and run
 
-mv yolov4.wts ../tensorrtx/yolov4/
-cd ../tensorrtx/yolov4
+```
+mv yolov4.wts {tensorrtx}/yolov4/
+cd {tensorrtx}/yolov4
 mkdir build
 cd build
 cmake ..
 make
-sudo ./yolov4 -s // serialize model to plan file i.e. 'yolov4.engine'
-sudo ./yolov4 -d ../../yolov3-spp/samples // deserialize plan file and run inference, the images in samples will be processed.
+sudo ./yolov4 -s  // serialize model to plan file i.e. 'yolov4.engine'
+sudo ./yolov4 -d ../../yolov3-spp/samples  // deserialize plan file and run inference, the images in samples will be processed.
+```
 
 3. check the images generated, as follows. _zidane.jpg and _bus.jpg
-```
 
 <p align="center">
 <img src="https://user-images.githubusercontent.com/15235574/80863728-cbd3a780-8cb0-11ea-8640-7983bb41c354.jpg">
@@ -44,16 +50,6 @@ sudo ./yolov4 -d ../../yolov3-spp/samples // deserialize plan file and run infe
 <img src="https://user-images.githubusercontent.com/15235574/80863730-cfffc500-8cb0-11ea-810e-94d693e71d80.jpg">
 </p>
 
-## Config
-
-- Input shape `INPUT_H`, `INPUT_W` defined in yololayer.h
-- Number of classes `CLASS_NUM` defined in yololayer.h
-- FP16/FP32 can be selected by the macro `USE_FP16` in yolov4.cpp
-- GPU id can be selected by the macro `DEVICE` in yolov4.cpp
-- NMS thresh `NMS_THRESH` in yolov4.cpp
-- bbox confidence threshold `BBOX_CONF_THRESH` in yolov4.cpp
-- `BATCH_SIZE` in yolov4.cpp
-
 ## More Information
 
 See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)
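The Config items in the README map to compile-time settings. As a minimal sketch: `USE_FP16`, `DEVICE` and `NMS_THRESH` appear verbatim in the yolov4.cpp diff below, while the `BBOX_CONF_THRESH` and `BATCH_SIZE` values shown here are assumptions for illustration only.

```
// Compile-time configuration described in the README's Config section.
// USE_FP16, DEVICE and NMS_THRESH match the yolov4.cpp hunks in this commit;
// the BBOX_CONF_THRESH and BATCH_SIZE values are illustrative assumptions.
#define USE_FP16              // comment out to build the engine in FP32
#define DEVICE 0              // GPU id used for engine build and inference
#define NMS_THRESH 0.4        // IoU threshold for non-maximum suppression
#define BBOX_CONF_THRESH 0.5  // assumed: boxes below this confidence are dropped
#define BATCH_SIZE 1          // assumed: batch size baked into the serialized engine
```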

yolov4/Utils.h (−94)

This file was deleted.

yolov4/utils.h (+39)

@@ -0,0 +1,39 @@
+#ifndef __TRT_UTILS_H_
+#define __TRT_UTILS_H_
+
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include <cudnn.h>
+
+#ifndef CUDA_CHECK
+
+#define CUDA_CHECK(callstr) \
+    { \
+        cudaError_t error_code = callstr; \
+        if (error_code != cudaSuccess) { \
+            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
+            assert(0); \
+        } \
+    }
+
+#endif
+
+namespace Tn
+{
+    template<typename T>
+    void write(char*& buffer, const T& val)
+    {
+        *reinterpret_cast<T*>(buffer) = val;
+        buffer += sizeof(T);
+    }
+
+    template<typename T>
+    void read(const char*& buffer, T& val)
+    {
+        val = *reinterpret_cast<const T*>(buffer);
+        buffer += sizeof(T);
+    }
+}
+
+#endif
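The `Tn::write`/`Tn::read` templates above are byte-cursor helpers of the kind a TensorRT plugin's serialize/deserialize path uses. A minimal usage sketch, assuming the CUDA/cuDNN include paths configured in CMakeLists.txt are available; the field names and values here are hypothetical, not taken from the yololayer plugin:

```
// Hypothetical round-trip through a raw byte buffer using the helpers above.
#include "utils.h"

int main() {
    char storage[sizeof(int) + sizeof(float)];

    // Serialize: the char* cursor advances past each field written.
    char* w = storage;
    Tn::write(w, 80);    // hypothetical class count
    Tn::write(w, 0.5f);  // hypothetical confidence threshold

    // Deserialize: a const char* cursor reads the fields back in order.
    const char* r = storage;
    int classes; float thresh;
    Tn::read(r, classes);
    Tn::read(r, thresh);
    return 0;
}
```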

yolov4/yololayer.cu (+2)

@@ -1,4 +1,6 @@
+#include <assert.h>
 #include "yololayer.h"
+#include "utils.h"
 
 using namespace Yolo;
 

yolov4/yololayer.h (+2 −6)

@@ -1,13 +1,9 @@
 #ifndef _YOLO_LAYER_H
 #define _YOLO_LAYER_H
 
-#include <assert.h>
-#include <cmath>
-#include <string.h>
-#include <cublas_v2.h>
-#include "NvInfer.h"
-#include "Utils.h"
 #include <iostream>
+#include <vector>
+#include "NvInfer.h"
 
 namespace Yolo
 {
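Per the README's Config section, the input shape and class count live in this header. A hedged sketch of what the `Yolo` namespace declares; the actual values are not visible in this diff, so those shown are assumptions:

```
// Hedged sketch of the constants the README says are defined in yololayer.h.
// 608x608 input and 80 classes are illustrative assumptions, not taken from
// this commit's diff.
namespace Yolo
{
    static constexpr int CLASS_NUM = 80;  // assumed: class count
    static constexpr int INPUT_H = 608;   // assumed: network input height
    static constexpr int INPUT_W = 608;   // assumed: network input width
}
```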

yolov4/yolov4.cpp (+12 −25)

@@ -7,23 +7,12 @@
 #include <opencv2/opencv.hpp>
 #include <dirent.h>
 #include "NvInfer.h"
-#include "NvInferPlugin.h"
+#include "utils.h"
 #include "cuda_runtime_api.h"
 #include "logging.h"
 #include "yololayer.h"
 #include "mish.h"
 
-#define CHECK(status) \
-    do\
-    {\
-        auto ret = (status);\
-        if (ret != 0)\
-        {\
-            std::cerr << "Cuda failure: " << ret << std::endl;\
-            abort();\
-        }\
-    } while (0)
-
 #define USE_FP16  // comment out this if want to use FP32
 #define DEVICE 0  // GPU id
 #define NMS_THRESH 0.4
@@ -57,7 +46,7 @@ cv::Mat preprocess_img(cv::Mat& img) {
         y = 0;
     }
     cv::Mat re(h, w, CV_8UC3);
-    cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC);
+    cv::resize(img, re, re.size());
     cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128));
     re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
     return out;
@@ -180,7 +169,6 @@ IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, W
     float *mean = (float*)weightMap[lname + ".running_mean"].values;
     float *var = (float*)weightMap[lname + ".running_var"].values;
     int len = weightMap[lname + ".running_var"].count;
-    std::cout << "len " << len << std::endl;
 
     float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
     for (int i = 0; i < len; i++) {
@@ -209,7 +197,6 @@ IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, W
 }
 
 ILayer* convBnMish(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) {
-    std::cout << linx << std::endl;
     Weights emptywts{DataType::kFLOAT, nullptr, 0};
     IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap["module_list." + std::to_string(linx) + ".Conv2d.weight"], emptywts);
     assert(conv1);
@@ -227,7 +214,6 @@ ILayer* convBnMish(INetworkDefinition *network, std::map<std::string, Weights>&
 }
 
 ILayer* convBnLeaky(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) {
-    std::cout << linx << std::endl;
     Weights emptywts{DataType::kFLOAT, nullptr, 0};
     IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap["module_list." + std::to_string(linx) + ".Conv2d.weight"], emptywts);
     assert(conv1);
@@ -489,7 +475,6 @@ ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilder
     auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj);
 
     yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
-    std::cout << "set name out" << std::endl;
     network->markOutput(*yolo->getOutput(0));
 
     // Build engine
@@ -498,8 +483,9 @@
 #ifdef USE_FP16
     config->setFlag(BuilderFlag::kFP16);
 #endif
+    std::cout << "Building tensorrt engine, please wait for a while..." << std::endl;
     ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
-    std::cout << "build out" << std::endl;
+    std::cout << "Build engine successfully!" << std::endl;
 
     // Don't need the network any more
     network->destroy();
@@ -528,6 +514,7 @@ void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
     // Close everything down
     engine->destroy();
     builder->destroy();
+    config->destroy();
 }
 
 void doInference(IExecutionContext& context, float* input, float* output, int batchSize) {
@@ -544,23 +531,23 @@ void doInference(IExecutionContext& context, float* input, float* output, int ba
     const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
 
     // Create GPU buffers on device
-    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
-    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
+    CUDA_CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
+    CUDA_CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
 
     // Create stream
     cudaStream_t stream;
-    CHECK(cudaStreamCreate(&stream));
+    CUDA_CHECK(cudaStreamCreate(&stream));
 
     // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
-    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
+    CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
     context.enqueue(batchSize, buffers, stream, nullptr);
-    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
     cudaStreamSynchronize(stream);
 
     // Release stream and buffers
     cudaStreamDestroy(stream);
-    CHECK(cudaFree(buffers[inputIndex]));
-    CHECK(cudaFree(buffers[outputIndex]));
+    CUDA_CHECK(cudaFree(buffers[inputIndex]));
+    CUDA_CHECK(cudaFree(buffers[outputIndex]));
 }
 
 int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
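The addBatchNorm2d hunks above fold batchnorm weights into a TensorRT scale layer (a trick the README previously noted). A minimal sketch of that folding, assuming the standard batchnorm formula; the eps value here is an assumption, since the one actually used in yolov4.cpp is not visible in this diff:

```
// Hedged sketch: fold batchnorm parameters (gamma, beta, mean, var) into
// per-channel scale/shift coefficients, the form consumed by an IScaleLayer.
#include <cmath>
#include <cstddef>
#include <vector>

void foldBatchNorm(const std::vector<float>& gamma, const std::vector<float>& beta,
                   const std::vector<float>& mean, const std::vector<float>& var,
                   std::vector<float>& scale, std::vector<float>& shift,
                   float eps = 1e-5f) {  // assumed eps value
    std::size_t len = var.size();
    scale.resize(len);
    shift.resize(len);
    for (std::size_t i = 0; i < len; i++) {
        // BN(x) = gamma * (x - mean) / sqrt(var + eps) + beta
        //       = scale * x + shift, with:
        scale[i] = gamma[i] / std::sqrt(var[i] + eps);
        shift[i] = beta[i] - mean[i] * scale[i];
    }
}
```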
