yolov8 p2

wang-xinyu · Apr 17, 2024 · 37ddb30 · 37ddb30
1 parent c889b84
commit 37ddb30
Show file tree

Hide file tree

Showing 9 changed files with 696 additions and 245 deletions.
diff --git a/yolov8/README.md b/yolov8/README.md
@@ -22,10 +22,10 @@ The tensorrt code is derived from [xiaocao-tian/yolov8_tensorrt](https://github.
 Currently, we support yolov8
 
 - For yolov8 , download .pt from [https://github.com/ultralytics/assets/releases](https://github.com/ultralytics/assets/releases), then follow how-to-run in current page.
-
+
 ## Config
 
-- Choose the model n/s/m/l/x/n6/s6/m6/l6/x6 from command line arguments.
+- Choose the model n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 from command line arguments.
 - Check more configs in [include/config.h](./include/config.h)
 
 ## How to Run, yolov8n as example
@@ -34,10 +34,13 @@ Currently, we support yolov8
 
 ```
 // download https://github.com/ultralytics/assets/releases/yolov8n.pt
+// download https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt (only for the 10-class p2 model)
 cp {tensorrtx}/yolov8/gen_wts.py {ultralytics}/ultralytics
 cd {ultralytics}/ultralytics
 python gen_wts.py -w yolov8n.pt -o yolov8n.wts -t detect
 // a file 'yolov8n.wts' will be generated.
+python gen_wts.py -w VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt -o VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts -t detect  // only for the 10-class p2 model
+// a file 'VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts' will be generated.
 ```
 
 2. build tensorrtx/yolov8 and run
@@ -51,12 +54,20 @@ cd build
 cp {ultralytics}/ultralytics/yolov8.wts {tensorrtx}/yolov8/build
 cmake ..
 make
-sudo ./yolov8_det -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6]  // serialize model to plan file
+sudo ./yolov8_det -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6]  // serialize model to plan file
 sudo ./yolov8_det -d [.engine] [image folder]  [c/g] // deserialize and run inference, the images in [image folder] will be processed.
 // For example yolov8
 sudo ./yolov8_det -s yolov8n.wts yolov8.engine n
 sudo ./yolov8_det -d yolov8n.engine ../images c //cpu postprocess
 sudo ./yolov8_det -d yolov8n.engine ../images g //gpu postprocess
+
+// For the p2 model:
+// set "const static int kNumClass" in config.h to 10 before building (the example p2 model has 10 classes)
+sudo ./yolov8_det -s VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine x2
+wget https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last/0000008_01999_d_0000040.jpg
+cp -r 0000008_01999_d_0000040.jpg ../images
+sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images c //cpu postprocess
+sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images g //gpu postprocess
 ```
 
 ### Instance Segmentation

diff --git a/yolov8/include/block.h b/yolov8/include/block.h
@@ -26,4 +26,5 @@ nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std
                              nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname);
 
 nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
-                                       std::vector<nvinfer1::IConcatenationLayer*> dets, bool is_segmentation = false);
+                                       std::vector<nvinfer1::IConcatenationLayer*> dets, const int* px_arry,
+                                       int px_arry_num, bool is_segmentation);
diff --git a/yolov8/include/config.h b/yolov8/include/config.h
@@ -2,9 +2,9 @@
 //#define USE_FP32
 //#define USE_INT8
 
-const static char *kInputTensorName = "images";
-const static char *kOutputTensorName = "output";
-const static int kNumClass = 80;
+const static char* kInputTensorName = "images";
+const static char* kOutputTensorName = "output";
+const static int kNumClass = 10;
 const static int kBatchSize = 1;
 const static int kGpuId = 0;
 const static int kInputH = 640;
@@ -14,7 +14,6 @@ const static float kConfThresh = 0.5f;
 const static int kMaxInputImageSize = 3000 * 3000;
 const static int kMaxNumOutputBbox = 1000;
 
-
 // Classfication model's number of classes
 constexpr static int kClsNumClass = 1000;
 // Classfication model's input shape

diff --git a/yolov8/include/model.h b/yolov8/include/model.h
@@ -11,6 +11,10 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf
                                               nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
                                               int& max_channels);
 
+nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
+                                              nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
+                                              int& max_channels);
+
 nvinfer1::IHostMemory* buildEngineYolov8Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw);
 

diff --git a/yolov8/plugin/yololayer.cu b/yolov8/plugin/yololayer.cu
@@ -1,45 +1,58 @@
-#include "yololayer.h"
-#include "types.h"
 #include <assert.h>
 #include <math.h>
-#include "cuda_utils.h"
-#include <vector>
 #include <iostream>
+#include <vector>
+#include "cuda_utils.h"
+#include "types.h"
+#include "yololayer.h"
 
 namespace Tn {
-    template<typename T>
-    void write(char*& buffer, const T& val) {
-        *reinterpret_cast<T*>(buffer) = val;
-        buffer += sizeof(T);
-    }
+template <typename T>
+void write(char*& buffer, const T& val) {
+    *reinterpret_cast<T*>(buffer) = val;
+    buffer += sizeof(T);
+}
 
-    template<typename T>
-    void read(const char*& buffer, T& val) {
-        val = *reinterpret_cast<const T*>(buffer);
-        buffer += sizeof(T);
-    }
+template <typename T>
+void read(const char*& buffer, T& val) {
+    val = *reinterpret_cast<const T*>(buffer);
+    buffer += sizeof(T);
+}
 }  // namespace Tn
 
-
 namespace nvinfer1 {
-YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation) {
+YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation,
+                                 const int* strides, int stridesLength) {
     mClassCount = classCount;
     mYoloV8NetWidth = netWidth;
     mYoloV8netHeight = netHeight;
     mMaxOutObject = maxOut;
+    mStridesLength = stridesLength;
+    mStrides = new int[stridesLength];
+    memcpy(mStrides, strides, stridesLength * sizeof(int));
     is_segmentation_ = is_segmentation;
 }
 
-YoloLayerPlugin::~YoloLayerPlugin() {}
+YoloLayerPlugin::~YoloLayerPlugin() {
+    if (mStrides != nullptr) {
+        delete[] mStrides;
+        mStrides = nullptr;
+    }
+}
 
 YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
     using namespace Tn;
-    const char* d = reinterpret_cast<const char*>(data), * a = d;
+    const char *d = reinterpret_cast<const char*>(data), *a = d;
     read(d, mClassCount);
     read(d, mThreadCount);
     read(d, mYoloV8NetWidth);
     read(d, mYoloV8netHeight);
     read(d, mMaxOutObject);
+    read(d, mStridesLength);
+    mStrides = new int[mStridesLength];
+    for (int i = 0; i < mStridesLength; ++i) {
+        read(d, mStrides[i]);
+    }
     read(d, is_segmentation_);
 
     assert(d == a + length);
@@ -48,26 +61,32 @@ YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
 void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
 
     using namespace Tn;
-    char* d = static_cast<char*>(buffer), * a = d;
+    char *d = static_cast<char*>(buffer), *a = d;
     write(d, mClassCount);
     write(d, mThreadCount);
     write(d, mYoloV8NetWidth);
     write(d, mYoloV8netHeight);
     write(d, mMaxOutObject);
+    write(d, mStridesLength);
+    for (int i = 0; i < mStridesLength; ++i) {
+        write(d, mStrides[i]);
+    }
     write(d, is_segmentation_);
 
     assert(d == a + getSerializationSize());
 }
 
 size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
-    return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(is_segmentation_);
+    return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) +
+           sizeof(mMaxOutObject) + sizeof(mStridesLength) + sizeof(int) * mStridesLength + sizeof(is_segmentation_);
 }
 
 int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
     return 0;
 }
 
-nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT {
+nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs,
+                                                    int nbInputDims) TRT_NOEXCEPT {
     int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float);
     return nvinfer1::Dims3(total_size + 1, 1, 1);
 }
@@ -80,11 +99,13 @@ const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
     return mPluginNamespace;
 }
 
-nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT {
+nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
+                                                      int nbInputs) const TRT_NOEXCEPT {
     return nvinfer1::DataType::kFLOAT;
 }
 
-bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT {
+bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted,
+                                                   int nbInputs) const TRT_NOEXCEPT {
 
     return false;
 }
@@ -94,9 +115,11 @@ bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOE
     return false;
 }
 
-void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {};
+void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput,
+                                      nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT{};
 
-void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {};
+void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
+                                      IGpuAllocator* gpuAllocator) TRT_NOEXCEPT{};
 
 void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {}
 
@@ -116,28 +139,33 @@ void YoloLayerPlugin::destroy() TRT_NOEXCEPT {
 
 nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT {
 
-    YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject, is_segmentation_);
+    YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject,
+                                             is_segmentation_, mStrides, mStridesLength);
     p->setPluginNamespace(mPluginNamespace);
     return p;
 }
 
-int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
+int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs,
+                             void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
 
     forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize);
     return 0;
 }
 
+__device__ float Logist(float data) {
+    return 1.0f / (1.0f + expf(-data));
+};
 
-__device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); };
-
-__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject,
-                             const int grid_h, int grid_w, const int stride, int classes, int outputElem, bool is_segmentation) {
+__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h,
+                             int grid_w, const int stride, int classes, int outputElem, bool is_segmentation) {
     int idx = threadIdx.x + blockDim.x * blockIdx.x;
-    if (idx >= numElements) return;
+    if (idx >= numElements)
+        return;
 
     int total_grid = grid_h * grid_w;
     int info_len = 4 + classes;
-    if (is_segmentation) info_len += 32;
+    if (is_segmentation)
+        info_len += 32;
     int batchIdx = idx / total_grid;
     int elemIdx = idx % total_grid;
     const float* curInput = input + batchIdx * total_grid * info_len;
@@ -153,10 +181,12 @@ __global__ void CalDetection(const float* input, float* output, int numElements,
         }
     }
 
-    if (max_cls_prob < 0.1) return;
+    if (max_cls_prob < 0.1)
+        return;
 
     int count = (int)atomicAdd(output + outputIdx, 1);
-    if (count >= maxoutobject) return;
+    if (count >= maxoutobject)
+        return;
     char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection);
     Detection* det = (Detection*)(data);
 
@@ -175,24 +205,33 @@ __global__ void CalDetection(const float* input, float* output, int numElements,
     }
 }
 
-void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,int mYoloV8NetWidth, int batchSize) {
+void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,
+                                 int mYoloV8NetWidth, int batchSize) {
     int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
     cudaMemsetAsync(output, 0, sizeof(float), stream);
     for (int idx = 0; idx < batchSize; ++idx) {
         CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
     }
     int numElem = 0;
-    int grids[3][2] = { {mYoloV8netHeight / 8, mYoloV8NetWidth / 8}, {mYoloV8netHeight / 16, mYoloV8NetWidth / 16}, {mYoloV8netHeight / 32, mYoloV8NetWidth / 32} };
-    int strides[] = { 8, 16, 32 };
-    for (unsigned int i = 0; i < 3; i++) {
+
+    const int maxGrids = mStridesLength;
+    int grids[maxGrids][2];
+    for (int i = 0; i < maxGrids; ++i) {
+        grids[i][0] = mYoloV8netHeight / mStrides[i];
+        grids[i][1] = mYoloV8NetWidth / mStrides[i];
+    }
+
+    for (unsigned int i = 0; i < maxGrids; i++) {
         int grid_h = grids[i][0];
         int grid_w = grids[i][1];
-        int stride = strides[i];
+        int stride = mStrides[i];
         numElem = grid_h * grid_w * batchSize;
-        if (numElem < mThreadCount) mThreadCount = numElem;
+        if (numElem < mThreadCount)
+            mThreadCount = numElem;
 
-        CalDetection << <(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream >> >
-            (inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, outputElem, is_segmentation_);
+        CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>(
+                inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, outputElem,
+                is_segmentation_);
     }
 }
 
@@ -219,24 +258,29 @@ const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
 
 IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
     assert(fc->nbFields == 1);
-    assert(strcmp(fc->fields[0].name, "netinfo") == 0);
-    int* p_netinfo = (int*)(fc->fields[0].data);
-    int class_count = p_netinfo[0];
-    int input_w = p_netinfo[1];
-    int input_h = p_netinfo[2];
-    int max_output_object_count = p_netinfo[3];
-    bool is_segmentation = p_netinfo[4];
-    YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, is_segmentation);
+    assert(strcmp(fc->fields[0].name, "combinedInfo") == 0);
+    const int* combinedInfo = static_cast<const int*>(fc->fields[0].data);
+    int netinfo_count = 5;
+    int class_count = combinedInfo[0];
+    int input_w = combinedInfo[1];
+    int input_h = combinedInfo[2];
+    int max_output_object_count = combinedInfo[3];
+    bool is_segmentation = combinedInfo[4];
+    const int* px_arry = combinedInfo + netinfo_count;
+    int px_arry_length = fc->fields[0].length - netinfo_count;
+    YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, is_segmentation,
+                                               px_arry, px_arry_length);
     obj->setPluginNamespace(mNamespace.c_str());
     return obj;
 }
 
-IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT {
+IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData,
+                                                     size_t serialLength) TRT_NOEXCEPT {
     // This object will be deleted when the network is destroyed, which will
     // call YoloLayerPlugin::destroy()
     YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
     obj->setPluginNamespace(mNamespace.c_str());
     return obj;
 }
 
-} // namespace nvinfer1
+}  // namespace nvinfer1