From 37ddb3044f25706fa273aa55f3ddb9ede2f180e0 Mon Sep 17 00:00:00 2001 From: linds Date: Wed, 17 Apr 2024 16:01:35 +0800 Subject: [PATCH 01/10] yolov8 p2 --- yolov8/README.md | 17 +- yolov8/include/block.h | 3 +- yolov8/include/config.h | 7 +- yolov8/include/model.h | 4 + yolov8/plugin/yololayer.cu | 150 ++++++---- yolov8/plugin/yololayer.h | 130 ++++----- yolov8/src/block.cpp | 56 ++-- yolov8/src/model.cpp | 550 ++++++++++++++++++++++++++++++------- yolov8/yolov8_det.cpp | 24 +- 9 files changed, 696 insertions(+), 245 deletions(-) diff --git a/yolov8/README.md b/yolov8/README.md index 4ca978ba..6653e1f8 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -22,10 +22,10 @@ The tensorrt code is derived from [xiaocao-tian/yolov8_tensorrt](https://github. Currently, we support yolov8 - For yolov8 , download .pt from [https://github.com/ultralytics/assets/releases](https://github.com/ultralytics/assets/releases), then follow how-to-run in current page. ## Config -- Choose the model n/s/m/l/x/n6/s6/m6/l6/x6 from command line arguments. +- Choose the model n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 from command line arguments. - Check more configs in [include/config.h](./include/config.h) ## How to Run, yolov8n as example @@ -34,10 +34,13 @@ Currently, we support yolov8 ``` // download https://github.com/ultralytics/assets/releases/yolov8n.pt +// download https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt (only for 10 cls p2 model) cp {tensorrtx}/yolov8/gen_wts.py {ultralytics}/ultralytics cd {ultralytics}/ultralytics python gen_wts.py -w yolov8n.pt -o yolov8n.wts -t detect // a file 'yolov8n.wts' will be generated. +python gen_wts.py -w VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt -o VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts -t detect (only for 10 cls p2 model) +// a file 'VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts' will be generated. ``` 2. build tensorrtx/yolov8 and run ``` cd {tensorrtx}/yolov8/ // update kNumClass in config.h if your model is trained on custom dataset mkdir build cd build cp {ultralytics}/ultralytics/yolov8.wts {tensorrtx}/yolov8/build cmake .. make -sudo ./yolov8_det -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6] // serialize model to plan file +sudo ./yolov8_det -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file sudo ./yolov8_det -d [.engine] [image folder] [c/g] // deserialize and run inference, the images in [image folder] will be processed. 
// For example yolov8 sudo ./yolov8_det -s yolov8n.wts yolov8.engine n sudo ./yolov8_det -d yolov8n.engine ../images c //cpu postprocess sudo ./yolov8_det -d yolov8n.engine ../images g //gpu postprocess + +for p2 model: +change the "const static int kNumClass" in config.h to 10; +sudo ./yolov8_det -s VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine x2 +wget https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last/0000008_01999_d_0000040.jpg +cp -r 0000008_01999_d_0000040.jpg ../images +sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images c //cpu postprocess +sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images g //gpu postprocess ``` ### Instance Segmentation diff --git a/yolov8/include/block.h b/yolov8/include/block.h index 3df49c8c..ed4c6ad1 100644 --- a/yolov8/include/block.h +++ b/yolov8/include/block.h @@ -26,4 +26,5 @@ nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map dets, bool is_segmentation = false); + std::vector dets, const int* px_arry, + int px_arry_num, bool is_segmentation); diff --git a/yolov8/include/config.h b/yolov8/include/config.h index f124d5d9..6af9f1a9 100644 --- a/yolov8/include/config.h +++ b/yolov8/include/config.h @@ -2,9 +2,9 @@ //#define USE_FP32 //#define USE_INT8 -const static char *kInputTensorName = "images"; -const static char *kOutputTensorName = "output"; -const static int kNumClass = 80; +const static char* kInputTensorName = "images"; +const static char* kOutputTensorName = "output"; +const static int kNumClass = 10; const static int kBatchSize = 1; const static int kGpuId = 0; const static int kInputH = 640; @@ -14,7 +14,6 @@ const static float kConfThresh = 0.5f; const static int kMaxInputImageSize = 3000 * 3000; const static int kMaxNumOutputBbox = 1000; - // Classfication model's number of classes constexpr static int kClsNumClass = 1000; // Classfication model's input shape diff --git a/yolov8/include/model.h b/yolov8/include/model.h index c8b30e98..0a2a40a3 100644 --- a/yolov8/include/model.h +++ b/yolov8/include/model.h @@ -11,6 +11,10 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); +nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, + int& max_channels); + nvinfer1::IHostMemory* buildEngineYolov8Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw); diff --git a/yolov8/plugin/yololayer.cu b/yolov8/plugin/yololayer.cu index bdc073cc..9b786de7 100755 --- a/yolov8/plugin/yololayer.cu +++ b/yolov8/plugin/yololayer.cu @@ -1,45 +1,58 @@ -#include "yololayer.h" -#include "types.h" #include #include -#include "cuda_utils.h" -#include #include +#include +#include "cuda_utils.h" +#include "types.h" +#include "yololayer.h" namespace Tn { - template - void write(char*& buffer, const T& val) { - *reinterpret_cast(buffer) = val; - buffer += sizeof(T); - } +template +void write(char*& buffer, const T& val) { + *reinterpret_cast(buffer) = val; + buffer += sizeof(T); +} - template - void read(const char*& buffer, T& val) { - val = *reinterpret_cast(buffer); - buffer += 
sizeof(T); - } +template +void read(const char*& buffer, T& val) { + val = *reinterpret_cast(buffer); + buffer += sizeof(T); +} } // namespace Tn - namespace nvinfer1 { -YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation) { +YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, + const int* strides, int stridesLength) { mClassCount = classCount; mYoloV8NetWidth = netWidth; mYoloV8netHeight = netHeight; mMaxOutObject = maxOut; + mStridesLength = stridesLength; + mStrides = new int[stridesLength]; + memcpy(mStrides, strides, stridesLength * sizeof(int)); is_segmentation_ = is_segmentation; } -YoloLayerPlugin::~YoloLayerPlugin() {} +YoloLayerPlugin::~YoloLayerPlugin() { + if (mStrides != nullptr) { + delete[] mStrides; + mStrides = nullptr; + } +} YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; - const char* d = reinterpret_cast(data), * a = d; + const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mThreadCount); read(d, mYoloV8NetWidth); read(d, mYoloV8netHeight); read(d, mMaxOutObject); + read(d, mStridesLength); + mStrides = new int[mStridesLength]; + for (int i = 0; i < mStridesLength; ++i) { + read(d, mStrides[i]); + } read(d, is_segmentation_); assert(d == a + length); @@ -48,26 +61,32 @@ YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; - char* d = static_cast(buffer), * a = d; + char *d = static_cast(buffer), *a = d; write(d, mClassCount); write(d, mThreadCount); write(d, mYoloV8NetWidth); write(d, mYoloV8netHeight); write(d, mMaxOutObject); + write(d, mStridesLength); + for (int i = 0; i < mStridesLength; ++i) { + write(d, mStrides[i]); + } write(d, is_segmentation_); assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { - return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(is_segmentation_); + return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + + sizeof(mMaxOutObject) + sizeof(mStridesLength) + sizeof(int) * mStridesLength + sizeof(is_segmentation_); } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { return 0; } -nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT { +nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nbInputDims) TRT_NOEXCEPT { int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float); return nvinfer1::Dims3(total_size + 1, 1, 1); } @@ -80,11 +99,13 @@ const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } -nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { +nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT { return nvinfer1::DataType::kFLOAT; } -bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { +bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, + int nbInputs) const TRT_NOEXCEPT { return false; } @@ -94,9 +115,11 @@ bool 
YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOE return false; } -void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {}; +void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, + nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT{}; -void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}; +void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, + IGpuAllocator* gpuAllocator) TRT_NOEXCEPT{}; void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} @@ -116,28 +139,33 @@ void YoloLayerPlugin::destroy() TRT_NOEXCEPT { nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { - YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject, is_segmentation_); + YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject, + is_segmentation_, mStrides, mStridesLength); p->setPluginNamespace(mPluginNamespace); return p; } -int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { +int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs, + void* workspace, cudaStream_t stream) TRT_NOEXCEPT { forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize); return 0; } +__device__ float Logist(float data) { + return 1.0f / (1.0f + expf(-data)); +}; -__device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }; - -__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, - const int grid_h, int grid_w, const int stride, int classes, int outputElem, bool is_segmentation) { +__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h, + int grid_w, const int stride, int classes, int outputElem, bool is_segmentation) { int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx >= numElements) return; + if (idx >= numElements) + return; int total_grid = grid_h * grid_w; int info_len = 4 + classes; - if (is_segmentation) info_len += 32; + if (is_segmentation) + info_len += 32; int batchIdx = idx / total_grid; int elemIdx = idx % total_grid; const float* curInput = input + batchIdx * total_grid * info_len; @@ -153,10 +181,12 @@ __global__ void CalDetection(const float* input, float* output, int numElements, } } - if (max_cls_prob < 0.1) return; + if (max_cls_prob < 0.1) + return; int count = (int)atomicAdd(output + outputIdx, 1); - if (count >= maxoutobject) return; + if (count >= maxoutobject) + return; char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection); Detection* det = (Detection*)(data); @@ -175,24 +205,33 @@ __global__ void CalDetection(const float* input, float* output, int numElements, } } -void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,int mYoloV8NetWidth, int batchSize) { +void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, + int mYoloV8NetWidth, int batchSize) { int outputElem = 1 + mMaxOutObject * sizeof(Detection) / 
sizeof(float); cudaMemsetAsync(output, 0, sizeof(float), stream); for (int idx = 0; idx < batchSize; ++idx) { CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream)); } int numElem = 0; - int grids[3][2] = { {mYoloV8netHeight / 8, mYoloV8NetWidth / 8}, {mYoloV8netHeight / 16, mYoloV8NetWidth / 16}, {mYoloV8netHeight / 32, mYoloV8NetWidth / 32} }; - int strides[] = { 8, 16, 32 }; - for (unsigned int i = 0; i < 3; i++) { + + const int maxGrids = mStridesLength; + int grids[maxGrids][2]; + for (int i = 0; i < maxGrids; ++i) { + grids[i][0] = mYoloV8netHeight / mStrides[i]; + grids[i][1] = mYoloV8NetWidth / mStrides[i]; + } + + for (unsigned int i = 0; i < maxGrids; i++) { int grid_h = grids[i][0]; int grid_w = grids[i][1]; - int stride = strides[i]; + int stride = mStrides[i]; numElem = grid_h * grid_w * batchSize; - if (numElem < mThreadCount) mThreadCount = numElem; + if (numElem < mThreadCount) + mThreadCount = numElem; - CalDetection << <(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream >> > - (inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, outputElem, is_segmentation_); + CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>( + inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, outputElem, + is_segmentation_); } } @@ -219,19 +258,24 @@ const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT { IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { assert(fc->nbFields == 1); - assert(strcmp(fc->fields[0].name, "netinfo") == 0); - int* p_netinfo = (int*)(fc->fields[0].data); - int class_count = p_netinfo[0]; - int input_w = p_netinfo[1]; - int input_h = p_netinfo[2]; - int max_output_object_count = p_netinfo[3]; - bool is_segmentation = p_netinfo[4]; - YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, is_segmentation); + assert(strcmp(fc->fields[0].name, "combinedInfo") == 0); + const int* combinedInfo = static_cast(fc->fields[0].data); + int netinfo_count = 5; + int class_count = combinedInfo[0]; + int input_w = combinedInfo[1]; + int input_h = combinedInfo[2]; + int max_output_object_count = combinedInfo[3]; + bool is_segmentation = combinedInfo[4]; + const int* px_arry = combinedInfo + netinfo_count; + int px_arry_length = fc->fields[0].length - netinfo_count; + YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, is_segmentation, + px_arry, px_arry_length); obj->setPluginNamespace(mNamespace.c_str()); return obj; } -IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { +IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, + size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call YoloLayerPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); @@ -239,4 +283,4 @@ IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const voi return obj; } -} // namespace nvinfer1 +} // namespace nvinfer1 diff --git a/yolov8/plugin/yololayer.h b/yolov8/plugin/yololayer.h index 514c1f12..9496e7fc 100644 --- a/yolov8/plugin/yololayer.h +++ b/yolov8/plugin/yololayer.h @@ -1,102 +1,106 @@ #pragma once -#include "macros.h" -#include "NvInfer.h" #include #include +#include 
"NvInfer.h" #include "macros.h" namespace nvinfer1 { class API YoloLayerPlugin : public IPluginV2IOExt { -public: - YoloLayerPlugin(int classCount, int netWdith, int netHeight, int maxOut, bool is_segmentation); - YoloLayerPlugin(const void* data, size_t length); - ~YoloLayerPlugin(); + public: + YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, const int* strides, + int stridesLength); - int getNbOutputs() const TRT_NOEXCEPT override { - return 1; - } + YoloLayerPlugin(const void* data, size_t length); + ~YoloLayerPlugin(); - nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } - int initialize() TRT_NOEXCEPT override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; - virtual void terminate() TRT_NOEXCEPT override {} + int initialize() TRT_NOEXCEPT override; - virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } + virtual void terminate() TRT_NOEXCEPT override {} - virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; + virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } - virtual size_t getSerializationSize() const TRT_NOEXCEPT override; + virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; - virtual void serialize(void* buffer) const TRT_NOEXCEPT override; + virtual size_t getSerializationSize() const TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { - return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; - } + virtual void serialize(void* buffer) const TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) const TRT_NOEXCEPT override { + return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; + } - const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginType() const TRT_NOEXCEPT override; - const char* getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - void destroy() TRT_NOEXCEPT override; + void destroy() TRT_NOEXCEPT override; - IPluginV2IOExt* clone() const TRT_NOEXCEPT override; + IPluginV2IOExt* clone() const TRT_NOEXCEPT override; - void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; + void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; - const char* getPluginNamespace() const TRT_NOEXCEPT override; + const char* getPluginNamespace() const TRT_NOEXCEPT override; - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const TRT_NOEXCEPT; + nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, + int32_t nbInputs) const TRT_NOEXCEPT; - bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; + bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, + int nbInputs) const TRT_NOEXCEPT override; - bool 
canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; + bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; - void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; + void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, + IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; - void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, int32_t nbOutput) TRT_NOEXCEPT override; + void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, + int32_t nbOutput) TRT_NOEXCEPT override; - void detachFromContext() TRT_NOEXCEPT override; + void detachFromContext() TRT_NOEXCEPT override; - private: - void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, int mYoloV8NetWidth, int batchSize); - int mThreadCount = 256; - const char* mPluginNamespace; - int mClassCount; - int mYoloV8NetWidth; - int mYoloV8netHeight; - int mMaxOutObject; - bool is_segmentation_; - }; + private: + void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, + int mYoloV8NetWidth, int batchSize); + int mThreadCount = 256; + const char* mPluginNamespace; + int mClassCount; + int mYoloV8NetWidth; + int mYoloV8netHeight; + int mMaxOutObject; + bool is_segmentation_; + int* mStrides; + int mStridesLength; +}; class API YoloPluginCreator : public IPluginCreator { -public: - YoloPluginCreator(); - ~YoloPluginCreator() override = default; - - const char* getPluginName() const TRT_NOEXCEPT override; + public: + YoloPluginCreator(); + ~YoloPluginCreator() override = default; - const char* getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; - const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2IOExt* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; - nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; + nvinfer1::IPluginV2IOExt* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; - void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { - mNamespace = libNamespace; - } + nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; - const char* getPluginNamespace() const TRT_NOEXCEPT override { - return mNamespace.c_str(); - } + void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } - private: - std::string mNamespace; - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - }; - REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); -} // namespace nvinfer1 + const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } + private: + std::string mNamespace; + static PluginFieldCollection mFC; + static std::vector mPluginAttributes; +}; +REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); +} // namespace nvinfer1 diff --git a/yolov8/src/block.cpp b/yolov8/src/block.cpp index 96cf4e12..713f86ef 100644 --- a/yolov8/src/block.cpp +++ b/yolov8/src/block.cpp @@ 
-219,23 +219,47 @@ nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map dets, bool is_segmentation) { + std::vector dets, const int* px_arry, + int px_arry_num, bool is_segmentation) { auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); - - nvinfer1::PluginField plugin_fields[1]; - int netinfo[5] = {kNumClass, kInputW, kInputH, kMaxNumOutputBbox, is_segmentation}; - plugin_fields[0].data = netinfo; - plugin_fields[0].length = 5; - plugin_fields[0].name = "netinfo"; - plugin_fields[0].type = nvinfer1::PluginFieldType::kFLOAT32; - nvinfer1::PluginFieldCollection plugin_data; - plugin_data.nbFields = 1; - plugin_data.fields = plugin_fields; - nvinfer1::IPluginV2* plugin_obj = creator->createPlugin("yololayer", &plugin_data); - std::vector input_tensors; + const int netinfo_count = 5; // Assuming the first 5 elements are for netinfo as per existing code. + const int total_count = netinfo_count + px_arry_num; // Total number of elements for netinfo and px_arry combined. + + std::vector combinedInfo(total_count); + // Fill in the first 5 elements as per existing netinfo. + combinedInfo[0] = kNumClass; + combinedInfo[1] = kInputW; + combinedInfo[2] = kInputH; + combinedInfo[3] = kMaxNumOutputBbox; + combinedInfo[4] = is_segmentation; + + // Copy the contents of px_arry into the combinedInfo vector after the initial 5 elements. + std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count); + + // Now let's create the PluginField object to hold this combined information. + nvinfer1::PluginField pluginField; + pluginField.name = "combinedInfo"; // This can be any name that the plugin will recognize + pluginField.data = combinedInfo.data(); + pluginField.type = nvinfer1::PluginFieldType::kINT32; + pluginField.length = combinedInfo.size(); + + // Create the PluginFieldCollection to hold the PluginField object. + nvinfer1::PluginFieldCollection pluginFieldCollection; + pluginFieldCollection.nbFields = 1; // We have just one field, but it's a combined array + pluginFieldCollection.fields = &pluginField; + + // Create the plugin object using the PluginFieldCollection. + nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection); + + // We assume that the plugin is to be added onto the network. + // Prepare input tensors for the YOLO Layer. + std::vector inputTensors; for (auto det : dets) { - input_tensors.push_back(det->getOutput(0)); + inputTensors.push_back(det->getOutput(0)); // Assuming each IConcatenationLayer has one output tensor. } - auto yolo = network->addPluginV2(&input_tensors[0], input_tensors.size(), *plugin_obj); - return yolo; + + // Add the plugin to the network using the prepared input tensors. + nvinfer1::IPluginV2Layer* yoloLayer = network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject); + + return yoloLayer; // Return the added YOLO layer. 
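+
+    // For reference (summary of the layout consumed by YoloPluginCreator::createPlugin in plugin/yololayer.cu):
+    // the "combinedInfo" field packed above carries 5 leading int32 values
+    // {kNumClass, kInputW, kInputH, kMaxNumOutputBbox, is_segmentation}, followed by the px_arry strides.
+    // createPlugin recovers the stride count as fc->fields[0].length - 5, e.g. {8, 16, 32} for the standard
+    // detect head or {4, 8, 16, 32} for the P2 head.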
} diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index e0e7f088..9cfc8dd5 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -106,7 +106,6 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); - /******************************************************************************************************* ********************************************* YOLOV8 HEAD ******************************************** *******************************************************************************************************/ @@ -118,6 +117,7 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer nvinfer1::ITensor* inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat11 = network->addConcatenation(inputTensor11, 2); + nvinfer1::IElementWiseLayer* conv12 = C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); @@ -220,48 +220,54 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ + int strides[] = {8, 16, 32}; + int stridesLength = sizeof(strides) / sizeof(int); + nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); - shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 8) * (kInputW / 8)}); - - nvinfer1::ISliceLayer* split22_0_0 = - network->addSlice(*shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split22_0_1 = - network->addSlice(*shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl22_0 = DFL(network, weightMap, *split22_0_0->getOutput(0), 4, - (kInputH / 8) * (kInputW / 8), 1, 1, 0, "model.22.dfl.conv.weight"); + shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer* split22_0_0 = network->addSlice( + *shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split22_0_1 = network->addSlice( + *shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_0 = + DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.22.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 2); nvinfer1::IShuffleLayer* shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); - shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 16) * (kInputW / 16)}); - nvinfer1::ISliceLayer* split22_1_0 = - network->addSlice(*shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, - 
nvinfer1::Dims2{64, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split22_1_1 = - network->addSlice(*shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl22_1 = DFL(network, weightMap, *split22_1_0->getOutput(0), 4, - (kInputH / 16) * (kInputW / 16), 1, 1, 0, "model.22.dfl.conv.weight"); + shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer* split22_1_0 = network->addSlice( + *shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split22_1_1 = network->addSlice( + *shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_1 = + DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.22.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 2); nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); - shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 32) * (kInputW / 32)}); - nvinfer1::ISliceLayer* split22_2_0 = - network->addSlice(*shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split22_2_1 = - network->addSlice(*shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl22_2 = DFL(network, weightMap, *split22_2_0->getOutput(0), 4, - (kInputH / 32) * (kInputW / 32), 1, 1, 0, "model.22.dfl.conv.weight"); + shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer* split22_2_0 = network->addSlice( + *shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split22_2_1 = network->addSlice( + *shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_2 = + DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.22.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 2); nvinfer1::IPluginV2Layer* yolo = - addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}); + addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, + strides, stridesLength, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); @@ -295,7 +301,21 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = 
loadWeights(wts_path); + for (const auto& kv : weightMap) { + if (kv.first.find("conv.weight") != std::string::npos || + kv.first.find("linear.weight") != std::string::npos) { // 检查 conv.weight 或 linear.weight + std::cout << "Weight name: " << kv.first << ", "; + std::cout << "Count: " << kv.second.count << ", "; + std::cout << "Type: " + << (kv.second.type == nvinfer1::DataType::kFLOAT ? "FLOAT" + : kv.second.type == nvinfer1::DataType::kHALF ? "HALF" + : "INT8") + << std::endl; + } + } + nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); + std::cout << "gd: " << gd << ", gw: " << gw << std::endl; /******************************************************************************************************* ****************************************** YOLOV8 INPUT ********************************************** *******************************************************************************************************/ @@ -321,15 +341,18 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf // 22466 nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(768, gw, max_channels), 3, 2, 1, "model.7"); nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(768, gw, max_channels), get_width(768, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + nvinfer1::IElementWiseLayer* conv9 = convBnSiLU(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.9"); nvinfer1::IElementWiseLayer* conv10 = C2F(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.10"); + nvinfer1::IElementWiseLayer* conv11 = SPPF(network, weightMap, *conv10->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.11"); @@ -413,9 +436,12 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf network->addConvolutionNd(*conv30_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv2.0.2.weight"], weightMap["model.30.cv2.0.2.bias"]); conv30_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv30_cv3_0_0 = convBnSiLU(network, weightMap, *conv20->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.0"); + nvinfer1::IElementWiseLayer* conv30_cv3_0_1 = convBnSiLU(network, weightMap, *conv30_cv3_0_0->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.1"); nvinfer1::IConvolutionLayer* conv30_cv3_0_2 = @@ -495,67 +521,391 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf /******************************************************************************************************* ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ + int strides[] = {8, 16, 32, 64}; + int stridesLength = sizeof(strides) / sizeof(int); + // P3 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle30_0 = network->addShuffle(*cat30_0->getOutput(0)); // Reusing the previous cat30_0 as P3 concatenation layer - 
shuffle30_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 8) * (kInputW / 8)}); - nvinfer1::ISliceLayer* split30_0_0 = - network->addSlice(*shuffle30_0->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split30_0_1 = - network->addSlice(*shuffle30_0->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl30_0 = DFL(network, weightMap, *split30_0_0->getOutput(0), 4, - (kInputH / 8) * (kInputW / 8), 1, 1, 0, "model.30.dfl.conv.weight"); + shuffle30_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer* split30_0_0 = network->addSlice( + *shuffle30_0->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_0_1 = network->addSlice( + *shuffle30_0->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_0 = + DFL(network, weightMap, *split30_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.30.dfl.conv.weight"); nvinfer1::ITensor* inputTensor30_dfl_0[] = {dfl30_0->getOutput(0), split30_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_0 = network->addConcatenation(inputTensor30_dfl_0, 2); // P4 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle30_1 = network->addShuffle(*cat30_1->getOutput(0)); // Reusing the previous cat30_1 as P4 concatenation layer - shuffle30_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 16) * (kInputW / 16)}); - nvinfer1::ISliceLayer* split30_1_0 = - network->addSlice(*shuffle30_1->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split30_1_1 = - network->addSlice(*shuffle30_1->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl30_1 = DFL(network, weightMap, *split30_1_0->getOutput(0), 4, - (kInputH / 16) * (kInputW / 16), 1, 1, 0, "model.30.dfl.conv.weight"); + shuffle30_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer* split30_1_0 = network->addSlice( + *shuffle30_1->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_1_1 = network->addSlice( + *shuffle30_1->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_1 = + DFL(network, weightMap, *split30_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.30.dfl.conv.weight"); nvinfer1::ITensor* inputTensor30_dfl_1[] = {dfl30_1->getOutput(0), split30_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_1 = network->addConcatenation(inputTensor30_dfl_1, 2); // P5 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle30_2 = network->addShuffle(*cat30_2->getOutput(0)); // Reusing the previous cat30_2 as P5 concatenation layer - 
shuffle30_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 32) * (kInputW / 32)}); - nvinfer1::ISliceLayer* split30_2_0 = - network->addSlice(*shuffle30_2->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split30_2_1 = - network->addSlice(*shuffle30_2->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl30_2 = DFL(network, weightMap, *split30_2_0->getOutput(0), 4, - (kInputH / 32) * (kInputW / 32), 1, 1, 0, "model.30.dfl.conv.weight"); + shuffle30_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer* split30_2_0 = network->addSlice( + *shuffle30_2->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_2_1 = network->addSlice( + *shuffle30_2->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_2 = + DFL(network, weightMap, *split30_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.30.dfl.conv.weight"); nvinfer1::ITensor* inputTensor30_dfl_2[] = {dfl30_2->getOutput(0), split30_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_2 = network->addConcatenation(inputTensor30_dfl_2, 2); // P6 processing steps nvinfer1::IShuffleLayer* shuffle30_3 = network->addShuffle(*cat30_3->getOutput(0)); - shuffle30_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 64) * (kInputW / 64)}); - nvinfer1::ISliceLayer* split30_3_0 = - network->addSlice(*shuffle30_3->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 64) * (kInputW / 64)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split30_3_1 = - network->addSlice(*shuffle30_3->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 64) * (kInputW / 64)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl30_3 = DFL(network, weightMap, *split30_3_0->getOutput(0), 4, - (kInputH / 64) * (kInputW / 64), 1, 1, 0, "model.30.dfl.conv.weight"); + shuffle30_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); + nvinfer1::ISliceLayer* split30_3_0 = network->addSlice( + *shuffle30_3->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_3_1 = network->addSlice( + *shuffle30_3->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_3 = + DFL(network, weightMap, *split30_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, + 1, 0, "model.30.dfl.conv.weight"); nvinfer1::ITensor* inputTensor30_dfl_3[] = {dfl30_3->getOutput(0), split30_3_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_3 = network->addConcatenation(inputTensor30_dfl_3, 2); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( - network, std::vector{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3}); + network, std::vector{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3}, + strides, stridesLength, false); + yolo->getOutput(0)->setName(kOutputTensorName); + 
network->markOutput(*yolo->getOutput(0)); + + builder->setMaxBatchSize(kBatchSize); + config->setMaxWorkspaceSize(16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto* calibrator = + new Int8EntropyCalibrator2(1, kInputW, kInputH, "../coco_calib/", "int8calib.table", kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" << std::endl; + + delete network; + + for (auto& mem : weightMap) { + free((void*)(mem.second.values)); + } + return serialized_model; +} + +nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, + int& max_channels) { + + std::cout << "buildEngineYolov8DetP2 " << std::endl; + + std::map weightMap = loadWeights(wts_path); + nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); + + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); + assert(data); + + /******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer* conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer* conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + // 11233 + nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer* conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + // 22466 + nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer* conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + // 22466 + nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + nvinfer1::IElementWiseLayer* conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); + // 11233 + nvinfer1::IElementWiseLayer* conv8 = + C2F(network, weightMap, 
*conv7->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + nvinfer1::IElementWiseLayer* conv9 = + SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.9"); + + /******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + // Head + float scale[] = {1.0, 2.0, 2.0}; // scale used for upsampling + + // P4 + nvinfer1::IResizeLayer* upsample10 = network->addResize( + *conv9->getOutput(0)); // Assuming conv9 is the last layer of the backbone as per P5 in your first section. + upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + upsample10->setScales(scale, 3); + nvinfer1::ITensor* concat11_inputs[] = { + upsample10->getOutput(0), + conv6->getOutput(0)}; // Assuming conv6 corresponds to "backbone P4" as per your pseudocode + nvinfer1::IConcatenationLayer* concat11 = network->addConcatenation(concat11_inputs, 2); + nvinfer1::IElementWiseLayer* conv12 = + C2F(network, weightMap, *concat11->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); + + // P3 + nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0)); + upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + upsample13->setScales(scale, 3); + nvinfer1::ITensor* concat14_inputs[] = {upsample13->getOutput(0), + conv4->getOutput(0)}; // Assuming conv4 corresponds to "backbone P3" + nvinfer1::IConcatenationLayer* concat14 = network->addConcatenation(concat14_inputs, 2); + nvinfer1::IElementWiseLayer* conv15 = + C2F(network, weightMap, *concat14->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); + + // P2 + nvinfer1::IResizeLayer* upsample16 = network->addResize(*conv15->getOutput(0)); + upsample16->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + upsample16->setScales(scale, 3); + nvinfer1::ITensor* concat17_inputs[] = {upsample16->getOutput(0), + conv2->getOutput(0)}; // Assuming conv2 corresponds to "backbone P2" + nvinfer1::IConcatenationLayer* concat17 = network->addConcatenation(concat17_inputs, 2); + nvinfer1::IElementWiseLayer* conv18 = + C2F(network, weightMap, *concat17->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); + + // Additional layers for P3, P4, P5 + // Downsample and concatenate for P3 + nvinfer1::IElementWiseLayer* conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), + get_width(128, gw, max_channels), 3, 2, 1, "model.19"); + nvinfer1::ITensor* concat20_inputs[] = { + conv19->getOutput(0), conv15->getOutput(0)}; // concatenate with higher-resolution feature map from P3 + nvinfer1::IConcatenationLayer* concat20 = network->addConcatenation(concat20_inputs, 2); + nvinfer1::IElementWiseLayer* conv21 = + C2F(network, weightMap, *concat20->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); + + // Downsample and concatenate for P4 + nvinfer1::IElementWiseLayer* conv22 = convBnSiLU(network, weightMap, *conv21->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.22"); + 
nvinfer1::ITensor* concat23_inputs[] = { + conv22->getOutput(0), conv12->getOutput(0)}; // concatenate with higher-resolution feature map from P4 + nvinfer1::IConcatenationLayer* concat23 = network->addConcatenation(concat23_inputs, 2); + nvinfer1::IElementWiseLayer* conv24 = + C2F(network, weightMap, *concat23->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.24"); + + // Downsample and concatenate for P5 + nvinfer1::IElementWiseLayer* conv25 = convBnSiLU(network, weightMap, *conv24->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.25"); + nvinfer1::ITensor* concat26_inputs[] = { + conv25->getOutput(0), conv9->getOutput(0)}; // concatenate with higher-resolution feature map from P5 + nvinfer1::IConcatenationLayer* concat26 = network->addConcatenation(concat26_inputs, 2); + nvinfer1::IElementWiseLayer* conv27 = + C2F(network, weightMap, *concat26->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.27"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ + int base_in_channel = 64; + int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(128, gw, max_channels); + + std::cout << "base_in_channel is : " << base_in_channel << std::endl; + std::cout << "base_out_channel is : " << base_out_channel << std::endl; + + // output0 + nvinfer1::IElementWiseLayer* conv28_cv2_0_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.0.0"); + nvinfer1::IElementWiseLayer* conv28_cv2_0_1 = + convBnSiLU(network, weightMap, *conv28_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.0.1"); + nvinfer1::IConvolutionLayer* conv28_cv2_0_2 = + network->addConvolutionNd(*conv28_cv2_0_1->getOutput(0), base_in_channel, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.0.2.weight"], weightMap["model.28.cv2.0.2.bias"]); + conv28_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv28_cv3_0_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.0.0"); + nvinfer1::IElementWiseLayer* conv28_cv3_0_1 = convBnSiLU(network, weightMap, *conv28_cv3_0_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.28.cv3.0.1"); + nvinfer1::IConvolutionLayer* conv28_cv3_0_2 = + network->addConvolutionNd(*conv28_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.0.2.weight"], weightMap["model.28.cv3.0.2.bias"]); + conv28_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); + conv28_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor28_0[] = {conv28_cv2_0_2->getOutput(0), conv28_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_0 = network->addConcatenation(inputTensor28_0, 2); + + // output1 + nvinfer1::IElementWiseLayer* conv28_cv2_1_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.1.0"); + nvinfer1::IElementWiseLayer* conv28_cv2_1_1 = + convBnSiLU(network, weightMap, *conv28_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.1.1"); + nvinfer1::IConvolutionLayer* conv28_cv2_1_2 = 
+ network->addConvolutionNd(*conv28_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.1.2.weight"], weightMap["model.28.cv2.1.2.bias"]); + conv28_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv28_cv3_1_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.1.0"); + nvinfer1::IElementWiseLayer* conv28_cv3_1_1 = convBnSiLU(network, weightMap, *conv28_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.28.cv3.1.1"); + nvinfer1::IConvolutionLayer* conv28_cv3_1_2 = + network->addConvolutionNd(*conv28_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.1.2.weight"], weightMap["model.28.cv3.1.2.bias"]); + conv28_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor28_1[] = {conv28_cv2_1_2->getOutput(0), conv28_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_1 = network->addConcatenation(inputTensor28_1, 2); + + // output2 + nvinfer1::IElementWiseLayer* conv28_cv2_2_0 = + convBnSiLU(network, weightMap, *conv24->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.2.0"); + nvinfer1::IElementWiseLayer* conv28_cv2_2_1 = + convBnSiLU(network, weightMap, *conv28_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.2.1"); + nvinfer1::IConvolutionLayer* conv28_cv2_2_2 = + network->addConvolution(*conv28_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.2.2.weight"], weightMap["model.28.cv2.2.2.bias"]); + conv28_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv28_cv3_2_0 = + convBnSiLU(network, weightMap, *conv24->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.2.0"); + nvinfer1::IElementWiseLayer* conv28_cv3_2_1 = convBnSiLU(network, weightMap, *conv28_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.28.cv3.2.1"); + nvinfer1::IConvolutionLayer* conv28_cv3_2_2 = + network->addConvolution(*conv28_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.2.2.weight"], weightMap["model.28.cv3.2.2.bias"]); + conv28_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor28_2[] = {conv28_cv2_2_2->getOutput(0), conv28_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_2 = network->addConcatenation(inputTensor28_2, 2); + + // output3 + nvinfer1::IElementWiseLayer* conv28_cv2_3_0 = + convBnSiLU(network, weightMap, *conv27->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.3.0"); + nvinfer1::IElementWiseLayer* conv28_cv2_3_1 = + convBnSiLU(network, weightMap, *conv28_cv2_3_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.3.1"); + nvinfer1::IConvolutionLayer* conv28_cv2_3_2 = + network->addConvolution(*conv28_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.3.2.weight"], weightMap["model.28.cv2.3.2.bias"]); + conv28_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv28_cv3_3_0 = + convBnSiLU(network, weightMap, *conv27->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.3.0"); + nvinfer1::IElementWiseLayer* conv28_cv3_3_1 = convBnSiLU(network, weightMap, *conv28_cv3_3_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.28.cv3.3.1"); + 
nvinfer1::IConvolutionLayer* conv28_cv3_3_2 = + network->addConvolution(*conv28_cv3_3_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.3.2.weight"], weightMap["model.28.cv3.3.2.bias"]); + conv28_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor28_3[] = {conv28_cv2_3_2->getOutput(0), conv28_cv3_3_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_3 = network->addConcatenation(inputTensor28_3, 2); + + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + + int strides[] = {4, 8, 16, 32}; + int stridesLength = sizeof(strides) / sizeof(int); + + // P2 processing steps (remains unchanged) + std::cout << "kNumClass is : " << kNumClass << std::endl; + std::cout << "kInputH is : " << kInputH << std::endl; + nvinfer1::IShuffleLayer* shuffle28_0 = network->addShuffle(*cat28_0->getOutput(0)); + shuffle28_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer* split28_0_0 = network->addSlice( + *shuffle28_0->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split28_0_1 = network->addSlice( + *shuffle28_0->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl28_0 = + DFL(network, weightMap, *split28_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor* inputTensor28_dfl_0[] = {dfl28_0->getOutput(0), split28_0_1->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_dfl_0 = network->addConcatenation(inputTensor28_dfl_0, 2); + + // P3 processing steps (remains unchanged) + nvinfer1::IShuffleLayer* shuffle28_1 = network->addShuffle(*cat28_1->getOutput(0)); + shuffle28_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer* split28_1_0 = network->addSlice( + *shuffle28_1->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split28_1_1 = network->addSlice( + *shuffle28_1->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl28_1 = + DFL(network, weightMap, *split28_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor* inputTensor28_dfl_1[] = {dfl28_1->getOutput(0), split28_1_1->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_dfl_1 = network->addConcatenation(inputTensor28_dfl_1, 2); + + // P4 processing steps (remains unchanged) + nvinfer1::IShuffleLayer* shuffle28_2 = network->addShuffle(*cat28_2->getOutput(0)); + shuffle28_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer* split28_2_0 = network->addSlice( + *shuffle28_2->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / 
strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split28_2_1 = network->addSlice( + *shuffle28_2->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl28_2 = + DFL(network, weightMap, *split28_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor* inputTensor28_dfl_2[] = {dfl28_2->getOutput(0), split28_2_1->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_dfl_2 = network->addConcatenation(inputTensor28_dfl_2, 2); + + // P5 processing steps + nvinfer1::IShuffleLayer* shuffle28_3 = network->addShuffle(*cat28_3->getOutput(0)); + shuffle28_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); + nvinfer1::ISliceLayer* split28_3_0 = network->addSlice( + *shuffle28_3->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split28_3_1 = network->addSlice( + *shuffle28_3->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl28_3 = + DFL(network, weightMap, *split28_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor* inputTensor28_dfl_3[] = {dfl28_3->getOutput(0), split28_3_1->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_dfl_3 = network->addConcatenation(inputTensor28_dfl_3, 2); + + nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( + network, std::vector{cat28_dfl_0, cat28_dfl_1, cat28_dfl_2, cat28_dfl_3}, + strides, stridesLength, false); + yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); @@ -820,39 +1170,44 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ - nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); - shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 8) * (kInputW / 8)}); + int strides[] = {8, 16, 32}; + int stridesLength = sizeof(strides) / sizeof(int); - nvinfer1::ISliceLayer* split22_0_0 = - network->addSlice(*shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split22_0_1 = - network->addSlice(*shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl22_0 = DFL(network, weightMap, *split22_0_0->getOutput(0), 4, - (kInputH / 8) * (kInputW / 8), 1, 1, 0, "model.22.dfl.conv.weight"); + nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); + shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer* split22_0_0 = network->addSlice( + *shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split22_0_1 = network->addSlice( + 
*shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_0 = + DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.22.dfl.conv.weight"); nvinfer1::IShuffleLayer* shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); - shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 16) * (kInputW / 16)}); - nvinfer1::ISliceLayer* split22_1_0 = - network->addSlice(*shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split22_1_1 = - network->addSlice(*shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl22_1 = DFL(network, weightMap, *split22_1_0->getOutput(0), 4, - (kInputH / 16) * (kInputW / 16), 1, 1, 0, "model.22.dfl.conv.weight"); + shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer* split22_1_0 = network->addSlice( + *shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split22_1_1 = network->addSlice( + *shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_1 = + DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.22.dfl.conv.weight"); nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); - shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 32) * (kInputW / 32)}); - nvinfer1::ISliceLayer* split22_2_0 = - network->addSlice(*shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split22_2_1 = - network->addSlice(*shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl22_2 = DFL(network, weightMap, *split22_2_0->getOutput(0), 4, - (kInputH / 32) * (kInputW / 32), 1, 1, 0, "model.22.dfl.conv.weight"); + shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer* split22_2_0 = network->addSlice( + *shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split22_2_1 = network->addSlice( + *shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_2 = + DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.22.dfl.conv.weight"); // det0 auto proto_coef_0 = ProtoCoef(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", 6400, gw); @@ -872,8 +1227,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer proto_coef_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_2 = 
network->addConcatenation(inputTensor22_dfl_2, 3); - nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( - network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, true); + nvinfer1::IPluginV2Layer* yolo = + addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, + strides, stridesLength, true); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); diff --git a/yolov8/yolov8_det.cpp b/yolov8/yolov8_det.cpp index 9fb55c87..b3fee0f9 100644 --- a/yolov8/yolov8_det.cpp +++ b/yolov8/yolov8_det.cpp @@ -13,14 +13,17 @@ Logger gLogger; using namespace nvinfer1; const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; -void serialize_engine(std::string& wts_name, std::string& engine_name, bool& is_p6, std::string& sub_type, float& gd, +void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd, float& gw, int& max_channels) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); IHostMemory* serialized_engine = nullptr; - if (is_p6) { + if (is_p == 6) { + std::cout << "Subtype: " << is_p << std::endl; serialized_engine = buildEngineYolov8DetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } else if (is_p == 2) { + serialized_engine = buildEngineYolov8DetP2(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } else { serialized_engine = buildEngineYolov8Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } @@ -116,7 +119,7 @@ void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, flo CUDA_CHECK(cudaStreamSynchronize(stream)); } -bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, bool& is_p6, std::string& img_dir, +bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir, std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) { if (argc < 4) return false; @@ -149,7 +152,11 @@ bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, bo return false; } if (sub_type.size() == 2 && sub_type[1] == '6') { - is_p6 = true; + std::cout << "Subtype: " << sub_type[1] << std::endl; + is_p = 6; + } else if (sub_type.size() == 2 && sub_type[1] == '2') { + std::cout << "Subtype: " << sub_type[1] << std::endl; + is_p = 2; } } else if (std::string(argv[1]) == "-d" && argc == 5) { engine = std::string(argv[2]); @@ -169,14 +176,15 @@ int main(int argc, char** argv) { std::string sub_type = ""; std::string cuda_post_process = ""; int model_bboxes; - bool is_p6 = false; + int is_p = 0; float gd = 0.0f, gw = 0.0f; int max_channels = 0; - if (!parse_args(argc, argv, wts_name, engine_name, is_p6, img_dir, sub_type, cuda_post_process, gd, gw, + if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw, max_channels)) { std::cerr << "Arguments not right!" 
<< std::endl; - std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6] // serialize model to plan file" + std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to " + "plan file" << std::endl; std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl; return -1; @@ -184,7 +192,7 @@ int main(int argc, char** argv) { // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { - serialize_engine(wts_name, engine_name, is_p6, sub_type, gd, gw, max_channels); + serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels); return 0; } From df11eab9056201646d729588f54f80f12bd50e46 Mon Sep 17 00:00:00 2001 From: linds Date: Wed, 17 Apr 2024 16:08:31 +0800 Subject: [PATCH 02/10] yolov8 p2 --- yolov8/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/yolov8/README.md b/yolov8/README.md index 6653e1f8..1ec527b3 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -21,11 +21,10 @@ The tensorrt code is derived from [xiaocao-tian/yolov8_tensorrt](https://github. Currently, we support yolov8 -- For yolov8 , download .pt from [https://github.com/ultralytics/assets/releases](https://github.com/ultralytics/assets/releases), then follow how-to-run in current page. -[README.md](..%2FREADME.md) +- For yolov8 , download .pt from https://github.com/ultralytics/assets/releases, then follow how-to-run in current page. ## Config -- Choose the model n/s/m/l/x/n6/s6/m6/l6/[README.md](..%2FREADME.md)x6 from command line arguments. +- Choose the model n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 from command line arguments. - Check more configs in [include/config.h](./include/config.h) ## How to Run, yolov8n as example From ee640a87e876e5f4b26d33ee2be2a7e0f24fae33 Mon Sep 17 00:00:00 2001 From: linds Date: Wed, 17 Apr 2024 16:09:54 +0800 Subject: [PATCH 03/10] yolov8 p2 --- yolov8/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yolov8/README.md b/yolov8/README.md index 1ec527b3..e77af8d5 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -21,7 +21,7 @@ The tensorrt code is derived from [xiaocao-tian/yolov8_tensorrt](https://github. Currently, we support yolov8 -- For yolov8 , download .pt from https://github.com/ultralytics/assets/releases, then follow how-to-run in current page. +- For yolov8 , download .pt from [https://github.com/ultralytics/assets/releases](https://github.com/ultralytics/assets/releases), then follow how-to-run in current page. ## Config - Choose the model n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 from command line arguments. 
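Note on the sub-type list above: the detector picks the head variant from the second character of the sub-type, as introduced in the first patch of this series (the `bool is_p6` flag became an `int is_p`). The snippet below is a minimal standalone sketch of that dispatch; the `subtype_to_p` helper and the `main` driver are illustrative only, while `sub_type`, `is_p` and the `buildEngineYolov8DetP2`/`P6` builders are the names used in the diff.

```
// Sketch: mapping the model sub_type suffix to a head variant, as in patch 01.
// '2' selects the P2 (stride-4) head, '6' the P6 (stride-64) head,
// anything else the default P3-P5 head.
#include <iostream>
#include <string>

int subtype_to_p(const std::string& sub_type) {
    if (sub_type.size() == 2 && sub_type[1] == '6') return 6;  // n6/s6/m6/l6/x6
    if (sub_type.size() == 2 && sub_type[1] == '2') return 2;  // n2/s2/m2/l2/x2
    return 0;                                                  // n/s/m/l/x
}

int main() {
    for (const std::string s : {"x", "x2", "x6"}) {
        int is_p = subtype_to_p(s);
        // In serialize_engine(): is_p == 2 -> buildEngineYolov8DetP2,
        //                        is_p == 6 -> buildEngineYolov8DetP6,
        //                        otherwise -> buildEngineYolov8Det.
        std::cout << s << " -> is_p = " << is_p << std::endl;
    }
    return 0;
}
```

Carrying `is_p` as an int rather than the old boolean is what lets `serialize_engine` grow additional head variants with a single extra branch.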
From cea11955f9167845bd0d88c7c790849e2f3638d5 Mon Sep 17 00:00:00 2001 From: linds Date: Wed, 17 Apr 2024 16:10:57 +0800 Subject: [PATCH 04/10] yolov8 p2 --- yolov8/include/config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yolov8/include/config.h b/yolov8/include/config.h index 6af9f1a9..d283860d 100644 --- a/yolov8/include/config.h +++ b/yolov8/include/config.h @@ -4,7 +4,7 @@ const static char* kInputTensorName = "images"; const static char* kOutputTensorName = "output"; -const static int kNumClass = 10; +const static int kNumClass = 80; const static int kBatchSize = 1; const static int kGpuId = 0; const static int kInputH = 640; From 20ae8f4ea44083e0aa618c143a4adf3182d97fa1 Mon Sep 17 00:00:00 2001 From: lindsayshuo <932695342@qq.com> Date: Wed, 17 Apr 2024 17:19:04 +0800 Subject: [PATCH 05/10] Update yolov8_det.cpp --- yolov8/yolov8_det.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/yolov8/yolov8_det.cpp b/yolov8/yolov8_det.cpp index b3fee0f9..b5f478bb 100644 --- a/yolov8/yolov8_det.cpp +++ b/yolov8/yolov8_det.cpp @@ -20,7 +20,6 @@ void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p IHostMemory* serialized_engine = nullptr; if (is_p == 6) { - std::cout << "Subtype: " << is_p << std::endl; serialized_engine = buildEngineYolov8DetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } else if (is_p == 2) { serialized_engine = buildEngineYolov8DetP2(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); @@ -152,10 +151,8 @@ bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, in return false; } if (sub_type.size() == 2 && sub_type[1] == '6') { - std::cout << "Subtype: " << sub_type[1] << std::endl; is_p = 6; } else if (sub_type.size() == 2 && sub_type[1] == '2') { - std::cout << "Subtype: " << sub_type[1] << std::endl; is_p = 2; } } else if (std::string(argv[1]) == "-d" && argc == 5) { From 6ce06d5383f8163c9eff0a2eefd2e8bbac1033eb Mon Sep 17 00:00:00 2001 From: lindsayshuo <932695342@qq.com> Date: Wed, 17 Apr 2024 17:20:22 +0800 Subject: [PATCH 06/10] Update model.cpp --- yolov8/src/model.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 9cfc8dd5..56c5d6e7 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -622,9 +622,6 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { - - std::cout << "buildEngineYolov8DetP2 " << std::endl; - std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); From 246cb7509a37eeef84b26b35b5f63570b612e66f Mon Sep 17 00:00:00 2001 From: lindsayshuo <932695342@qq.com> Date: Wed, 17 Apr 2024 17:21:36 +0800 Subject: [PATCH 07/10] Update model.cpp --- yolov8/src/model.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 56c5d6e7..5e85a7a5 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -739,9 +739,6 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf int base_in_channel = 64; int base_out_channel = (gw == 0.25) ? 
std::max(64, std::min(kNumClass, 100)) : get_width(128, gw, max_channels); - std::cout << "base_in_channel is : " << base_in_channel << std::endl; - std::cout << "base_out_channel is : " << base_out_channel << std::endl; - // output0 nvinfer1::IElementWiseLayer* conv28_cv2_0_0 = convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.0.0"); @@ -838,8 +835,6 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf int stridesLength = sizeof(strides) / sizeof(int); // P2 processing steps (remains unchanged) - std::cout << "kNumClass is : " << kNumClass << std::endl; - std::cout << "kInputH is : " << kInputH << std::endl; nvinfer1::IShuffleLayer* shuffle28_0 = network->addShuffle(*cat28_0->getOutput(0)); shuffle28_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split28_0_0 = network->addSlice( From ebba1f9396f9454cb3fe0b4365ed4d396587eb55 Mon Sep 17 00:00:00 2001 From: lindsayshuo <932695342@qq.com> Date: Wed, 17 Apr 2024 17:22:22 +0800 Subject: [PATCH 08/10] Update model.cpp --- yolov8/src/model.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 5e85a7a5..08a7b8fd 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -301,19 +301,6 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); - for (const auto& kv : weightMap) { - if (kv.first.find("conv.weight") != std::string::npos || - kv.first.find("linear.weight") != std::string::npos) { // 检查 conv.weight 或 linear.weight - std::cout << "Weight name: " << kv.first << ", "; - std::cout << "Count: " << kv.second.count << ", "; - std::cout << "Type: " - << (kv.second.type == nvinfer1::DataType::kFLOAT ? "FLOAT" - : kv.second.type == nvinfer1::DataType::kHALF ? 
"HALF" - : "INT8") - << std::endl; - } - } - nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); std::cout << "gd: " << gd << ", gw: " << gw << std::endl; /******************************************************************************************************* From a149c50f3d957a790b5b41c7d037c72ba018e474 Mon Sep 17 00:00:00 2001 From: lindsayshuo <932695342@qq.com> Date: Wed, 17 Apr 2024 17:23:12 +0800 Subject: [PATCH 09/10] Update model.cpp --- yolov8/src/model.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 08a7b8fd..316b1301 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -302,7 +302,6 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf int& max_channels) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); - std::cout << "gd: " << gd << ", gw: " << gw << std::endl; /******************************************************************************************************* ****************************************** YOLOV8 INPUT ********************************************** *******************************************************************************************************/ From 917210a4f75e881850fdc726d556ef23f6a17b22 Mon Sep 17 00:00:00 2001 From: linds Date: Tue, 23 Apr 2024 10:08:22 +0800 Subject: [PATCH 10/10] fix strides --- yolov8/src/model.cpp | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 316b1301..0b105c97 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -20,6 +20,15 @@ static int get_depth(int x, float gd) { return std::max(r, 1); } +void calculateStrides(nvinfer1::IElementWiseLayer* conv_layers[], int size, int reference_size, int strides[]) { + for (int i = 0; i < size; ++i) { + nvinfer1::ILayer* layer = conv_layers[i]; + nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); + int feature_map_size = dims.d[1]; + strides[i] = reference_size / feature_map_size; + } +} + static nvinfer1::IElementWiseLayer* Proto(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, std::string lname, float gw, int max_channels) { @@ -220,7 +229,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ - int strides[] = {8, 16, 32}; + nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); @@ -507,7 +518,9 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf /******************************************************************************************************* ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ - int strides[] = {8, 16, 32, 64}; + nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7, conv9}; + int strides[sizeof(conv_layers) 
/ sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); // P3 processing steps (remains unchanged) @@ -817,7 +830,9 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ - int strides[] = {4, 8, 16, 32}; + nvinfer1::IElementWiseLayer* conv_layers[] = {conv1, conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); // P2 processing steps (remains unchanged) @@ -1148,7 +1163,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ - int strides[] = {8, 16, 32}; + nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0));
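As a closing note on the "fix strides" patch: `calculateStrides` derives each stride as the ratio of the network input size to the height of the corresponding feature map (`dims.d[1]`), rather than relying on a hard-coded table such as {8, 16, 32} or {4, 8, 16, 32}. Below is a minimal standalone sketch of the same arithmetic; the 640x640 input and the 160/80/40/20 feature-map sizes are assumed values for a P2 model, whereas the real code reads the sizes from each conv layer's output dimensions.

```
// Sketch of the stride computation introduced in the "fix strides" patch:
// stride = input size / feature-map size, taken per detection scale.
#include <cstdio>

int main() {
    const int kInputH = 640;
    const int feature_map_sizes[] = {160, 80, 40, 20};  // assumed P2, P3, P4, P5 sizes
    const int n = sizeof(feature_map_sizes) / sizeof(feature_map_sizes[0]);

    int strides[n];
    for (int i = 0; i < n; ++i) {
        strides[i] = kInputH / feature_map_sizes[i];  // mirrors calculateStrides()
    }

    for (int i = 0; i < n; ++i) {
        std::printf("scale %d: %dx%d -> stride %d\n", i, feature_map_sizes[i], feature_map_sizes[i], strides[i]);
    }
    return 0;  // prints strides 4, 8, 16, 32
}
```

Deriving the strides from the layers themselves keeps the shuffle/slice dimensions and the YOLO plugin consistent with whatever input size and head configuration the engine is actually built with, which the earlier hard-coded tables could not guarantee.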