From 37ddb3044f25706fa273aa55f3ddb9ede2f180e0 Mon Sep 17 00:00:00 2001 From: linds Date: Wed, 17 Apr 2024 16:01:35 +0800 Subject: [PATCH 01/24] yolov8 p2 --- yolov8/README.md | 17 +- yolov8/include/block.h | 3 +- yolov8/include/config.h | 7 +- yolov8/include/model.h | 4 + yolov8/plugin/yololayer.cu | 150 ++++++---- yolov8/plugin/yololayer.h | 130 ++++----- yolov8/src/block.cpp | 56 ++-- yolov8/src/model.cpp | 550 ++++++++++++++++++++++++++++++------- yolov8/yolov8_det.cpp | 24 +- 9 files changed, 696 insertions(+), 245 deletions(-) diff --git a/yolov8/README.md b/yolov8/README.md index 4ca978ba..6653e1f8 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -22,10 +22,10 @@ The tensorrt code is derived from [xiaocao-tian/yolov8_tensorrt](https://github. Currently, we support yolov8 - For yolov8 , download .pt from [https://github.com/ultralytics/assets/releases](https://github.com/ultralytics/assets/releases), then follow how-to-run in current page. - + ## Config -- Choose the model n/s/m/l/x/n6/s6/m6/l6/x6 from command line arguments. +- Choose the model n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 from command line arguments. - Check more configs in [include/config.h](./include/config.h) ## How to Run, yolov8n as example @@ -34,10 +34,13 @@ Currently, we support yolov8 ``` // download https://github.com/ultralytics/assets/releases/yolov8n.pt +// download https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt (only for 10 cls p2 model) cp {tensorrtx}/yolov8/gen_wts.py {ultralytics}/ultralytics cd {ultralytics}/ultralytics python gen_wts.py -w yolov8n.pt -o yolov8n.wts -t detect // a file 'yolov8n.wts' will be generated. 
+python gen_wts.py -w VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt -o VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts -t detect (only for 10 cls p2 model) +// a file 'VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts' will be generated. ``` 2. build tensorrtx/yolov8 and run @@ -51,12 +54,20 @@ cd build cp {ultralytics}/ultralytics/yolov8.wts {tensorrtx}/yolov8/build cmake .. make -sudo ./yolov8_det -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6] // serialize model to plan file +sudo ./yolov8_det -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file sudo ./yolov8_det -d [.engine] [image folder] [c/g] // deserialize and run inference, the images in [image folder] will be processed. // For example yolov8 sudo ./yolov8_det -s yolov8n.wts yolov8.engine n sudo ./yolov8_det -d yolov8n.engine ../images c //cpu postprocess sudo ./yolov8_det -d yolov8n.engine ../images g //gpu postprocess + +for p2 model: +change the "const static int kNumClass" in config.h to 10; +sudo ./yolov8_det -s VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine x2 +wget https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last/0000008_01999_d_0000040.jpg +cp -r 0000008_01999_d_0000040.jpg ../images +sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images c //cpu postprocess +sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images g //gpu postprocess ``` ### Instance Segmentation diff --git a/yolov8/include/block.h b/yolov8/include/block.h index 3df49c8c..ed4c6ad1 100644 --- a/yolov8/include/block.h +++ b/yolov8/include/block.h @@ -26,4 +26,5 @@ nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map dets, bool is_segmentation = false); + std::vector dets, const int* px_arry, + int px_arry_num, 
bool is_segmentation); diff --git a/yolov8/include/config.h b/yolov8/include/config.h index f124d5d9..6af9f1a9 100644 --- a/yolov8/include/config.h +++ b/yolov8/include/config.h @@ -2,9 +2,9 @@ //#define USE_FP32 //#define USE_INT8 -const static char *kInputTensorName = "images"; -const static char *kOutputTensorName = "output"; -const static int kNumClass = 80; +const static char* kInputTensorName = "images"; +const static char* kOutputTensorName = "output"; +const static int kNumClass = 10; const static int kBatchSize = 1; const static int kGpuId = 0; const static int kInputH = 640; @@ -14,7 +14,6 @@ const static float kConfThresh = 0.5f; const static int kMaxInputImageSize = 3000 * 3000; const static int kMaxNumOutputBbox = 1000; - // Classfication model's number of classes constexpr static int kClsNumClass = 1000; // Classfication model's input shape diff --git a/yolov8/include/model.h b/yolov8/include/model.h index c8b30e98..0a2a40a3 100644 --- a/yolov8/include/model.h +++ b/yolov8/include/model.h @@ -11,6 +11,10 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); +nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, + int& max_channels); + nvinfer1::IHostMemory* buildEngineYolov8Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw); diff --git a/yolov8/plugin/yololayer.cu b/yolov8/plugin/yololayer.cu index bdc073cc..9b786de7 100755 --- a/yolov8/plugin/yololayer.cu +++ b/yolov8/plugin/yololayer.cu @@ -1,45 +1,58 @@ -#include "yololayer.h" -#include "types.h" #include #include -#include "cuda_utils.h" -#include #include +#include +#include "cuda_utils.h" +#include "types.h" +#include "yololayer.h" namespace Tn { - template 
- void write(char*& buffer, const T& val) { - *reinterpret_cast(buffer) = val; - buffer += sizeof(T); - } +template +void write(char*& buffer, const T& val) { + *reinterpret_cast(buffer) = val; + buffer += sizeof(T); +} - template - void read(const char*& buffer, T& val) { - val = *reinterpret_cast(buffer); - buffer += sizeof(T); - } +template +void read(const char*& buffer, T& val) { + val = *reinterpret_cast(buffer); + buffer += sizeof(T); +} } // namespace Tn - namespace nvinfer1 { -YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation) { +YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, + const int* strides, int stridesLength) { mClassCount = classCount; mYoloV8NetWidth = netWidth; mYoloV8netHeight = netHeight; mMaxOutObject = maxOut; + mStridesLength = stridesLength; + mStrides = new int[stridesLength]; + memcpy(mStrides, strides, stridesLength * sizeof(int)); is_segmentation_ = is_segmentation; } -YoloLayerPlugin::~YoloLayerPlugin() {} +YoloLayerPlugin::~YoloLayerPlugin() { + if (mStrides != nullptr) { + delete[] mStrides; + mStrides = nullptr; + } +} YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; - const char* d = reinterpret_cast(data), * a = d; + const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mThreadCount); read(d, mYoloV8NetWidth); read(d, mYoloV8netHeight); read(d, mMaxOutObject); + read(d, mStridesLength); + mStrides = new int[mStridesLength]; + for (int i = 0; i < mStridesLength; ++i) { + read(d, mStrides[i]); + } read(d, is_segmentation_); assert(d == a + length); @@ -48,26 +61,32 @@ YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; - char* d = static_cast(buffer), * a = d; + char *d = static_cast(buffer), *a = d; write(d, mClassCount); write(d, mThreadCount); 
write(d, mYoloV8NetWidth); write(d, mYoloV8netHeight); write(d, mMaxOutObject); + write(d, mStridesLength); + for (int i = 0; i < mStridesLength; ++i) { + write(d, mStrides[i]); + } write(d, is_segmentation_); assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { - return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(is_segmentation_); + return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + + sizeof(mMaxOutObject) + sizeof(mStridesLength) + sizeof(int) * mStridesLength + sizeof(is_segmentation_); } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { return 0; } -nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT { +nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nbInputDims) TRT_NOEXCEPT { int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float); return nvinfer1::Dims3(total_size + 1, 1, 1); } @@ -80,11 +99,13 @@ const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } -nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { +nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT { return nvinfer1::DataType::kFLOAT; } -bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { +bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, + int nbInputs) const TRT_NOEXCEPT { return false; } @@ -94,9 +115,11 @@ bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOE return false; } -void 
YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {}; +void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, + nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT{}; -void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}; +void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, + IGpuAllocator* gpuAllocator) TRT_NOEXCEPT{}; void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} @@ -116,28 +139,33 @@ void YoloLayerPlugin::destroy() TRT_NOEXCEPT { nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { - YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject, is_segmentation_); + YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject, + is_segmentation_, mStrides, mStridesLength); p->setPluginNamespace(mPluginNamespace); return p; } -int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { +int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs, + void* workspace, cudaStream_t stream) TRT_NOEXCEPT { forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize); return 0; } +__device__ float Logist(float data) { + return 1.0f / (1.0f + expf(-data)); +}; -__device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }; - -__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, - const int grid_h, int grid_w, const int stride, int classes, int outputElem, bool is_segmentation) { +__global__ void CalDetection(const float* input, float* 
output, int numElements, int maxoutobject, const int grid_h, + int grid_w, const int stride, int classes, int outputElem, bool is_segmentation) { int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx >= numElements) return; + if (idx >= numElements) + return; int total_grid = grid_h * grid_w; int info_len = 4 + classes; - if (is_segmentation) info_len += 32; + if (is_segmentation) + info_len += 32; int batchIdx = idx / total_grid; int elemIdx = idx % total_grid; const float* curInput = input + batchIdx * total_grid * info_len; @@ -153,10 +181,12 @@ __global__ void CalDetection(const float* input, float* output, int numElements, } } - if (max_cls_prob < 0.1) return; + if (max_cls_prob < 0.1) + return; int count = (int)atomicAdd(output + outputIdx, 1); - if (count >= maxoutobject) return; + if (count >= maxoutobject) + return; char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection); Detection* det = (Detection*)(data); @@ -175,24 +205,33 @@ __global__ void CalDetection(const float* input, float* output, int numElements, } } -void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,int mYoloV8NetWidth, int batchSize) { +void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, + int mYoloV8NetWidth, int batchSize) { int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float); cudaMemsetAsync(output, 0, sizeof(float), stream); for (int idx = 0; idx < batchSize; ++idx) { CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream)); } int numElem = 0; - int grids[3][2] = { {mYoloV8netHeight / 8, mYoloV8NetWidth / 8}, {mYoloV8netHeight / 16, mYoloV8NetWidth / 16}, {mYoloV8netHeight / 32, mYoloV8NetWidth / 32} }; - int strides[] = { 8, 16, 32 }; - for (unsigned int i = 0; i < 3; i++) { + + const int maxGrids = mStridesLength; + int grids[maxGrids][2]; + for (int i = 0; i < maxGrids; ++i) { 
+ grids[i][0] = mYoloV8netHeight / mStrides[i]; + grids[i][1] = mYoloV8NetWidth / mStrides[i]; + } + + for (unsigned int i = 0; i < maxGrids; i++) { int grid_h = grids[i][0]; int grid_w = grids[i][1]; - int stride = strides[i]; + int stride = mStrides[i]; numElem = grid_h * grid_w * batchSize; - if (numElem < mThreadCount) mThreadCount = numElem; + if (numElem < mThreadCount) + mThreadCount = numElem; - CalDetection << <(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream >> > - (inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, outputElem, is_segmentation_); + CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>( + inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, outputElem, + is_segmentation_); } } @@ -219,19 +258,24 @@ const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT { IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { assert(fc->nbFields == 1); - assert(strcmp(fc->fields[0].name, "netinfo") == 0); - int* p_netinfo = (int*)(fc->fields[0].data); - int class_count = p_netinfo[0]; - int input_w = p_netinfo[1]; - int input_h = p_netinfo[2]; - int max_output_object_count = p_netinfo[3]; - bool is_segmentation = p_netinfo[4]; - YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, is_segmentation); + assert(strcmp(fc->fields[0].name, "combinedInfo") == 0); + const int* combinedInfo = static_cast(fc->fields[0].data); + int netinfo_count = 5; + int class_count = combinedInfo[0]; + int input_w = combinedInfo[1]; + int input_h = combinedInfo[2]; + int max_output_object_count = combinedInfo[3]; + bool is_segmentation = combinedInfo[4]; + const int* px_arry = combinedInfo + netinfo_count; + int px_arry_length = fc->fields[0].length - netinfo_count; + YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, 
max_output_object_count, is_segmentation, + px_arry, px_arry_length); obj->setPluginNamespace(mNamespace.c_str()); return obj; } -IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { +IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, + size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call YoloLayerPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); @@ -239,4 +283,4 @@ IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const voi return obj; } -} // namespace nvinfer1 +} // namespace nvinfer1 diff --git a/yolov8/plugin/yololayer.h b/yolov8/plugin/yololayer.h index 514c1f12..9496e7fc 100644 --- a/yolov8/plugin/yololayer.h +++ b/yolov8/plugin/yololayer.h @@ -1,102 +1,106 @@ #pragma once -#include "macros.h" -#include "NvInfer.h" #include #include +#include "NvInfer.h" #include "macros.h" namespace nvinfer1 { class API YoloLayerPlugin : public IPluginV2IOExt { -public: - YoloLayerPlugin(int classCount, int netWdith, int netHeight, int maxOut, bool is_segmentation); - YoloLayerPlugin(const void* data, size_t length); - ~YoloLayerPlugin(); + public: + YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, const int* strides, + int stridesLength); - int getNbOutputs() const TRT_NOEXCEPT override { - return 1; - } + YoloLayerPlugin(const void* data, size_t length); + ~YoloLayerPlugin(); - nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } - int initialize() TRT_NOEXCEPT override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; - virtual void terminate() TRT_NOEXCEPT override {} + int initialize() 
TRT_NOEXCEPT override; - virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } + virtual void terminate() TRT_NOEXCEPT override {} - virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; + virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } - virtual size_t getSerializationSize() const TRT_NOEXCEPT override; + virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; - virtual void serialize(void* buffer) const TRT_NOEXCEPT override; + virtual size_t getSerializationSize() const TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { - return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; - } + virtual void serialize(void* buffer) const TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) const TRT_NOEXCEPT override { + return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; + } - const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginType() const TRT_NOEXCEPT override; - const char* getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - void destroy() TRT_NOEXCEPT override; + void destroy() TRT_NOEXCEPT override; - IPluginV2IOExt* clone() const TRT_NOEXCEPT override; + IPluginV2IOExt* clone() const TRT_NOEXCEPT override; - void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; + void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; - const char* getPluginNamespace() const TRT_NOEXCEPT override; + const char* 
getPluginNamespace() const TRT_NOEXCEPT override; - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const TRT_NOEXCEPT; + nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, + int32_t nbInputs) const TRT_NOEXCEPT; - bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; + bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, + int nbInputs) const TRT_NOEXCEPT override; - bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; + bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; - void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; + void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, + IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; - void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, int32_t nbOutput) TRT_NOEXCEPT override; + void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, + int32_t nbOutput) TRT_NOEXCEPT override; - void detachFromContext() TRT_NOEXCEPT override; + void detachFromContext() TRT_NOEXCEPT override; - private: - void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, int mYoloV8NetWidth, int batchSize); - int mThreadCount = 256; - const char* mPluginNamespace; - int mClassCount; - int mYoloV8NetWidth; - int mYoloV8netHeight; - int mMaxOutObject; - bool is_segmentation_; - }; + private: + void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, + int mYoloV8NetWidth, int batchSize); + int mThreadCount = 256; + const char* mPluginNamespace; + int mClassCount; + int mYoloV8NetWidth; + int mYoloV8netHeight; + int 
mMaxOutObject; + bool is_segmentation_; + int* mStrides; + int mStridesLength; +}; class API YoloPluginCreator : public IPluginCreator { -public: - YoloPluginCreator(); - ~YoloPluginCreator() override = default; - - const char* getPluginName() const TRT_NOEXCEPT override; + public: + YoloPluginCreator(); + ~YoloPluginCreator() override = default; - const char* getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; - const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2IOExt* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; - nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; + nvinfer1::IPluginV2IOExt* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; - void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { - mNamespace = libNamespace; - } + nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; - const char* getPluginNamespace() const TRT_NOEXCEPT override { - return mNamespace.c_str(); - } + void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } - private: - std::string mNamespace; - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - }; - REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); -} // namespace nvinfer1 + const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } + private: + std::string mNamespace; + static PluginFieldCollection mFC; + static std::vector mPluginAttributes; +}; +REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); +} // namespace nvinfer1 diff 
--git a/yolov8/src/block.cpp b/yolov8/src/block.cpp index 96cf4e12..713f86ef 100644 --- a/yolov8/src/block.cpp +++ b/yolov8/src/block.cpp @@ -219,23 +219,47 @@ nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map dets, bool is_segmentation) { + std::vector dets, const int* px_arry, + int px_arry_num, bool is_segmentation) { auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); - - nvinfer1::PluginField plugin_fields[1]; - int netinfo[5] = {kNumClass, kInputW, kInputH, kMaxNumOutputBbox, is_segmentation}; - plugin_fields[0].data = netinfo; - plugin_fields[0].length = 5; - plugin_fields[0].name = "netinfo"; - plugin_fields[0].type = nvinfer1::PluginFieldType::kFLOAT32; - nvinfer1::PluginFieldCollection plugin_data; - plugin_data.nbFields = 1; - plugin_data.fields = plugin_fields; - nvinfer1::IPluginV2* plugin_obj = creator->createPlugin("yololayer", &plugin_data); - std::vector input_tensors; + const int netinfo_count = 5; // Assuming the first 5 elements are for netinfo as per existing code. + const int total_count = netinfo_count + px_arry_num; // Total number of elements for netinfo and px_arry combined. + + std::vector combinedInfo(total_count); + // Fill in the first 5 elements as per existing netinfo. + combinedInfo[0] = kNumClass; + combinedInfo[1] = kInputW; + combinedInfo[2] = kInputH; + combinedInfo[3] = kMaxNumOutputBbox; + combinedInfo[4] = is_segmentation; + + // Copy the contents of px_arry into the combinedInfo vector after the initial 5 elements. + std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count); + + // Now let's create the PluginField object to hold this combined information. 
+ nvinfer1::PluginField pluginField; + pluginField.name = "combinedInfo"; // This can be any name that the plugin will recognize + pluginField.data = combinedInfo.data(); + pluginField.type = nvinfer1::PluginFieldType::kINT32; + pluginField.length = combinedInfo.size(); + + // Create the PluginFieldCollection to hold the PluginField object. + nvinfer1::PluginFieldCollection pluginFieldCollection; + pluginFieldCollection.nbFields = 1; // We have just one field, but it's a combined array + pluginFieldCollection.fields = &pluginField; + + // Create the plugin object using the PluginFieldCollection. + nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection); + + // We assume that the plugin is to be added onto the network. + // Prepare input tensors for the YOLO Layer. + std::vector inputTensors; for (auto det : dets) { - input_tensors.push_back(det->getOutput(0)); + inputTensors.push_back(det->getOutput(0)); // Assuming each IConcatenationLayer has one output tensor. } - auto yolo = network->addPluginV2(&input_tensors[0], input_tensors.size(), *plugin_obj); - return yolo; + + // Add the plugin to the network using the prepared input tensors. + nvinfer1::IPluginV2Layer* yoloLayer = network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject); + + return yoloLayer; // Return the added YOLO layer. 
} diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index e0e7f088..9cfc8dd5 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -106,7 +106,6 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); - /******************************************************************************************************* ********************************************* YOLOV8 HEAD ******************************************** *******************************************************************************************************/ @@ -118,6 +117,7 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer nvinfer1::ITensor* inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat11 = network->addConcatenation(inputTensor11, 2); + nvinfer1::IElementWiseLayer* conv12 = C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); @@ -220,48 +220,54 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ + int strides[] = {8, 16, 32}; + int stridesLength = sizeof(strides) / sizeof(int); + nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); - shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 8) * (kInputW / 8)}); - - nvinfer1::ISliceLayer* split22_0_0 = - network->addSlice(*shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* 
split22_0_1 = - network->addSlice(*shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl22_0 = DFL(network, weightMap, *split22_0_0->getOutput(0), 4, - (kInputH / 8) * (kInputW / 8), 1, 1, 0, "model.22.dfl.conv.weight"); + shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer* split22_0_0 = network->addSlice( + *shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split22_0_1 = network->addSlice( + *shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_0 = + DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.22.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 2); nvinfer1::IShuffleLayer* shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); - shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 16) * (kInputW / 16)}); - nvinfer1::ISliceLayer* split22_1_0 = - network->addSlice(*shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split22_1_1 = - network->addSlice(*shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl22_1 = DFL(network, weightMap, *split22_1_0->getOutput(0), 4, - (kInputH / 16) * (kInputW / 16), 1, 1, 0, "model.22.dfl.conv.weight"); + 
shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer* split22_1_0 = network->addSlice( + *shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split22_1_1 = network->addSlice( + *shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_1 = + DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.22.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 2); nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); - shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 32) * (kInputW / 32)}); - nvinfer1::ISliceLayer* split22_2_0 = - network->addSlice(*shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split22_2_1 = - network->addSlice(*shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl22_2 = DFL(network, weightMap, *split22_2_0->getOutput(0), 4, - (kInputH / 32) * (kInputW / 32), 1, 1, 0, "model.22.dfl.conv.weight"); + shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer* split22_2_0 = network->addSlice( + *shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + 
nvinfer1::ISliceLayer* split22_2_1 = network->addSlice( + *shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_2 = + DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.22.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 2); nvinfer1::IPluginV2Layer* yolo = - addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}); + addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, + strides, stridesLength, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); @@ -295,7 +301,21 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); + for (const auto& kv : weightMap) { + if (kv.first.find("conv.weight") != std::string::npos || + kv.first.find("linear.weight") != std::string::npos) { // check for conv.weight or linear.weight + std::cout << "Weight name: " << kv.first << ", "; + std::cout << "Count: " << kv.second.count << ", "; + std::cout << "Type: " + << (kv.second.type == nvinfer1::DataType::kFLOAT ? "FLOAT" + : kv.second.type == nvinfer1::DataType::kHALF ? 
"HALF" + : "INT8") + << std::endl; + } + } + nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); + std::cout << "gd: " << gd << ", gw: " << gw << std::endl; /******************************************************************************************************* ****************************************** YOLOV8 INPUT ********************************************** *******************************************************************************************************/ @@ -321,15 +341,18 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf // 22466 nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(768, gw, max_channels), 3, 2, 1, "model.7"); nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(768, gw, max_channels), get_width(768, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + nvinfer1::IElementWiseLayer* conv9 = convBnSiLU(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.9"); nvinfer1::IElementWiseLayer* conv10 = C2F(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.10"); + nvinfer1::IElementWiseLayer* conv11 = SPPF(network, weightMap, *conv10->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.11"); @@ -413,9 +436,12 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf network->addConvolutionNd(*conv30_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv2.0.2.weight"], weightMap["model.30.cv2.0.2.bias"]); conv30_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + 
conv30_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv30_cv3_0_0 = convBnSiLU(network, weightMap, *conv20->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.0"); + nvinfer1::IElementWiseLayer* conv30_cv3_0_1 = convBnSiLU(network, weightMap, *conv30_cv3_0_0->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.1"); nvinfer1::IConvolutionLayer* conv30_cv3_0_2 = @@ -495,67 +521,391 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf /******************************************************************************************************* ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ + int strides[] = {8, 16, 32, 64}; + int stridesLength = sizeof(strides) / sizeof(int); + // P3 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle30_0 = network->addShuffle(*cat30_0->getOutput(0)); // Reusing the previous cat30_0 as P3 concatenation layer - shuffle30_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 8) * (kInputW / 8)}); - nvinfer1::ISliceLayer* split30_0_0 = - network->addSlice(*shuffle30_0->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split30_0_1 = - network->addSlice(*shuffle30_0->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl30_0 = DFL(network, weightMap, *split30_0_0->getOutput(0), 4, - (kInputH / 8) * (kInputW / 8), 1, 1, 0, "model.30.dfl.conv.weight"); + shuffle30_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer* split30_0_0 = network->addSlice( + *shuffle30_0->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, 
(kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_0_1 = network->addSlice( + *shuffle30_0->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_0 = + DFL(network, weightMap, *split30_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.30.dfl.conv.weight"); nvinfer1::ITensor* inputTensor30_dfl_0[] = {dfl30_0->getOutput(0), split30_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_0 = network->addConcatenation(inputTensor30_dfl_0, 2); // P4 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle30_1 = network->addShuffle(*cat30_1->getOutput(0)); // Reusing the previous cat30_1 as P4 concatenation layer - shuffle30_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 16) * (kInputW / 16)}); - nvinfer1::ISliceLayer* split30_1_0 = - network->addSlice(*shuffle30_1->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split30_1_1 = - network->addSlice(*shuffle30_1->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl30_1 = DFL(network, weightMap, *split30_1_0->getOutput(0), 4, - (kInputH / 16) * (kInputW / 16), 1, 1, 0, "model.30.dfl.conv.weight"); + shuffle30_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer* split30_1_0 = network->addSlice( + *shuffle30_1->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_1_1 = network->addSlice( + *shuffle30_1->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * 
(kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_1 = + DFL(network, weightMap, *split30_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.30.dfl.conv.weight"); nvinfer1::ITensor* inputTensor30_dfl_1[] = {dfl30_1->getOutput(0), split30_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_1 = network->addConcatenation(inputTensor30_dfl_1, 2); // P5 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle30_2 = network->addShuffle(*cat30_2->getOutput(0)); // Reusing the previous cat30_2 as P5 concatenation layer - shuffle30_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 32) * (kInputW / 32)}); - nvinfer1::ISliceLayer* split30_2_0 = - network->addSlice(*shuffle30_2->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split30_2_1 = - network->addSlice(*shuffle30_2->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl30_2 = DFL(network, weightMap, *split30_2_0->getOutput(0), 4, - (kInputH / 32) * (kInputW / 32), 1, 1, 0, "model.30.dfl.conv.weight"); + shuffle30_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer* split30_2_0 = network->addSlice( + *shuffle30_2->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_2_1 = network->addSlice( + *shuffle30_2->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_2 = + DFL(network, weightMap, *split30_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.30.dfl.conv.weight"); 
nvinfer1::ITensor* inputTensor30_dfl_2[] = {dfl30_2->getOutput(0), split30_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_2 = network->addConcatenation(inputTensor30_dfl_2, 2); // P6 processing steps nvinfer1::IShuffleLayer* shuffle30_3 = network->addShuffle(*cat30_3->getOutput(0)); - shuffle30_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 64) * (kInputW / 64)}); - nvinfer1::ISliceLayer* split30_3_0 = - network->addSlice(*shuffle30_3->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 64) * (kInputW / 64)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split30_3_1 = - network->addSlice(*shuffle30_3->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 64) * (kInputW / 64)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl30_3 = DFL(network, weightMap, *split30_3_0->getOutput(0), 4, - (kInputH / 64) * (kInputW / 64), 1, 1, 0, "model.30.dfl.conv.weight"); + shuffle30_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); + nvinfer1::ISliceLayer* split30_3_0 = network->addSlice( + *shuffle30_3->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_3_1 = network->addSlice( + *shuffle30_3->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_3 = + DFL(network, weightMap, *split30_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, + 1, 0, "model.30.dfl.conv.weight"); nvinfer1::ITensor* inputTensor30_dfl_3[] = {dfl30_3->getOutput(0), split30_3_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_3 = network->addConcatenation(inputTensor30_dfl_3, 2); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( - network, std::vector{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3}); + 
network, std::vector{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3}, + strides, stridesLength, false); + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + builder->setMaxBatchSize(kBatchSize); + config->setMaxWorkspaceSize(16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto* calibrator = + new Int8EntropyCalibrator2(1, kInputW, kInputH, "../coco_calib/", "int8calib.table", kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" << std::endl; + + delete network; + + for (auto& mem : weightMap) { + free((void*)(mem.second.values)); + } + return serialized_model; +} + +nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, + int& max_channels) { + + std::cout << "buildEngineYolov8DetP2 " << std::endl; + + std::map weightMap = loadWeights(wts_path); + nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); + + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); + assert(data); + + 
/******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer* conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer* conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + // 11233 + nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer* conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + // 22466 + nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer* conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + // 22466 + nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + nvinfer1::IElementWiseLayer* conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); + // 11233 + nvinfer1::IElementWiseLayer* conv8 = + C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + nvinfer1::IElementWiseLayer* conv9 = + SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, 
max_channels), + get_width(1024, gw, max_channels), 5, "model.9"); + + /******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + // Head + float scale[] = {1.0, 2.0, 2.0}; // scale used for upsampling + + // P4 + nvinfer1::IResizeLayer* upsample10 = network->addResize( + *conv9->getOutput(0)); // Assuming conv9 is the last layer of the backbone as per P5 in your first section. + upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + upsample10->setScales(scale, 3); + nvinfer1::ITensor* concat11_inputs[] = { + upsample10->getOutput(0), + conv6->getOutput(0)}; // Assuming conv6 corresponds to "backbone P4" as per your pseudocode + nvinfer1::IConcatenationLayer* concat11 = network->addConcatenation(concat11_inputs, 2); + nvinfer1::IElementWiseLayer* conv12 = + C2F(network, weightMap, *concat11->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); + + // P3 + nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0)); + upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + upsample13->setScales(scale, 3); + nvinfer1::ITensor* concat14_inputs[] = {upsample13->getOutput(0), + conv4->getOutput(0)}; // Assuming conv4 corresponds to "backbone P3" + nvinfer1::IConcatenationLayer* concat14 = network->addConcatenation(concat14_inputs, 2); + nvinfer1::IElementWiseLayer* conv15 = + C2F(network, weightMap, *concat14->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); + + // P2 + nvinfer1::IResizeLayer* upsample16 = network->addResize(*conv15->getOutput(0)); + upsample16->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + 
upsample16->setScales(scale, 3); + nvinfer1::ITensor* concat17_inputs[] = {upsample16->getOutput(0), + conv2->getOutput(0)}; // Assuming conv2 corresponds to "backbone P2" + nvinfer1::IConcatenationLayer* concat17 = network->addConcatenation(concat17_inputs, 2); + nvinfer1::IElementWiseLayer* conv18 = + C2F(network, weightMap, *concat17->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); + + // Additional layers for P3, P4, P5 + // Downsample and concatenate for P3 + nvinfer1::IElementWiseLayer* conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), + get_width(128, gw, max_channels), 3, 2, 1, "model.19"); + nvinfer1::ITensor* concat20_inputs[] = { + conv19->getOutput(0), conv15->getOutput(0)}; // concatenate with higher-resolution feature map from P3 + nvinfer1::IConcatenationLayer* concat20 = network->addConcatenation(concat20_inputs, 2); + nvinfer1::IElementWiseLayer* conv21 = + C2F(network, weightMap, *concat20->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); + + // Downsample and concatenate for P4 + nvinfer1::IElementWiseLayer* conv22 = convBnSiLU(network, weightMap, *conv21->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.22"); + nvinfer1::ITensor* concat23_inputs[] = { + conv22->getOutput(0), conv12->getOutput(0)}; // concatenate with higher-resolution feature map from P4 + nvinfer1::IConcatenationLayer* concat23 = network->addConcatenation(concat23_inputs, 2); + nvinfer1::IElementWiseLayer* conv24 = + C2F(network, weightMap, *concat23->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.24"); + + // Downsample and concatenate for P5 + nvinfer1::IElementWiseLayer* conv25 = convBnSiLU(network, weightMap, *conv24->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.25"); + nvinfer1::ITensor* 
concat26_inputs[] = { + conv25->getOutput(0), conv9->getOutput(0)}; // concatenate with higher-resolution feature map from P5 + nvinfer1::IConcatenationLayer* concat26 = network->addConcatenation(concat26_inputs, 2); + nvinfer1::IElementWiseLayer* conv27 = + C2F(network, weightMap, *concat26->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.27"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ + int base_in_channel = 64; + int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(128, gw, max_channels); + + std::cout << "base_in_channel is : " << base_in_channel << std::endl; + std::cout << "base_out_channel is : " << base_out_channel << std::endl; + + // output0 + nvinfer1::IElementWiseLayer* conv28_cv2_0_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.0.0"); + nvinfer1::IElementWiseLayer* conv28_cv2_0_1 = + convBnSiLU(network, weightMap, *conv28_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.0.1"); + nvinfer1::IConvolutionLayer* conv28_cv2_0_2 = + network->addConvolutionNd(*conv28_cv2_0_1->getOutput(0), base_in_channel, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.0.2.weight"], weightMap["model.28.cv2.0.2.bias"]); + conv28_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv28_cv3_0_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.0.0"); + nvinfer1::IElementWiseLayer* conv28_cv3_0_1 = convBnSiLU(network, weightMap, *conv28_cv3_0_0->getOutput(0), + base_out_channel, 3, 1, 1, 
"model.28.cv3.0.1"); + nvinfer1::IConvolutionLayer* conv28_cv3_0_2 = + network->addConvolutionNd(*conv28_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.0.2.weight"], weightMap["model.28.cv3.0.2.bias"]); + conv28_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); + conv28_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor28_0[] = {conv28_cv2_0_2->getOutput(0), conv28_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_0 = network->addConcatenation(inputTensor28_0, 2); + + // output1 + nvinfer1::IElementWiseLayer* conv28_cv2_1_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.1.0"); + nvinfer1::IElementWiseLayer* conv28_cv2_1_1 = + convBnSiLU(network, weightMap, *conv28_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.1.1"); + nvinfer1::IConvolutionLayer* conv28_cv2_1_2 = + network->addConvolutionNd(*conv28_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.1.2.weight"], weightMap["model.28.cv2.1.2.bias"]); + conv28_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv28_cv3_1_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.1.0"); + nvinfer1::IElementWiseLayer* conv28_cv3_1_1 = convBnSiLU(network, weightMap, *conv28_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.28.cv3.1.1"); + nvinfer1::IConvolutionLayer* conv28_cv3_1_2 = + network->addConvolutionNd(*conv28_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.1.2.weight"], weightMap["model.28.cv3.1.2.bias"]); + conv28_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor28_1[] = {conv28_cv2_1_2->getOutput(0), conv28_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_1 = 
network->addConcatenation(inputTensor28_1, 2); + + // output2 + nvinfer1::IElementWiseLayer* conv28_cv2_2_0 = + convBnSiLU(network, weightMap, *conv24->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.2.0"); + nvinfer1::IElementWiseLayer* conv28_cv2_2_1 = + convBnSiLU(network, weightMap, *conv28_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.2.1"); + nvinfer1::IConvolutionLayer* conv28_cv2_2_2 = + network->addConvolution(*conv28_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.2.2.weight"], weightMap["model.28.cv2.2.2.bias"]); + conv28_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv28_cv3_2_0 = + convBnSiLU(network, weightMap, *conv24->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.2.0"); + nvinfer1::IElementWiseLayer* conv28_cv3_2_1 = convBnSiLU(network, weightMap, *conv28_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.28.cv3.2.1"); + nvinfer1::IConvolutionLayer* conv28_cv3_2_2 = + network->addConvolution(*conv28_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.2.2.weight"], weightMap["model.28.cv3.2.2.bias"]); + conv28_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor28_2[] = {conv28_cv2_2_2->getOutput(0), conv28_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_2 = network->addConcatenation(inputTensor28_2, 2); + + // output3 + nvinfer1::IElementWiseLayer* conv28_cv2_3_0 = + convBnSiLU(network, weightMap, *conv27->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.3.0"); + nvinfer1::IElementWiseLayer* conv28_cv2_3_1 = + convBnSiLU(network, weightMap, *conv28_cv2_3_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.3.1"); + nvinfer1::IConvolutionLayer* conv28_cv2_3_2 = + network->addConvolution(*conv28_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + 
weightMap["model.28.cv2.3.2.weight"], weightMap["model.28.cv2.3.2.bias"]); + conv28_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv28_cv3_3_0 = + convBnSiLU(network, weightMap, *conv27->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.3.0"); + nvinfer1::IElementWiseLayer* conv28_cv3_3_1 = convBnSiLU(network, weightMap, *conv28_cv3_3_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.28.cv3.3.1"); + nvinfer1::IConvolutionLayer* conv28_cv3_3_2 = + network->addConvolution(*conv28_cv3_3_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.3.2.weight"], weightMap["model.28.cv3.3.2.bias"]); + conv28_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor28_3[] = {conv28_cv2_3_2->getOutput(0), conv28_cv3_3_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_3 = network->addConcatenation(inputTensor28_3, 2); + + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + + int strides[] = {4, 8, 16, 32}; + int stridesLength = sizeof(strides) / sizeof(int); + + // P2 processing steps (remains unchanged) + std::cout << "kNumClass is : " << kNumClass << std::endl; + std::cout << "kInputH is : " << kInputH << std::endl; + nvinfer1::IShuffleLayer* shuffle28_0 = network->addShuffle(*cat28_0->getOutput(0)); + shuffle28_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer* split28_0_0 = network->addSlice( + *shuffle28_0->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 
1}); + nvinfer1::ISliceLayer* split28_0_1 = network->addSlice( + *shuffle28_0->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl28_0 = + DFL(network, weightMap, *split28_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor* inputTensor28_dfl_0[] = {dfl28_0->getOutput(0), split28_0_1->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_dfl_0 = network->addConcatenation(inputTensor28_dfl_0, 2); + + // P3 processing steps (remains unchanged) + nvinfer1::IShuffleLayer* shuffle28_1 = network->addShuffle(*cat28_1->getOutput(0)); + shuffle28_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer* split28_1_0 = network->addSlice( + *shuffle28_1->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split28_1_1 = network->addSlice( + *shuffle28_1->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl28_1 = + DFL(network, weightMap, *split28_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor* inputTensor28_dfl_1[] = {dfl28_1->getOutput(0), split28_1_1->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_dfl_1 = network->addConcatenation(inputTensor28_dfl_1, 2); + + // P4 processing steps (remains unchanged) + nvinfer1::IShuffleLayer* shuffle28_2 = network->addShuffle(*cat28_2->getOutput(0)); + shuffle28_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer* split28_2_0 = network->addSlice( + *shuffle28_2->getOutput(0), 
nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split28_2_1 = network->addSlice( + *shuffle28_2->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl28_2 = + DFL(network, weightMap, *split28_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor* inputTensor28_dfl_2[] = {dfl28_2->getOutput(0), split28_2_1->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_dfl_2 = network->addConcatenation(inputTensor28_dfl_2, 2); + + // P5 processing steps + nvinfer1::IShuffleLayer* shuffle28_3 = network->addShuffle(*cat28_3->getOutput(0)); + shuffle28_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); + nvinfer1::ISliceLayer* split28_3_0 = network->addSlice( + *shuffle28_3->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split28_3_1 = network->addSlice( + *shuffle28_3->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl28_3 = + DFL(network, weightMap, *split28_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor* inputTensor28_dfl_3[] = {dfl28_3->getOutput(0), split28_3_1->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_dfl_3 = network->addConcatenation(inputTensor28_dfl_3, 2); + + nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( + network, std::vector{cat28_dfl_0, cat28_dfl_1, cat28_dfl_2, cat28_dfl_3}, + strides, stridesLength, false); + yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); @@ 
-820,39 +1170,44 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ - nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); - shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 8) * (kInputW / 8)}); + int strides[] = {8, 16, 32}; + int stridesLength = sizeof(strides) / sizeof(int); - nvinfer1::ISliceLayer* split22_0_0 = - network->addSlice(*shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split22_0_1 = - network->addSlice(*shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl22_0 = DFL(network, weightMap, *split22_0_0->getOutput(0), 4, - (kInputH / 8) * (kInputW / 8), 1, 1, 0, "model.22.dfl.conv.weight"); + nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); + shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer* split22_0_0 = network->addSlice( + *shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split22_0_1 = network->addSlice( + *shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_0 = + DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.22.dfl.conv.weight"); nvinfer1::IShuffleLayer* shuffle22_1 = 
network->addShuffle(*cat22_1->getOutput(0)); - shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 16) * (kInputW / 16)}); - nvinfer1::ISliceLayer* split22_1_0 = - network->addSlice(*shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split22_1_1 = - network->addSlice(*shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl22_1 = DFL(network, weightMap, *split22_1_0->getOutput(0), 4, - (kInputH / 16) * (kInputW / 16), 1, 1, 0, "model.22.dfl.conv.weight"); + shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer* split22_1_0 = network->addSlice( + *shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split22_1_1 = network->addSlice( + *shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_1 = + DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.22.dfl.conv.weight"); nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); - shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 32) * (kInputW / 32)}); - nvinfer1::ISliceLayer* split22_2_0 = - network->addSlice(*shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split22_2_1 = - network->addSlice(*shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); 
- nvinfer1::IShuffleLayer* dfl22_2 = DFL(network, weightMap, *split22_2_0->getOutput(0), 4, - (kInputH / 32) * (kInputW / 32), 1, 1, 0, "model.22.dfl.conv.weight"); + shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer* split22_2_0 = network->addSlice( + *shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split22_2_1 = network->addSlice( + *shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_2 = + DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.22.dfl.conv.weight"); // det0 auto proto_coef_0 = ProtoCoef(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", 6400, gw); @@ -872,8 +1227,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer proto_coef_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3); - nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( - network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, true); + nvinfer1::IPluginV2Layer* yolo = + addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, + strides, stridesLength, true); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); diff --git a/yolov8/yolov8_det.cpp b/yolov8/yolov8_det.cpp index 9fb55c87..b3fee0f9 100644 --- a/yolov8/yolov8_det.cpp +++ b/yolov8/yolov8_det.cpp @@ -13,14 +13,17 @@ Logger gLogger; using namespace nvinfer1; const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; -void serialize_engine(std::string& wts_name, std::string& engine_name, bool& is_p6, std::string& sub_type, float& gd, +void 
serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd, float& gw, int& max_channels) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); IHostMemory* serialized_engine = nullptr; - if (is_p6) { + if (is_p == 6) { + std::cout << "Subtype: " << is_p << std::endl; serialized_engine = buildEngineYolov8DetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } else if (is_p == 2) { + serialized_engine = buildEngineYolov8DetP2(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } else { serialized_engine = buildEngineYolov8Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } @@ -116,7 +119,7 @@ void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, flo CUDA_CHECK(cudaStreamSynchronize(stream)); } -bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, bool& is_p6, std::string& img_dir, +bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir, std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) { if (argc < 4) return false; @@ -149,7 +152,11 @@ bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, bo return false; } if (sub_type.size() == 2 && sub_type[1] == '6') { - is_p6 = true; + std::cout << "Subtype: " << sub_type[1] << std::endl; + is_p = 6; + } else if (sub_type.size() == 2 && sub_type[1] == '2') { + std::cout << "Subtype: " << sub_type[1] << std::endl; + is_p = 2; } } else if (std::string(argv[1]) == "-d" && argc == 5) { engine = std::string(argv[2]); @@ -169,14 +176,15 @@ int main(int argc, char** argv) { std::string sub_type = ""; std::string cuda_post_process = ""; int model_bboxes; - bool is_p6 = false; + int is_p = 0; float gd = 0.0f, gw = 0.0f; int max_channels = 0; - if (!parse_args(argc, argv, wts_name, engine_name, is_p6, 
img_dir, sub_type, cuda_post_process, gd, gw, + if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw, max_channels)) { std::cerr << "Arguments not right!" << std::endl; - std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6] // serialize model to plan file" + std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to " + "plan file" << std::endl; std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl; return -1; @@ -184,7 +192,7 @@ int main(int argc, char** argv) { // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { - serialize_engine(wts_name, engine_name, is_p6, sub_type, gd, gw, max_channels); + serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels); return 0; } From df11eab9056201646d729588f54f80f12bd50e46 Mon Sep 17 00:00:00 2001 From: linds Date: Wed, 17 Apr 2024 16:08:31 +0800 Subject: [PATCH 02/24] yolov8 p2 --- yolov8/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/yolov8/README.md b/yolov8/README.md index 6653e1f8..1ec527b3 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -21,11 +21,10 @@ The tensorrt code is derived from [xiaocao-tian/yolov8_tensorrt](https://github. Currently, we support yolov8 -- For yolov8 , download .pt from [https://github.com/ultralytics/assets/releases](https://github.com/ultralytics/assets/releases), then follow how-to-run in current page. -[README.md](..%2FREADME.md) +- For yolov8 , download .pt from https://github.com/ultralytics/assets/releases, then follow how-to-run in current page. ## Config -- Choose the model n/s/m/l/x/n6/s6/m6/l6/[README.md](..%2FREADME.md)x6 from command line arguments. +- Choose the model n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 from command line arguments. 
- Check more configs in [include/config.h](./include/config.h) ## How to Run, yolov8n as example From ee640a87e876e5f4b26d33ee2be2a7e0f24fae33 Mon Sep 17 00:00:00 2001 From: linds Date: Wed, 17 Apr 2024 16:09:54 +0800 Subject: [PATCH 03/24] yolov8 p2 --- yolov8/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yolov8/README.md b/yolov8/README.md index 1ec527b3..e77af8d5 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -21,7 +21,7 @@ The tensorrt code is derived from [xiaocao-tian/yolov8_tensorrt](https://github. Currently, we support yolov8 -- For yolov8 , download .pt from https://github.com/ultralytics/assets/releases, then follow how-to-run in current page. +- For yolov8 , download .pt from [https://github.com/ultralytics/assets/releases](https://github.com/ultralytics/assets/releases), then follow how-to-run in current page. ## Config - Choose the model n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 from command line arguments. From cea11955f9167845bd0d88c7c790849e2f3638d5 Mon Sep 17 00:00:00 2001 From: linds Date: Wed, 17 Apr 2024 16:10:57 +0800 Subject: [PATCH 04/24] yolov8 p2 --- yolov8/include/config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yolov8/include/config.h b/yolov8/include/config.h index 6af9f1a9..d283860d 100644 --- a/yolov8/include/config.h +++ b/yolov8/include/config.h @@ -4,7 +4,7 @@ const static char* kInputTensorName = "images"; const static char* kOutputTensorName = "output"; -const static int kNumClass = 10; +const static int kNumClass = 80; const static int kBatchSize = 1; const static int kGpuId = 0; const static int kInputH = 640; From 20ae8f4ea44083e0aa618c143a4adf3182d97fa1 Mon Sep 17 00:00:00 2001 From: lindsayshuo <932695342@qq.com> Date: Wed, 17 Apr 2024 17:19:04 +0800 Subject: [PATCH 05/24] Update yolov8_det.cpp --- yolov8/yolov8_det.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/yolov8/yolov8_det.cpp b/yolov8/yolov8_det.cpp index b3fee0f9..b5f478bb 100644 --- 
a/yolov8/yolov8_det.cpp +++ b/yolov8/yolov8_det.cpp @@ -20,7 +20,6 @@ void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p IHostMemory* serialized_engine = nullptr; if (is_p == 6) { - std::cout << "Subtype: " << is_p << std::endl; serialized_engine = buildEngineYolov8DetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } else if (is_p == 2) { serialized_engine = buildEngineYolov8DetP2(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); @@ -152,10 +151,8 @@ bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, in return false; } if (sub_type.size() == 2 && sub_type[1] == '6') { - std::cout << "Subtype: " << sub_type[1] << std::endl; is_p = 6; } else if (sub_type.size() == 2 && sub_type[1] == '2') { - std::cout << "Subtype: " << sub_type[1] << std::endl; is_p = 2; } } else if (std::string(argv[1]) == "-d" && argc == 5) { From 6ce06d5383f8163c9eff0a2eefd2e8bbac1033eb Mon Sep 17 00:00:00 2001 From: lindsayshuo <932695342@qq.com> Date: Wed, 17 Apr 2024 17:20:22 +0800 Subject: [PATCH 06/24] Update model.cpp --- yolov8/src/model.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 9cfc8dd5..56c5d6e7 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -622,9 +622,6 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { - - std::cout << "buildEngineYolov8DetP2 " << std::endl; - std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); From 246cb7509a37eeef84b26b35b5f63570b612e66f Mon Sep 17 00:00:00 2001 From: lindsayshuo <932695342@qq.com> Date: Wed, 17 Apr 2024 17:21:36 +0800 Subject: [PATCH 07/24] Update model.cpp --- 
yolov8/src/model.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 56c5d6e7..5e85a7a5 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -739,9 +739,6 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf int base_in_channel = 64; int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(128, gw, max_channels); - std::cout << "base_in_channel is : " << base_in_channel << std::endl; - std::cout << "base_out_channel is : " << base_out_channel << std::endl; - // output0 nvinfer1::IElementWiseLayer* conv28_cv2_0_0 = convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.0.0"); @@ -838,8 +835,6 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf int stridesLength = sizeof(strides) / sizeof(int); // P2 processing steps (remains unchanged) - std::cout << "kNumClass is : " << kNumClass << std::endl; - std::cout << "kInputH is : " << kInputH << std::endl; nvinfer1::IShuffleLayer* shuffle28_0 = network->addShuffle(*cat28_0->getOutput(0)); shuffle28_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split28_0_0 = network->addSlice( From ebba1f9396f9454cb3fe0b4365ed4d396587eb55 Mon Sep 17 00:00:00 2001 From: lindsayshuo <932695342@qq.com> Date: Wed, 17 Apr 2024 17:22:22 +0800 Subject: [PATCH 08/24] Update model.cpp --- yolov8/src/model.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 5e85a7a5..08a7b8fd 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -301,19 +301,6 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); - for (const auto& kv : weightMap) 
{ - if (kv.first.find("conv.weight") != std::string::npos || - kv.first.find("linear.weight") != std::string::npos) { // 检查 conv.weight 或 linear.weight - std::cout << "Weight name: " << kv.first << ", "; - std::cout << "Count: " << kv.second.count << ", "; - std::cout << "Type: " - << (kv.second.type == nvinfer1::DataType::kFLOAT ? "FLOAT" - : kv.second.type == nvinfer1::DataType::kHALF ? "HALF" - : "INT8") - << std::endl; - } - } - nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); std::cout << "gd: " << gd << ", gw: " << gw << std::endl; /******************************************************************************************************* From a149c50f3d957a790b5b41c7d037c72ba018e474 Mon Sep 17 00:00:00 2001 From: lindsayshuo <932695342@qq.com> Date: Wed, 17 Apr 2024 17:23:12 +0800 Subject: [PATCH 09/24] Update model.cpp --- yolov8/src/model.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 08a7b8fd..316b1301 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -302,7 +302,6 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf int& max_channels) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); - std::cout << "gd: " << gd << ", gw: " << gw << std::endl; /******************************************************************************************************* ****************************************** YOLOV8 INPUT ********************************************** *******************************************************************************************************/ From 917210a4f75e881850fdc726d556ef23f6a17b22 Mon Sep 17 00:00:00 2001 From: linds Date: Tue, 23 Apr 2024 10:08:22 +0800 Subject: [PATCH 10/24] fix strides --- yolov8/src/model.cpp | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 
316b1301..0b105c97 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -20,6 +20,15 @@ static int get_depth(int x, float gd) { return std::max(r, 1); } +void calculateStrides(nvinfer1::IElementWiseLayer* conv_layers[], int size, int reference_size, int strides[]) { + for (int i = 0; i < size; ++i) { + nvinfer1::ILayer* layer = conv_layers[i]; + nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); + int feature_map_size = dims.d[1]; + strides[i] = reference_size / feature_map_size; + } +} + static nvinfer1::IElementWiseLayer* Proto(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, std::string lname, float gw, int max_channels) { @@ -220,7 +229,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ - int strides[] = {8, 16, 32}; + nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); @@ -507,7 +518,9 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf /******************************************************************************************************* ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ - int strides[] = {8, 16, 32, 64}; + nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7, conv9}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + 
calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); // P3 processing steps (remains unchanged) @@ -817,7 +830,9 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ - int strides[] = {4, 8, 16, 32}; + nvinfer1::IElementWiseLayer* conv_layers[] = {conv1, conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); // P2 processing steps (remains unchanged) @@ -1148,7 +1163,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ - int strides[] = {8, 16, 32}; + nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); From cfb1d32b207bf6a460ccd0f609ea4cb769eefeb2 Mon Sep 17 00:00:00 2001 From: linds Date: Fri, 26 Apr 2024 17:20:43 +0800 Subject: [PATCH 11/24] yolov8 pose --- yolov8/CMakeLists.txt | 10 +- yolov8/README.md | 11 ++ yolov8/include/block.h | 2 +- yolov8/include/config.h | 2 + yolov8/include/model.h | 4 + yolov8/include/postprocess.h | 25 ++- yolov8/include/types.h | 14 +- yolov8/plugin/yololayer.cu | 92 +++++++--- 
yolov8/plugin/yololayer.h | 7 +- yolov8/src/block.cpp | 15 +- yolov8/src/model.cpp | 344 +++++++++++++++++++++++++++++++++++ yolov8/src/postprocess.cpp | 62 +++++++ yolov8/yolov8_pose.cpp | 276 ++++++++++++++++++++++++++++ 13 files changed, 815 insertions(+), 49 deletions(-) create mode 100644 yolov8/yolov8_pose.cpp diff --git a/yolov8/CMakeLists.txt b/yolov8/CMakeLists.txt index d5332a57..a702b341 100644 --- a/yolov8/CMakeLists.txt +++ b/yolov8/CMakeLists.txt @@ -27,8 +27,8 @@ else() # tensorrt include_directories(/home/lindsay/TensorRT-8.4.1.5/include) link_directories(/home/lindsay/TensorRT-8.4.1.5/lib) -# include_directories(/home/lindsay/TensorRT-7.2.3.4/include) -# link_directories(/home/lindsay/TensorRT-7.2.3.4/lib) + # include_directories(/home/lindsay/TensorRT-7.2.3.4/include) + # link_directories(/home/lindsay/TensorRT-7.2.3.4/lib) endif() @@ -51,5 +51,9 @@ target_link_libraries(yolov8_det ${OpenCV_LIBS}) add_executable(yolov8_seg ${PROJECT_SOURCE_DIR}/yolov8_seg.cpp ${SRCS}) target_link_libraries(yolov8_seg nvinfer cudart myplugins ${OpenCV_LIBS}) + +add_executable(yolov8_pose ${PROJECT_SOURCE_DIR}/yolov8_pose.cpp ${SRCS}) +target_link_libraries(yolov8_pose nvinfer cudart myplugins ${OpenCV_LIBS}) + add_executable(yolov8_cls ${PROJECT_SOURCE_DIR}/yolov8_cls.cpp ${SRCS}) -target_link_libraries(yolov8_cls nvinfer cudart myplugins ${OpenCV_LIBS}) \ No newline at end of file +target_link_libraries(yolov8_cls nvinfer cudart myplugins ${OpenCV_LIBS}) diff --git a/yolov8/README.md b/yolov8/README.md index dad23bcb..8f198c65 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -105,6 +105,17 @@ sudo ./yolov8_cls -s yolov8n-cls.wts yolov8-cls.engine n sudo ./yolov8_cls -d yolov8n-cls.engine ../samples ``` + +### Pose Estimation +``` +# Build and serialize TensorRT engine +./yolov8_pose -s yolov8n-pose.wts yolov8n-pose.engine n + +# Run inference with labels file +./yolov8_pose -d yolov8n-pose.engine ../images c +``` + + 4. 
optional, load and run the tensorrt model in python ``` diff --git a/yolov8/include/block.h b/yolov8/include/block.h index ed4c6ad1..650cacda 100644 --- a/yolov8/include/block.h +++ b/yolov8/include/block.h @@ -27,4 +27,4 @@ nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map dets, const int* px_arry, - int px_arry_num, bool is_segmentation); + int px_arry_num, bool is_segmentation, bool is_pose); diff --git a/yolov8/include/config.h b/yolov8/include/config.h index d283860d..66ef7e4c 100644 --- a/yolov8/include/config.h +++ b/yolov8/include/config.h @@ -5,12 +5,14 @@ const static char* kInputTensorName = "images"; const static char* kOutputTensorName = "output"; const static int kNumClass = 80; +const static int kNumberOfPoints = 17; // number of keypoints total const static int kBatchSize = 1; const static int kGpuId = 0; const static int kInputH = 640; const static int kInputW = 640; const static float kNmsThresh = 0.45f; const static float kConfThresh = 0.5f; +const static float kConfThreshKeypoints = 0.5f; // keypoints confidence const static int kMaxInputImageSize = 3000 * 3000; const static int kMaxNumOutputBbox = 1000; diff --git a/yolov8/include/model.h b/yolov8/include/model.h index 0a2a40a3..6546aa54 100644 --- a/yolov8/include/model.h +++ b/yolov8/include/model.h @@ -21,3 +21,7 @@ nvinfer1::IHostMemory* buildEngineYolov8Cls(nvinfer1::IBuilder* builder, nvinfer nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); + +nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, + int& max_channels); diff --git a/yolov8/include/postprocess.h b/yolov8/include/postprocess.h index c6c8b92a..eb18d542 100644 --- a/yolov8/include/postprocess.h +++ 
b/yolov8/include/postprocess.h @@ -1,23 +1,30 @@ #pragma once -#include "types.h" -#include "NvInfer.h" #include +#include "NvInfer.h" +#include "types.h" cv::Rect get_rect(cv::Mat& img, float bbox[4]); -void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5); +void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh = 0.5); + +void batch_nms(std::vector>& batch_res, float* output, int batch_size, int output_size, + float conf_thresh, float nms_thresh = 0.5); -void batch_nms(std::vector>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5); +void draw_bbox(std::vector& img_batch, std::vector>& res_batch); -void draw_bbox(std::vector &img_batch, std::vector> &res_batch); +void draw_bbox_keypoints_line(std::vector& img_batch, std::vector>& res_batch); -void batch_process(std::vector> &res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch); +void batch_process(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, + int bbox_element, const std::vector& img_batch); -void process_decode_ptr_host(std::vector &res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count); +void process_decode_ptr_host(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, + int count); -void cuda_decode(float* predict, int num_bboxes, float confidence_threshold,float* parray,int max_objects, cudaStream_t stream); +void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, + cudaStream_t stream); void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream); -void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, std::unordered_map& labels_map); +void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, + std::unordered_map& labels_map); diff --git a/yolov8/include/types.h 
b/yolov8/include/types.h index 1eac8f4b..472c7354 100644 --- a/yolov8/include/types.h +++ b/yolov8/include/types.h @@ -2,15 +2,17 @@ #include "config.h" struct alignas(float) Detection { - //center_x center_y w h - float bbox[4]; - float conf; // bbox_conf * cls_conf - float class_id; - float mask[32]; + //center_x center_y w h + float bbox[4]; + float conf; // bbox_conf * cls_conf + float class_id; + float mask[32]; + float keypoints[51]; // 17*3 keypoints }; struct AffineMatrix { float value[6]; }; -const int bbox_element = sizeof(AffineMatrix) / sizeof(float)+1; // left, top, right, bottom, confidence, class, keepflag +const int bbox_element = + sizeof(AffineMatrix) / sizeof(float) + 1; // left, top, right, bottom, confidence, class, keepflag diff --git a/yolov8/plugin/yololayer.cu b/yolov8/plugin/yololayer.cu index 9b786de7..592914fa 100755 --- a/yolov8/plugin/yololayer.cu +++ b/yolov8/plugin/yololayer.cu @@ -20,10 +20,18 @@ void read(const char*& buffer, T& val) { } } // namespace Tn +__device__ float sigmoid(float x) { + return 1.0f / (1.0f + exp(-x)); +} + namespace nvinfer1 { -YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, - const int* strides, int stridesLength) { +YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, + int netHeight, int maxOut, bool is_segmentation, bool is_pose, const int* strides, + int stridesLength) { + mClassCount = classCount; + mNumberofpoints = numberofpoints; + mConfthreshkeypoints = confthreshkeypoints; mYoloV8NetWidth = netWidth; mYoloV8netHeight = netHeight; mMaxOutObject = maxOut; @@ -31,6 +39,7 @@ YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, in mStrides = new int[stridesLength]; memcpy(mStrides, strides, stridesLength * sizeof(int)); is_segmentation_ = is_segmentation; + is_pose_ = is_pose; } YoloLayerPlugin::~YoloLayerPlugin() { @@ -44,6 +53,8 @@ 
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); + read(d, mNumberofpoints); + read(d, mConfthreshkeypoints); read(d, mThreadCount); read(d, mYoloV8NetWidth); read(d, mYoloV8netHeight); @@ -54,6 +65,7 @@ YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { read(d, mStrides[i]); } read(d, is_segmentation_); + read(d, is_pose_); assert(d == a + length); } @@ -63,6 +75,8 @@ void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; char *d = static_cast(buffer), *a = d; write(d, mClassCount); + write(d, mNumberofpoints); + write(d, mConfthreshkeypoints); write(d, mThreadCount); write(d, mYoloV8NetWidth); write(d, mYoloV8netHeight); @@ -72,13 +86,15 @@ void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { write(d, mStrides[i]); } write(d, is_segmentation_); + write(d, is_pose_); assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { - return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + - sizeof(mMaxOutObject) + sizeof(mStridesLength) + sizeof(int) * mStridesLength + sizeof(is_segmentation_); + return sizeof(mClassCount) + sizeof(mNumberofpoints) + sizeof(mConfthreshkeypoints) + sizeof(mThreadCount) + + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(mStridesLength) + + sizeof(int) * mStridesLength + sizeof(is_segmentation_) + sizeof(is_pose_); } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { @@ -133,14 +149,14 @@ const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { } void YoloLayerPlugin::destroy() TRT_NOEXCEPT { - delete this; } nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { - YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject, - is_segmentation_, mStrides, 
mStridesLength); + YoloLayerPlugin* p = + new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth, mYoloV8netHeight, + mMaxOutObject, is_segmentation_, is_pose_, mStrides, mStridesLength); p->setPluginNamespace(mPluginNamespace); return p; } @@ -157,15 +173,15 @@ __device__ float Logist(float data) { }; __global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h, - int grid_w, const int stride, int classes, int outputElem, bool is_segmentation) { + int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem, + bool is_segmentation, bool is_pose) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= numElements) return; + const int N_kpts = nk; int total_grid = grid_h * grid_w; - int info_len = 4 + classes; - if (is_segmentation) - info_len += 32; + int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0); int batchIdx = idx / total_grid; int elemIdx = idx % total_grid; const float* curInput = input + batchIdx * total_grid * info_len; @@ -200,8 +216,36 @@ __global__ void CalDetection(const float* input, float* output, int numElements, det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride; det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride; - for (int k = 0; is_segmentation && k < 32; k++) { - det->mask[k] = curInput[elemIdx + (k + 4 + classes) * total_grid]; + if (is_segmentation) { + for (int k = 0; k < 32; ++k) { + det->mask[k] = curInput[elemIdx + (4 + classes + k) * total_grid]; + } + } + + if (is_pose) { + for (int kpt = 0; kpt < N_kpts; kpt++) { + int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3) * total_grid; + int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3 + 1) * total_grid; + int kpt_conf_idx = (4 + classes + (is_segmentation ? 
32 : 0) + kpt * 3 + 2) * total_grid; + + float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]); + + float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride; + float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride; + + bool is_within_bbox = + kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3]; + + if (kpt_confidence < confkeypoints || !is_within_bbox) { + det->keypoints[kpt * 3] = -1; + det->keypoints[kpt * 3 + 1] = -1; + det->keypoints[kpt * 3 + 2] = -1; + } else { + det->keypoints[kpt * 3] = kpt_x; + det->keypoints[kpt * 3 + 1] = kpt_y; + det->keypoints[kpt * 3 + 2] = kpt_confidence; + } + } } } @@ -230,8 +274,8 @@ void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cuda mThreadCount = numElem; CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>( - inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, outputElem, - is_segmentation_); + inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, mNumberofpoints, + mConfthreshkeypoints, outputElem, is_segmentation_, is_pose_); } } @@ -260,16 +304,20 @@ IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFi assert(fc->nbFields == 1); assert(strcmp(fc->fields[0].name, "combinedInfo") == 0); const int* combinedInfo = static_cast(fc->fields[0].data); - int netinfo_count = 5; + int netinfo_count = 8; int class_count = combinedInfo[0]; - int input_w = combinedInfo[1]; - int input_h = combinedInfo[2]; - int max_output_object_count = combinedInfo[3]; - bool is_segmentation = combinedInfo[4]; + int numberofpoints = combinedInfo[1]; + float confthreshkeypoints = combinedInfo[2]; + int input_w = combinedInfo[3]; + int input_h = combinedInfo[4]; + int max_output_object_count = combinedInfo[5]; + bool is_segmentation = combinedInfo[6]; + bool is_pose = combinedInfo[7]; const int* px_arry = combinedInfo + netinfo_count; 
int px_arry_length = fc->fields[0].length - netinfo_count; - YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, is_segmentation, - px_arry, px_arry_length); + YoloLayerPlugin* obj = + new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h, + max_output_object_count, is_segmentation, is_pose, px_arry, px_arry_length); obj->setPluginNamespace(mNamespace.c_str()); return obj; } diff --git a/yolov8/plugin/yololayer.h b/yolov8/plugin/yololayer.h index 9496e7fc..b516ad87 100644 --- a/yolov8/plugin/yololayer.h +++ b/yolov8/plugin/yololayer.h @@ -6,8 +6,8 @@ namespace nvinfer1 { class API YoloLayerPlugin : public IPluginV2IOExt { public: - YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, const int* strides, - int stridesLength); + YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight, + int maxOut, bool is_segmentation, bool is_pose, const int* strides, int stridesLength); YoloLayerPlugin(const void* data, size_t length); ~YoloLayerPlugin(); @@ -68,10 +68,13 @@ class API YoloLayerPlugin : public IPluginV2IOExt { int mThreadCount = 256; const char* mPluginNamespace; int mClassCount; + int mNumberofpoints; + float mConfthreshkeypoints; int mYoloV8NetWidth; int mYoloV8netHeight; int mMaxOutObject; bool is_segmentation_; + bool is_pose_; int* mStrides; int mStridesLength; }; diff --git a/yolov8/src/block.cpp b/yolov8/src/block.cpp index 713f86ef..75d280e7 100644 --- a/yolov8/src/block.cpp +++ b/yolov8/src/block.cpp @@ -220,18 +220,21 @@ nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map dets, const int* px_arry, - int px_arry_num, bool is_segmentation) { + int px_arry_num, bool is_segmentation, bool is_pose) { auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); - const int netinfo_count = 5; // Assuming the first 5 elements are for netinfo as per 
existing code. + const int netinfo_count = 8;  // Assuming the first 8 elements are for netinfo (pose fields added). const int total_count = netinfo_count + px_arry_num; // Total number of elements for netinfo and px_arry combined. std::vector combinedInfo(total_count); // Fill in the first 5 elements as per existing netinfo. combinedInfo[0] = kNumClass; - combinedInfo[1] = kInputW; - combinedInfo[2] = kInputH; - combinedInfo[3] = kMaxNumOutputBbox; - combinedInfo[4] = is_segmentation; + combinedInfo[1] = kNumberOfPoints; + combinedInfo[2] = kConfThreshKeypoints; + combinedInfo[3] = kInputW; + combinedInfo[4] = kInputH; + combinedInfo[5] = kMaxNumOutputBbox; + combinedInfo[6] = is_segmentation; + combinedInfo[7] = is_pose; // Copy the contents of px_arry into the combinedInfo vector after the initial 5 elements. std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count); diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 0b105c97..2e67bacb 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -1257,3 +1257,347 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer } return serialized_model; } + +nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, + int& max_channels) { + std::map weightMap = loadWeights(wts_path); + nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); + + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); + assert(data); + + 
/******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer* conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + printLayerDims(conv0, "conv0"); + nvinfer1::IElementWiseLayer* conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + printLayerDims(conv1, "conv1"); + nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + printLayerDims(conv2, "conv2"); + nvinfer1::IElementWiseLayer* conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + printLayerDims(conv3, "conv3"); + nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + printLayerDims(conv4, "conv4"); + nvinfer1::IElementWiseLayer* conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + printLayerDims(conv5, "conv5"); + nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + printLayerDims(conv6, "conv6"); + nvinfer1::IElementWiseLayer* conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); + printLayerDims(conv7, "conv7"); + nvinfer1::IElementWiseLayer* conv8 = + C2F(network, weightMap, *conv7->getOutput(0), 
get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + printLayerDims(conv8, "conv8"); + nvinfer1::IElementWiseLayer* conv9 = + SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.9"); + printLayerDims(conv9, "conv9"); + /******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + float scale[] = {1.0, 2.0, 2.0}; + nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0)); + printLayerDims(upsample10, "upsample10"); + assert(upsample10); + upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + printLayerDims(upsample10, "upsample10"); + upsample10->setScales(scale, 3); + printLayerDims(upsample10, "upsample10"); + + nvinfer1::ITensor* inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; + printTensorsDims(inputTensor11, 2, "inputTensor11"); + nvinfer1::IConcatenationLayer* cat11 = network->addConcatenation(inputTensor11, 2); + printLayerDims(cat11, "cat11"); + nvinfer1::IElementWiseLayer* conv12 = + C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); + printLayerDims(conv12, "conv12"); + + nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0)); + printLayerDims(upsample13, "upsample13"); + assert(upsample13); + upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + printLayerDims(upsample13, "upsample13"); + upsample13->setScales(scale, 3); + printLayerDims(upsample13, "upsample13"); + + nvinfer1::ITensor* inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)}; + 
printTensorsDims(inputTensor14, 2, "inputTensor14"); + nvinfer1::IConcatenationLayer* cat14 = network->addConcatenation(inputTensor14, 2); + printLayerDims(cat14, "cat14"); + nvinfer1::IElementWiseLayer* conv15 = + C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); + printLayerDims(conv15, "conv15"); + nvinfer1::IElementWiseLayer* conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.16"); + printLayerDims(conv16, "conv16"); + nvinfer1::ITensor* inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)}; + printTensorsDims(inputTensor17, 2, "inputTensor17"); + nvinfer1::IConcatenationLayer* cat17 = network->addConcatenation(inputTensor17, 2); + printLayerDims(cat17, "cat17"); + nvinfer1::IElementWiseLayer* conv18 = + C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); + printLayerDims(conv18, "conv18"); + nvinfer1::IElementWiseLayer* conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.19"); + printLayerDims(conv19, "conv19"); + nvinfer1::ITensor* inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)}; + printTensorsDims(inputTensor20, 2, "inputTensor20"); + nvinfer1::IConcatenationLayer* cat20 = network->addConcatenation(inputTensor20, 2); + printLayerDims(cat20, "cat20"); + nvinfer1::IElementWiseLayer* conv21 = + C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); + printLayerDims(conv21, "conv21"); + printf("\n\n"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT 
****************************************** + *******************************************************************************************************/ + int base_in_channel = (gw == 1.25) ? 80 : 64; + int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); + + // output0 + nvinfer1::IElementWiseLayer* conv22_cv2_0_0 = + convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0"); + printLayerDims(conv22_cv2_0_0, "conv22_cv2_0_0"); + nvinfer1::IElementWiseLayer* conv22_cv2_0_1 = + convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1"); + printLayerDims(conv22_cv2_0_1, "conv22_cv2_0_1"); + nvinfer1::IConvolutionLayer* conv22_cv2_0_2 = + network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]); + printLayerDims(conv22_cv2_0_2, "conv22_cv2_0_2"); + conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + printLayerDims(conv22_cv2_0_2, "conv22_cv2_0_2"); + conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + printLayerDims(conv22_cv2_0_2, "conv22_cv2_0_2"); + nvinfer1::IElementWiseLayer* conv22_cv3_0_0 = + convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0"); + printLayerDims(conv22_cv3_0_0, "conv22_cv3_0_0"); + nvinfer1::IElementWiseLayer* conv22_cv3_0_1 = convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.0.1"); + printLayerDims(conv22_cv3_0_1, "conv22_cv3_0_1"); + nvinfer1::IConvolutionLayer* conv22_cv3_0_2 = + network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]); + printLayerDims(conv22_cv3_0_2, "conv22_cv3_0_2"); + conv22_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); + printLayerDims(conv22_cv3_0_2, 
"conv22_cv3_0_2"); + conv22_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); + printLayerDims(conv22_cv3_0_2, "conv22_cv3_0_2"); + nvinfer1::ITensor* inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)}; + printTensorsDims(inputTensor22_0, 2, "inputTensor22_0"); + nvinfer1::IConcatenationLayer* cat22_0 = network->addConcatenation(inputTensor22_0, 2); + printLayerDims(cat22_0, "cat22_0"); + printf("\n\n"); + + // output1 + nvinfer1::IElementWiseLayer* conv22_cv2_1_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0"); + printLayerDims(conv22_cv2_1_0, "conv22_cv2_1_0"); + nvinfer1::IElementWiseLayer* conv22_cv2_1_1 = + convBnSiLU(network, weightMap, *conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1"); + printLayerDims(conv22_cv2_1_1, "conv22_cv2_1_1"); + nvinfer1::IConvolutionLayer* conv22_cv2_1_2 = + network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.1.2.weight"], weightMap["model.22.cv2.1.2.bias"]); + printLayerDims(conv22_cv2_1_2, "conv22_cv2_1_2"); + conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + printLayerDims(conv22_cv2_1_2, "conv22_cv2_1_2"); + conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + printLayerDims(conv22_cv2_1_2, "conv22_cv2_1_2"); + nvinfer1::IElementWiseLayer* conv22_cv3_1_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0"); + printLayerDims(conv22_cv3_1_0, "conv22_cv3_1_0"); + nvinfer1::IElementWiseLayer* conv22_cv3_1_1 = convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.1.1"); + printLayerDims(conv22_cv3_1_1, "conv22_cv3_1_1"); + nvinfer1::IConvolutionLayer* conv22_cv3_1_2 = + network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]); + 
printLayerDims(conv22_cv3_1_2, "conv22_cv3_1_2"); + conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + printLayerDims(conv22_cv3_1_2, "conv22_cv3_1_2"); + conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + printLayerDims(conv22_cv3_1_2, "conv22_cv3_1_2"); + nvinfer1::ITensor* inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)}; + printTensorsDims(inputTensor22_1, 2, "inputTensor22_1"); + nvinfer1::IConcatenationLayer* cat22_1 = network->addConcatenation(inputTensor22_1, 2); + printLayerDims(cat22_1, "cat22_1"); + printf("\n\n"); + + // output2 + nvinfer1::IElementWiseLayer* conv22_cv2_2_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0"); + printLayerDims(conv22_cv2_2_0, "conv22_cv2_2_0"); + nvinfer1::IElementWiseLayer* conv22_cv2_2_1 = + convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1"); + printLayerDims(conv22_cv2_2_1, "conv22_cv2_2_1"); + nvinfer1::IConvolutionLayer* conv22_cv2_2_2 = + network->addConvolution(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.2.2.weight"], weightMap["model.22.cv2.2.2.bias"]); + printLayerDims(conv22_cv2_2_2, "conv22_cv2_2_2"); + nvinfer1::IElementWiseLayer* conv22_cv3_2_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.0"); + printLayerDims(conv22_cv3_2_0, "conv22_cv3_2_0"); + nvinfer1::IElementWiseLayer* conv22_cv3_2_1 = convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.2.1"); + printLayerDims(conv22_cv3_2_1, "conv22_cv3_2_1"); + nvinfer1::IConvolutionLayer* conv22_cv3_2_2 = + network->addConvolution(*conv22_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]); + printLayerDims(conv22_cv3_2_2, "conv22_cv3_2_2"); + nvinfer1::ITensor* inputTensor22_2[] = 
{conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)}; + printTensorsDims(inputTensor22_2, 2, "inputTensor22_2"); + nvinfer1::IConcatenationLayer* cat22_2 = network->addConcatenation(inputTensor22_2, 2); + printLayerDims(cat22_2, "cat22_2"); + printf("\n\n"); + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + + nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); + int stridesLength = sizeof(strides) / sizeof(int); + + /**************************************************************************************P3****************************************************************************************************************************************/ + nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); + printLayerDims(shuffle22_0, "shuffle22_0"); + shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + printLayerDims(shuffle22_0, "shuffle22_0->setReshapeDimensions"); + nvinfer1::ISliceLayer* split22_0_0 = network->addSlice( + *shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + printLayerDims(split22_0_0, "split22_0_0"); + nvinfer1::ISliceLayer* split22_0_1 = network->addSlice( + *shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + printLayerDims(split22_0_1, "split22_0_1"); + nvinfer1::IShuffleLayer* dfl22_0 = + DFL(network, weightMap, 
*split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.22.dfl.conv.weight"); + printLayerDims(dfl22_0, "dfl22_0"); + + // det0 + printLayerDims(conv15, "conv15"); + std::cout << "conv15->getOutput(0)->getDimensions().d[0] : " << conv15->getOutput(0)->getDimensions().d[0] + << " (kInputH / strides[0]) * (kInputW / strides[0]) : " + << (kInputH / strides[0]) * (kInputW / strides[0]) << std::endl; + auto shuffle_conv15 = cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", + (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose"); + + printLayerDims(shuffle_conv15, "shuffle_conv15"); + + nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0), + shuffle_conv15->getOutput(0)}; + printTensorsDims(inputTensor22_dfl_0, 3, "inputTensor22_dfl_0"); + nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 3); + printLayerDims(cat22_dfl_0, "cat22_dfl_0"); + printf("\n\n"); + + /********************************************************************************************P4**********************************************************************************************************************************/ + nvinfer1::IShuffleLayer* shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); + printLayerDims(shuffle22_1, "shuffle22_1"); + shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + printLayerDims(shuffle22_1, "shuffle22_1->setReshapeDimensions"); + nvinfer1::ISliceLayer* split22_1_0 = network->addSlice( + *shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + printLayerDims(split22_1_0, "split22_1_0"); + nvinfer1::ISliceLayer* split22_1_1 = network->addSlice( + *shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW 
/ strides[1])}, nvinfer1::Dims2{1, 1}); + printLayerDims(split22_1_1, "split22_1_1"); + nvinfer1::IShuffleLayer* dfl22_1 = + DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.22.dfl.conv.weight"); + printLayerDims(dfl22_1, "dfl22_1"); + + // det1 + auto shuffle_conv18 = cv4_conv_combined(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1", + (kInputH / strides[1]) * (kInputW / strides[1]), gw, "pose"); + + nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0), + shuffle_conv18->getOutput(0)}; + printTensorsDims(inputTensor22_dfl_1, 3, "inputTensor22_dfl_1"); + nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3); + printLayerDims(cat22_dfl_1, "cat22_dfl_1"); + printf("\n\n"); + + /********************************************************************************************P5**********************************************************************************************************************************/ + nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); + printLayerDims(shuffle22_2, "shuffle22_2"); + shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + printLayerDims(shuffle22_2, "shuffle22_2->setReshapeDimensions"); + nvinfer1::ISliceLayer* split22_2_0 = network->addSlice( + *shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + printLayerDims(split22_2_0, "split22_2_0"); + nvinfer1::ISliceLayer* split22_2_1 = network->addSlice( + *shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + printLayerDims(split22_2_1, "split22_2_1"); + nvinfer1::IShuffleLayer* dfl22_2 = + DFL(network, weightMap, 
*split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.22.dfl.conv.weight"); + printLayerDims(dfl22_2, "dfl22_2"); + + // det2 + auto shuffle_conv21 = cv4_conv_combined(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", + (kInputH / strides[2]) * (kInputW / strides[2]), gw, "pose"); + nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0), + shuffle_conv21->getOutput(0)}; + printTensorsDims(inputTensor22_dfl_2, 3, "inputTensor22_dfl_2"); + nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3); + printLayerDims(cat22_dfl_2, "cat22_dfl_2"); + printf("\n\n"); + + nvinfer1::IPluginV2Layer* yolo = + addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, + strides, stridesLength, false, true); + printLayerDims(yolo, "yolo"); + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + builder->setMaxBatchSize(kBatchSize); + config->setMaxWorkspaceSize(16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto* calibrator = + new Int8EntropyCalibrator2(1, kInputW, kInputH, "../coco_calib/", "int8calib.table", kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" 
<< std::endl; + + delete network; + + for (auto& mem : weightMap) { + free((void*)(mem.second.values)); + } + return serialized_model; +} diff --git a/yolov8/src/postprocess.cpp b/yolov8/src/postprocess.cpp index 574e15b5..25f86cd3 100644 --- a/yolov8/src/postprocess.cpp +++ b/yolov8/src/postprocess.cpp @@ -28,6 +28,34 @@ cv::Rect get_rect(cv::Mat& img, float bbox[4]) { return cv::Rect(round(l), round(t), round(r - l), round(b - t)); } +cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[51]) { + int l, r, t, b; + float r_w = kInputW / (img.cols * 1.0); + float r_h = kInputH / (img.rows * 1.0); + if (r_h > r_w) { + l = bbox[0] / r_w; + r = bbox[2] / r_w; + t = (bbox[1] - (kInputH - r_w * img.rows) / 2) / r_w; + b = (bbox[3] - (kInputH - r_w * img.rows) / 2) / r_w; + for (int i = 0; i < 51; i += 3) { + lmk[i] /= r_w; + lmk[i + 1] = (lmk[i + 1] - (kInputH - r_w * img.rows) / 2) / r_w; + // lmk[i + 2] + } + } else { + l = (bbox[0] - (kInputW - r_h * img.cols) / 2) / r_h; + r = (bbox[2] - (kInputW - r_h * img.cols) / 2) / r_h; + t = bbox[1] / r_h; + b = bbox[3] / r_h; + for (int i = 0; i < 51; i += 3) { + lmk[i] = (lmk[i] - (kInputW - r_h * img.cols) / 2) / r_h; + lmk[i + 1] /= r_h; + // lmk[i + 2] + } + } + return cv::Rect(l, t, r - l, b - t); +} + static float iou(float lbox[4], float rbox[4]) { float interBox[] = { (std::max)(lbox[0], rbox[0]), @@ -130,6 +158,40 @@ void draw_bbox(std::vector& img_batch, std::vector& img_batch, std::vector>& res_batch) { + const std::vector> skeleton_pairs = { + {0, 1}, {0, 2}, {0, 5}, {0, 6}, {1, 2}, {1, 3}, {2, 4}, {5, 6}, {5, 7}, {5, 11}, + {6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}}; + + for (size_t i = 0; i < img_batch.size(); i++) { + auto& res = res_batch[i]; + cv::Mat img = img_batch[i]; + for (size_t j = 0; j < res.size(); j++) { + cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].keypoints); + cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); + 
cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, + cv::Scalar(0xFF, 0xFF, 0xFF), 2); + + for (int k = 0; k < 51; k += 3) { + if (res[j].keypoints[k + 2] > 0.5) { + cv::circle(img, cv::Point((int)res[j].keypoints[k], (int)res[j].keypoints[k + 1]), 3, + cv::Scalar(0, 0x27, 0xC1), -1); + } + } + + for (const auto& bone : skeleton_pairs) { + int kp1_idx = bone.first * 3; + int kp2_idx = bone.second * 3; + if (res[j].keypoints[kp1_idx + 2] > 0.5 && res[j].keypoints[kp2_idx + 2] > 0.5) { + cv::Point p1((int)res[j].keypoints[kp1_idx], (int)res[j].keypoints[kp1_idx + 1]); + cv::Point p2((int)res[j].keypoints[kp2_idx], (int)res[j].keypoints[kp2_idx + 1]); + cv::line(img, p1, p2, cv::Scalar(0, 0x27, 0xC1), 2); + } + } + } + } +} + cv::Mat scale_mask(cv::Mat mask, cv::Mat img) { int x, y, w, h; float r_w = kInputW / (img.cols * 1.0); diff --git a/yolov8/yolov8_pose.cpp b/yolov8/yolov8_pose.cpp new file mode 100644 index 00000000..5b848a6b --- /dev/null +++ b/yolov8/yolov8_pose.cpp @@ -0,0 +1,276 @@ + +#include +#include +#include +#include "cuda_utils.h" +#include "logging.h" +#include "model.h" +#include "postprocess.h" +#include "preprocess.h" +#include "utils.h" + +Logger gLogger; +using namespace nvinfer1; +const int kOutputSize = kMaxNumOutputBbox * (sizeof(Detection) - sizeof(float) * 32) / sizeof(float) + 1; + +void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd, + float& gw, int& max_channels) { + IBuilder* builder = createInferBuilder(gLogger); + IBuilderConfig* config = builder->createBuilderConfig(); + IHostMemory* serialized_engine = nullptr; + + if (is_p == 6) { + serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } else if (is_p == 2) { + serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } else { + serialized_engine = 
buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } + + assert(serialized_engine); + std::ofstream p(engine_name, std::ios::binary); + if (!p) { + std::cout << "could not open plan output file" << std::endl; + assert(false); + } + p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); + + delete serialized_engine; + delete config; + delete builder; +} + +void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, + IExecutionContext** context) { + std::ifstream file(engine_name, std::ios::binary); + if (!file.good()) { + std::cerr << "read " << engine_name << " error!" << std::endl; + assert(false); + } + size_t size = 0; + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + char* serialized_engine = new char[size]; + assert(serialized_engine); + file.read(serialized_engine, size); + file.close(); + + *runtime = createInferRuntime(gLogger); + assert(*runtime); + *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); + assert(*engine); + *context = (*engine)->createExecutionContext(); + assert(*context); + delete[] serialized_engine; +} + +void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, + float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device, + std::string cuda_post_process) { + assert(engine->getNbBindings() == 2); + // In order to bind the buffers, we need to know the names of the input and output tensors. 
+ // Note that indices are guaranteed to be less than IEngine::getNbBindings() + const int inputIndex = engine->getBindingIndex(kInputTensorName); + const int outputIndex = engine->getBindingIndex(kOutputTensorName); + assert(inputIndex == 0); + assert(outputIndex == 1); + // Create GPU buffers on device + CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); + CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); + if (cuda_post_process == "c") { + *output_buffer_host = new float[kBatchSize * kOutputSize]; + } else if (cuda_post_process == "g") { + if (kBatchSize > 1) { + std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; + exit(0); + } + // Allocate memory for decode_ptr_host and copy to device + *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; + CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); + } +} + +void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, + float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { + // infer on the batch asynchronously, and DMA output back to host + auto start = std::chrono::system_clock::now(); + context.enqueue(batchsize, buffers, stream, nullptr); + if (cuda_post_process == "c") { + CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, + stream)); + auto end = std::chrono::system_clock::now(); + std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() + << "ms" << std::endl; + } else if (cuda_post_process == "g") { + CUDA_CHECK( + cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); + cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); + 
cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms + CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, + sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, + stream)); + auto end = std::chrono::system_clock::now(); + std::cout << "inference and gpu postprocess time: " + << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; + } + + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir, + std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) { + if (argc < 4) + return false; + if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) { + wts = std::string(argv[2]); + engine = std::string(argv[3]); + auto sub_type = std::string(argv[4]); + + if (sub_type[0] == 'n') { + gd = 0.33; + gw = 0.25; + max_channels = 1024; + } else if (sub_type[0] == 's') { + gd = 0.33; + gw = 0.50; + max_channels = 1024; + } else if (sub_type[0] == 'm') { + gd = 0.67; + gw = 0.75; + max_channels = 576; + } else if (sub_type[0] == 'l') { + gd = 1.0; + gw = 1.0; + max_channels = 512; + } else if (sub_type[0] == 'x') { + gd = 1.0; + gw = 1.25; + max_channels = 640; + } else { + return false; + } + if (sub_type.size() == 2 && sub_type[1] == '6') { + is_p = 6; + } else if (sub_type.size() == 2 && sub_type[1] == '2') { + is_p = 2; + } + } else if (std::string(argv[1]) == "-d" && argc == 5) { + engine = std::string(argv[2]); + img_dir = std::string(argv[3]); + cuda_post_process = std::string(argv[4]); + } else { + return false; + } + return true; +} + +int main(int argc, char** argv) { + cudaSetDevice(kGpuId); + std::string wts_name = ""; + std::string engine_name = ""; + std::string img_dir; + std::string sub_type = ""; + std::string cuda_post_process = ""; + int model_bboxes; + int is_p = 0; + float gd = 0.0f, gw = 0.0f; + int max_channels = 0; + + if 
(!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw,
+ max_channels)) {
+ std::cerr << "Arguments not right!" << std::endl;
+ std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file"
+ << std::endl;
+ std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl;
+ return -1;
+ }
+
+ // Create a model using the API directly and serialize it to a file
+ if (!wts_name.empty()) {
+ serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
+ return 0;
+ }
+
+ // Deserialize the engine from file
+ IRuntime* runtime = nullptr;
+ ICudaEngine* engine = nullptr;
+ IExecutionContext* context = nullptr;
+ deserialize_engine(engine_name, &runtime, &engine, &context);
+ cudaStream_t stream;
+ CUDA_CHECK(cudaStreamCreate(&stream));
+ cuda_preprocess_init(kMaxInputImageSize);
+ auto out_dims = engine->getBindingDimensions(1);
+ model_bboxes = out_dims.d[0];
+ // Prepare cpu and gpu buffers
+ float* device_buffers[2];
+ float* output_buffer_host = nullptr;
+ float* decode_ptr_host = nullptr;
+ float* decode_ptr_device = nullptr;
+
+ // Read images from directory
+ std::vector file_names;
+ if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
+ std::cerr << "read_files_in_dir failed."
<< std::endl; + return -1; + } + + prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, + &decode_ptr_device, cuda_post_process); + + // batch predict + for (size_t i = 0; i < file_names.size(); i += kBatchSize) { + // Get a batch of images + std::vector img_batch; + std::vector img_name_batch; + for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { + cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); + img_batch.push_back(img); + img_name_batch.push_back(file_names[j]); + } + // Preprocess + cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); + // Run inference + infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, + decode_ptr_device, model_bboxes, cuda_post_process); + std::vector> res_batch; + if (cuda_post_process == "c") { + // NMS + batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); + } else if (cuda_post_process == "g") { + // Process gpu decode and nms results + // todo pose in gpu + std::cerr << "pose_postprocess is not support in gpu right now" << std::endl; + } + // Draw bounding boxes + draw_bbox_keypoints_line(img_batch, res_batch); + // Save images + for (size_t j = 0; j < img_batch.size(); j++) { + cv::imwrite("_" + img_name_batch[j], img_batch[j]); + } + } + + // Release stream and buffers + cudaStreamDestroy(stream); + CUDA_CHECK(cudaFree(device_buffers[0])); + CUDA_CHECK(cudaFree(device_buffers[1])); + CUDA_CHECK(cudaFree(decode_ptr_device)); + delete[] decode_ptr_host; + delete[] output_buffer_host; + cuda_preprocess_destroy(); + // Destroy the engine + delete context; + delete engine; + delete runtime; + + // Print histogram of the output distribution + //std::cout << "\nOutput:\n\n"; + //for (unsigned int i = 0; i < kOutputSize; i++) + //{ + // std::cout << prob[i] << ", "; + // if (i % 10 == 0) std::cout << std::endl; + //} + //std::cout << std::endl; + + 
return 0; +} From 77e3f057a2c9d17a514b791a09cd348976225a38 Mon Sep 17 00:00:00 2001 From: linds Date: Fri, 26 Apr 2024 17:27:37 +0800 Subject: [PATCH 12/24] yolov8 pose --- yolov8/yolov8_seg.cpp | 227 ++++++++++++++++++++++-------------------- 1 file changed, 119 insertions(+), 108 deletions(-) diff --git a/yolov8/yolov8_seg.cpp b/yolov8/yolov8_seg.cpp index 8ba71421..5a86f974 100644 --- a/yolov8/yolov8_seg.cpp +++ b/yolov8/yolov8_seg.cpp @@ -1,90 +1,87 @@ -#include #include +#include #include -#include "model.h" -#include "utils.h" -#include "preprocess.h" -#include "postprocess.h" #include "cuda_utils.h" #include "logging.h" +#include "model.h" +#include "postprocess.h" +#include "preprocess.h" +#include "utils.h" Logger gLogger; using namespace nvinfer1; -const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; +const int kOutputSize = kMaxNumOutputBbox * (sizeof(Detection) - sizeof(float) * 51) / sizeof(float) + 1; const static int kOutputSegSize = 32 * (kInputH / 4) * (kInputW / 4); static cv::Rect get_downscale_rect(float bbox[4], float scale) { - float left = bbox[0]; - float top = bbox[1]; - float right = bbox[0] + bbox[2]; - float bottom = bbox[1] + bbox[3]; - - left = left < 0 ? 0 : left; - top = top < 0 ? 0: top; - right = right > 640 ? 640 : right; - bottom = bottom > 640 ? 640: bottom; - - left /= scale; - top /= scale; - right /= scale; - bottom /= scale; - return cv::Rect(int(left), int(top), int(right - left), int(bottom - top)); + float left = bbox[0]; + float top = bbox[1]; + float right = bbox[0] + bbox[2]; + float bottom = bbox[1] + bbox[3]; + + left = left < 0 ? 0 : left; + top = top < 0 ? 0 : top; + right = right > 640 ? 640 : right; + bottom = bottom > 640 ? 
640 : bottom; + + left /= scale; + top /= scale; + right /= scale; + bottom /= scale; + return cv::Rect(int(left), int(top), int(right - left), int(bottom - top)); } std::vector process_mask(const float* proto, int proto_size, std::vector& dets) { - std::vector masks; - for (size_t i = 0; i < dets.size(); i++) { + std::vector masks; + for (size_t i = 0; i < dets.size(); i++) { - cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1); - auto r = get_downscale_rect(dets[i].bbox, 4); + cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1); + auto r = get_downscale_rect(dets[i].bbox, 4); - for (int x = r.x; x < r.x + r.width; x++) { - for (int y = r.y; y < r.y + r.height; y++) { - float e = 0.0f; - for (int j = 0; j < 32; j++) { - e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x]; + for (int x = r.x; x < r.x + r.width; x++) { + for (int y = r.y; y < r.y + r.height; y++) { + float e = 0.0f; + for (int j = 0; j < 32; j++) { + e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x]; + } + e = 1.0f / (1.0f + expf(-e)); + mask_mat.at(y, x) = e; + } } - e = 1.0f / (1.0f + expf(-e)); - mask_mat.at(y, x) = e; - } + cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH)); + masks.push_back(mask_mat); } - cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH)); - masks.push_back(mask_mat); - } - return masks; + return masks; } - -void serialize_engine(std::string &wts_name, std::string &engine_name, std::string &sub_type, float &gd, float &gw, int &max_channels) -{ - IBuilder *builder = createInferBuilder(gLogger); - IBuilderConfig *config = builder->createBuilderConfig(); - IHostMemory *serialized_engine = nullptr; +void serialize_engine(std::string& wts_name, std::string& engine_name, std::string& sub_type, float& gd, float& gw, + int& max_channels) { + IBuilder* builder = createInferBuilder(gLogger); + IBuilderConfig* config = builder->createBuilderConfig(); + IHostMemory* serialized_engine = 
nullptr; serialized_engine = buildEngineYolov8Seg(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); - if (!p) - { + if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } - p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); + p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } -void deserialize_engine(std::string &engine_name, IRuntime **runtime, ICudaEngine **engine, IExecutionContext **context) -{ +void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, + IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); - if (!file.good()) - { + if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } @@ -92,7 +89,7 @@ void deserialize_engine(std::string &engine_name, IRuntime **runtime, ICudaEngin file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); - char *serialized_engine = new char[size]; + char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); @@ -106,8 +103,9 @@ void deserialize_engine(std::string &engine_name, IRuntime **runtime, ICudaEngin delete[] serialized_engine; } -void prepare_buffer(ICudaEngine *engine, float **input_buffer_device, float **output_buffer_device, float **output_seg_buffer_device, - float **output_buffer_host,float **output_seg_buffer_host ,float **decode_ptr_host, float **decode_ptr_device, std::string cuda_post_process) { +void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, + float** output_seg_buffer_device, float** output_buffer_host, float** output_seg_buffer_host, + float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) { 
assert(engine->getNbBindings() == 3); // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() @@ -119,10 +117,10 @@ void prepare_buffer(ICudaEngine *engine, float **input_buffer_device, float **ou assert(outputIndex == 1); assert(outputIndex_seg == 2); // Create GPU buffers on device - CUDA_CHECK(cudaMalloc((void **) input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); - CUDA_CHECK(cudaMalloc((void **) output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); - CUDA_CHECK(cudaMalloc((void **) output_seg_buffer_device, kBatchSize * kOutputSegSize * sizeof(float))); - + CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); + CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); + CUDA_CHECK(cudaMalloc((void**)output_seg_buffer_device, kBatchSize * kOutputSegSize * sizeof(float))); + if (cuda_post_process == "c") { *output_buffer_host = new float[kBatchSize * kOutputSize]; *output_seg_buffer_host = new float[kBatchSize * kOutputSegSize]; @@ -133,38 +131,47 @@ void prepare_buffer(ICudaEngine *engine, float **input_buffer_device, float **ou } // Allocate memory for decode_ptr_host and copy to device *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; - CUDA_CHECK(cudaMalloc((void **)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); + CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); } } -void infer(IExecutionContext &context, cudaStream_t &stream, void **buffers, float *output, float *output_seg,int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { +void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, float* output_seg, + int batchsize, 
float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, + std::string cuda_post_process) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueue(batchsize, buffers, stream, nullptr); if (cuda_post_process == "c") { - std::cout << "kOutputSize:" << kOutputSize <(end - start).count() << "ms" << std::endl; + std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() + << "ms" << std::endl; } else if (cuda_post_process == "g") { - CUDA_CHECK(cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); - cuda_decode((float *)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); - cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);//cuda nms - CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK( + cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); + cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); + cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms + CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, + sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, + stream)); auto end = std::chrono::system_clock::now(); - std::cout << "inference and gpu postprocess time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; + std::cout << "inference and gpu postprocess time: " + << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } CUDA_CHECK(cudaStreamSynchronize(stream)); } -bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, std::string &img_dir, std::string &sub_type, - std::string &cuda_post_process, std::string& 
labels_filename, float &gd, float &gw, int &max_channels) -{ +bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, + std::string& sub_type, std::string& cuda_post_process, std::string& labels_filename, float& gd, + float& gw, int& max_channels) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && argc == 5) { @@ -172,40 +179,40 @@ bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, st engine = std::string(argv[3]); sub_type = std::string(argv[4]); if (sub_type == "n") { - gd = 0.33; - gw = 0.25; - max_channels = 1024; + gd = 0.33; + gw = 0.25; + max_channels = 1024; } else if (sub_type == "s") { - gd = 0.33; - gw = 0.50; - max_channels = 1024; + gd = 0.33; + gw = 0.50; + max_channels = 1024; } else if (sub_type == "m") { - gd = 0.67; - gw = 0.75; - max_channels = 576; + gd = 0.67; + gw = 0.75; + max_channels = 576; } else if (sub_type == "l") { - gd = 1.0; - gw = 1.0; - max_channels = 512; + gd = 1.0; + gw = 1.0; + max_channels = 512; } else if (sub_type == "x") { - gd = 1.0; - gw = 1.25; - max_channels = 640; - } else{ - return false; + gd = 1.0; + gw = 1.25; + max_channels = 640; + } else { + return false; } } else if (std::string(argv[1]) == "-d" && argc == 6) { - engine = std::string(argv[2]); - img_dir = std::string(argv[3]); - cuda_post_process = std::string(argv[4]); - labels_filename = std::string(argv[5]); + engine = std::string(argv[2]); + img_dir = std::string(argv[3]); + cuda_post_process = std::string(argv[4]); + labels_filename = std::string(argv[5]); } else { - return false; + return false; } return true; } -int main(int argc, char **argv) { +int main(int argc, char** argv) { cudaSetDevice(kGpuId); std::string wts_name = ""; std::string engine_name = ""; @@ -217,10 +224,12 @@ int main(int argc, char **argv) { float gd = 0.0f, gw = 0.0f; int max_channels = 0; - if (!parse_args(argc, argv, wts_name, engine_name, img_dir, sub_type, cuda_post_process, labels_filename, 
gd, gw, max_channels)) { + if (!parse_args(argc, argv, wts_name, engine_name, img_dir, sub_type, cuda_post_process, labels_filename, gd, gw, + max_channels)) { std::cerr << "Arguments not right!" << std::endl; std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x] // serialize model to plan file" << std::endl; - std::cerr << "./yolov8 -d [.engine] ../samples [c/g] coco_file// deserialize plan file and run inference" << std::endl; + std::cerr << "./yolov8 -d [.engine] ../samples [c/g] coco_file// deserialize plan file and run inference" + << std::endl; return -1; } @@ -230,10 +239,10 @@ int main(int argc, char **argv) { return 0; } - // Deserialize the engine from file - IRuntime *runtime = nullptr; - ICudaEngine *engine = nullptr; - IExecutionContext *context = nullptr; + // Deserialize the engine from file + IRuntime* runtime = nullptr; + ICudaEngine* engine = nullptr; + IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -241,11 +250,11 @@ int main(int argc, char **argv) { auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // Prepare cpu and gpu buffers - float *device_buffers[3]; - float *output_buffer_host = nullptr; - float *output_seg_buffer_host = nullptr; - float *decode_ptr_host=nullptr; - float *decode_ptr_device=nullptr; + float* device_buffers[3]; + float* output_buffer_host = nullptr; + float* output_seg_buffer_host = nullptr; + float* decode_ptr_host = nullptr; + float* decode_ptr_device = nullptr; // Read images from directory std::vector file_names; @@ -258,7 +267,8 @@ int main(int argc, char **argv) { read_labels(labels_filename, labels_map); assert(kNumClass == labels_map.size()); - prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &device_buffers[2], &output_buffer_host, &output_seg_buffer_host,&decode_ptr_host, &decode_ptr_device, cuda_post_process); + prepare_buffer(engine, &device_buffers[0], 
&device_buffers[1], &device_buffers[2], &output_buffer_host, + &output_seg_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); // // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { @@ -273,7 +283,8 @@ int main(int argc, char **argv) { // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference - infer(*context, stream, (void **)device_buffers, output_buffer_host, output_seg_buffer_host,kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); + infer(*context, stream, (void**)device_buffers, output_buffer_host, output_seg_buffer_host, kBatchSize, + decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); std::vector> res_batch; if (cuda_post_process == "c") { // NMS From 16088b37227433bc8eaed382120f6472b793f9f2 Mon Sep 17 00:00:00 2001 From: linds Date: Fri, 26 Apr 2024 17:30:22 +0800 Subject: [PATCH 13/24] yolov8 pose --- yolov8/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yolov8/README.md b/yolov8/README.md index 8f198c65..8a42fe18 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -111,7 +111,7 @@ sudo ./yolov8_cls -d yolov8n-cls.engine ../samples # Build and serialize TensorRT engine ./yolov8_pose -s yolov8n-pose.wts yolov8n-pose.engine n -# Run inference with labels file +# Run inference ./yolov8_pose -d yolov8n-pose.engine ../images c ``` From 7e3d564d75bad6ec5a422f5927865eb10605f3e0 Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:22:51 +0800 Subject: [PATCH 14/24] yolov8 pose --- yolov8/src/model.cpp | 94 -------------------------------------------- 1 file changed, 94 deletions(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 2e67bacb..b7b14bbe 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -1275,96 +1275,65 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe 
*******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); - printLayerDims(conv0, "conv0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); - printLayerDims(conv1, "conv1"); nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); - printLayerDims(conv2, "conv2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); - printLayerDims(conv3, "conv3"); nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); - printLayerDims(conv4, "conv4"); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); - printLayerDims(conv5, "conv5"); nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); - printLayerDims(conv6, "conv6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); - printLayerDims(conv7, "conv7"); nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); - printLayerDims(conv8, "conv8"); nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, 
max_channels), get_width(1024, gw, max_channels), 5, "model.9"); - printLayerDims(conv9, "conv9"); /******************************************************************************************************* ********************************************* YOLOV8 HEAD ******************************************** *******************************************************************************************************/ float scale[] = {1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0)); - printLayerDims(upsample10, "upsample10"); assert(upsample10); upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST); - printLayerDims(upsample10, "upsample10"); upsample10->setScales(scale, 3); - printLayerDims(upsample10, "upsample10"); nvinfer1::ITensor* inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; - printTensorsDims(inputTensor11, 2, "inputTensor11"); nvinfer1::IConcatenationLayer* cat11 = network->addConcatenation(inputTensor11, 2); - printLayerDims(cat11, "cat11"); nvinfer1::IElementWiseLayer* conv12 = C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); - printLayerDims(conv12, "conv12"); nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0)); - printLayerDims(upsample13, "upsample13"); assert(upsample13); upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST); - printLayerDims(upsample13, "upsample13"); upsample13->setScales(scale, 3); - printLayerDims(upsample13, "upsample13"); nvinfer1::ITensor* inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)}; - printTensorsDims(inputTensor14, 2, "inputTensor14"); nvinfer1::IConcatenationLayer* cat14 = network->addConcatenation(inputTensor14, 2); - printLayerDims(cat14, "cat14"); nvinfer1::IElementWiseLayer* conv15 = C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), 
get_depth(3, gd), false, 0.5, "model.15"); - printLayerDims(conv15, "conv15"); nvinfer1::IElementWiseLayer* conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.16"); - printLayerDims(conv16, "conv16"); nvinfer1::ITensor* inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)}; - printTensorsDims(inputTensor17, 2, "inputTensor17"); nvinfer1::IConcatenationLayer* cat17 = network->addConcatenation(inputTensor17, 2); - printLayerDims(cat17, "cat17"); nvinfer1::IElementWiseLayer* conv18 = C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); - printLayerDims(conv18, "conv18"); nvinfer1::IElementWiseLayer* conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.19"); - printLayerDims(conv19, "conv19"); nvinfer1::ITensor* inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)}; - printTensorsDims(inputTensor20, 2, "inputTensor20"); nvinfer1::IConcatenationLayer* cat20 = network->addConcatenation(inputTensor20, 2); - printLayerDims(cat20, "cat20"); nvinfer1::IElementWiseLayer* conv21 = C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); - printLayerDims(conv21, "conv21"); - printf("\n\n"); /******************************************************************************************************* ********************************************* YOLOV8 OUTPUT ****************************************** @@ -1375,99 +1344,64 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe // output0 nvinfer1::IElementWiseLayer* conv22_cv2_0_0 = convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0"); - printLayerDims(conv22_cv2_0_0, "conv22_cv2_0_0"); nvinfer1::IElementWiseLayer* 
conv22_cv2_0_1 = convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1"); - printLayerDims(conv22_cv2_0_1, "conv22_cv2_0_1"); nvinfer1::IConvolutionLayer* conv22_cv2_0_2 = network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]); - printLayerDims(conv22_cv2_0_2, "conv22_cv2_0_2"); conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); - printLayerDims(conv22_cv2_0_2, "conv22_cv2_0_2"); conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); - printLayerDims(conv22_cv2_0_2, "conv22_cv2_0_2"); nvinfer1::IElementWiseLayer* conv22_cv3_0_0 = convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0"); - printLayerDims(conv22_cv3_0_0, "conv22_cv3_0_0"); nvinfer1::IElementWiseLayer* conv22_cv3_0_1 = convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.1"); - printLayerDims(conv22_cv3_0_1, "conv22_cv3_0_1"); nvinfer1::IConvolutionLayer* conv22_cv3_0_2 = network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]); - printLayerDims(conv22_cv3_0_2, "conv22_cv3_0_2"); conv22_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); - printLayerDims(conv22_cv3_0_2, "conv22_cv3_0_2"); conv22_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); - printLayerDims(conv22_cv3_0_2, "conv22_cv3_0_2"); nvinfer1::ITensor* inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)}; - printTensorsDims(inputTensor22_0, 2, "inputTensor22_0"); nvinfer1::IConcatenationLayer* cat22_0 = network->addConcatenation(inputTensor22_0, 2); - printLayerDims(cat22_0, "cat22_0"); - printf("\n\n"); // output1 nvinfer1::IElementWiseLayer* conv22_cv2_1_0 = convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0"); - 
printLayerDims(conv22_cv2_1_0, "conv22_cv2_1_0"); nvinfer1::IElementWiseLayer* conv22_cv2_1_1 = convBnSiLU(network, weightMap, *conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1"); - printLayerDims(conv22_cv2_1_1, "conv22_cv2_1_1"); nvinfer1::IConvolutionLayer* conv22_cv2_1_2 = network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.1.2.weight"], weightMap["model.22.cv2.1.2.bias"]); - printLayerDims(conv22_cv2_1_2, "conv22_cv2_1_2"); conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); - printLayerDims(conv22_cv2_1_2, "conv22_cv2_1_2"); conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); - printLayerDims(conv22_cv2_1_2, "conv22_cv2_1_2"); nvinfer1::IElementWiseLayer* conv22_cv3_1_0 = convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0"); - printLayerDims(conv22_cv3_1_0, "conv22_cv3_1_0"); nvinfer1::IElementWiseLayer* conv22_cv3_1_1 = convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.1"); - printLayerDims(conv22_cv3_1_1, "conv22_cv3_1_1"); nvinfer1::IConvolutionLayer* conv22_cv3_1_2 = network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]); - printLayerDims(conv22_cv3_1_2, "conv22_cv3_1_2"); conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); - printLayerDims(conv22_cv3_1_2, "conv22_cv3_1_2"); conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); - printLayerDims(conv22_cv3_1_2, "conv22_cv3_1_2"); nvinfer1::ITensor* inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)}; - printTensorsDims(inputTensor22_1, 2, "inputTensor22_1"); nvinfer1::IConcatenationLayer* cat22_1 = network->addConcatenation(inputTensor22_1, 2); - printLayerDims(cat22_1, "cat22_1"); - printf("\n\n"); // output2 nvinfer1::IElementWiseLayer* conv22_cv2_2_0 = convBnSiLU(network, 
weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0"); - printLayerDims(conv22_cv2_2_0, "conv22_cv2_2_0"); nvinfer1::IElementWiseLayer* conv22_cv2_2_1 = convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1"); - printLayerDims(conv22_cv2_2_1, "conv22_cv2_2_1"); nvinfer1::IConvolutionLayer* conv22_cv2_2_2 = network->addConvolution(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.2.2.weight"], weightMap["model.22.cv2.2.2.bias"]); - printLayerDims(conv22_cv2_2_2, "conv22_cv2_2_2"); nvinfer1::IElementWiseLayer* conv22_cv3_2_0 = convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.0"); - printLayerDims(conv22_cv3_2_0, "conv22_cv3_2_0"); nvinfer1::IElementWiseLayer* conv22_cv3_2_1 = convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.1"); - printLayerDims(conv22_cv3_2_1, "conv22_cv3_2_1"); nvinfer1::IConvolutionLayer* conv22_cv3_2_2 = network->addConvolution(*conv22_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]); - printLayerDims(conv22_cv3_2_2, "conv22_cv3_2_2"); nvinfer1::ITensor* inputTensor22_2[] = {conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)}; - printTensorsDims(inputTensor22_2, 2, "inputTensor22_2"); nvinfer1::IConcatenationLayer* cat22_2 = network->addConcatenation(inputTensor22_2, 2); - printLayerDims(cat22_2, "cat22_2"); - printf("\n\n"); /******************************************************************************************************* ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ @@ -1479,56 +1413,40 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe 
/**************************************************************************************P3****************************************************************************************************************************************/ nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); - printLayerDims(shuffle22_0, "shuffle22_0"); shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); - printLayerDims(shuffle22_0, "shuffle22_0->setReshapeDimensions"); nvinfer1::ISliceLayer* split22_0_0 = network->addSlice( *shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); - printLayerDims(split22_0_0, "split22_0_0"); nvinfer1::ISliceLayer* split22_0_1 = network->addSlice( *shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); - printLayerDims(split22_0_1, "split22_0_1"); nvinfer1::IShuffleLayer* dfl22_0 = DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.22.dfl.conv.weight"); - printLayerDims(dfl22_0, "dfl22_0"); // det0 - printLayerDims(conv15, "conv15"); std::cout << "conv15->getOutput(0)->getDimensions().d[0] : " << conv15->getOutput(0)->getDimensions().d[0] << " (kInputH / strides[0]) * (kInputW / strides[0]) : " << (kInputH / strides[0]) * (kInputW / strides[0]) << std::endl; auto shuffle_conv15 = cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose"); - printLayerDims(shuffle_conv15, "shuffle_conv15"); - nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0), shuffle_conv15->getOutput(0)}; - printTensorsDims(inputTensor22_dfl_0, 3, "inputTensor22_dfl_0"); nvinfer1::IConcatenationLayer* cat22_dfl_0 = 
network->addConcatenation(inputTensor22_dfl_0, 3); - printLayerDims(cat22_dfl_0, "cat22_dfl_0"); - printf("\n\n"); /********************************************************************************************P4**********************************************************************************************************************************/ nvinfer1::IShuffleLayer* shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); - printLayerDims(shuffle22_1, "shuffle22_1"); shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); - printLayerDims(shuffle22_1, "shuffle22_1->setReshapeDimensions"); nvinfer1::ISliceLayer* split22_1_0 = network->addSlice( *shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); - printLayerDims(split22_1_0, "split22_1_0"); nvinfer1::ISliceLayer* split22_1_1 = network->addSlice( *shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); - printLayerDims(split22_1_1, "split22_1_1"); nvinfer1::IShuffleLayer* dfl22_1 = DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.22.dfl.conv.weight"); - printLayerDims(dfl22_1, "dfl22_1"); // det1 auto shuffle_conv18 = cv4_conv_combined(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1", @@ -1536,43 +1454,31 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0), shuffle_conv18->getOutput(0)}; - printTensorsDims(inputTensor22_dfl_1, 3, "inputTensor22_dfl_1"); nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3); - printLayerDims(cat22_dfl_1, "cat22_dfl_1"); - printf("\n\n"); 
/********************************************************************************************P5**********************************************************************************************************************************/ nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); - printLayerDims(shuffle22_2, "shuffle22_2"); shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); - printLayerDims(shuffle22_2, "shuffle22_2->setReshapeDimensions"); nvinfer1::ISliceLayer* split22_2_0 = network->addSlice( *shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); - printLayerDims(split22_2_0, "split22_2_0"); nvinfer1::ISliceLayer* split22_2_1 = network->addSlice( *shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); - printLayerDims(split22_2_1, "split22_2_1"); nvinfer1::IShuffleLayer* dfl22_2 = DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.22.dfl.conv.weight"); - printLayerDims(dfl22_2, "dfl22_2"); // det2 auto shuffle_conv21 = cv4_conv_combined(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", (kInputH / strides[2]) * (kInputW / strides[2]), gw, "pose"); nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0), shuffle_conv21->getOutput(0)}; - printTensorsDims(inputTensor22_dfl_2, 3, "inputTensor22_dfl_2"); nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3); - printLayerDims(cat22_dfl_2, "cat22_dfl_2"); - printf("\n\n"); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, strides, stridesLength, false, true); - printLayerDims(yolo, "yolo"); 
yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); From 255c191860fcf4da4cee5c9c7d02c1e50cb10241 Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:25:46 +0800 Subject: [PATCH 15/24] yolov8 pose --- yolov8/src/model.cpp | 92 +++++++++++++++++++++++++++++++++----------- 1 file changed, 70 insertions(+), 22 deletions(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index b7b14bbe..f2930e8e 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -6,6 +6,31 @@ #include "config.h" #include "model.h" +// Get the dimensions of the layer's output. +nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); + +// Print the layer's name and output dimensions. +std::cout << "name: " << layerName << " Layer name: " << layer->getName() << " Output Dims: "; +for (int i = 0; i < dims.nbDims; ++i) { + std::cout << dims.d[i] << (i < dims.nbDims - 1 ? "x" : ""); +} +std::cout << std::endl; +} + +for (int t = 0; t < numTensors; ++t) { + std::cout << tensorsName << "[" << t << "]: "; + if (tensors[t] != nullptr) { + nvinfer1::Dims dims = tensors[t]->getDimensions(); + for (int i = 0; i < dims.nbDims; ++i) { + std::cout << dims.d[i] << (i < dims.nbDims - 1 ? "x" : ""); + } + std::cout << std::endl; + } else { + std::cout << "nullptr" << std::endl; + } +} +} + static int get_width(int x, float gw, int max_channels, int divisor = 8) { auto channel = int(ceil((x * gw) / divisor)) * divisor; return channel >= max_channels ? 
max_channels : channel; @@ -47,30 +72,49 @@ static nvinfer1::IElementWiseLayer* Proto(nvinfer1::INetworkDefinition* network, return cv3; } -static nvinfer1::IShuffleLayer* ProtoCoef(nvinfer1::INetworkDefinition* network, - std::map& weightMap, nvinfer1::ITensor& input, - std::string lname, int grid_shape, float gw) { - +static nvinfer1::IShuffleLayer* cv4_conv_combined(nvinfer1::INetworkDefinition* network, + std::map& weightMap, + nvinfer1::ITensor& input, std::string lname, int grid_shape, float gw, + std::string algo_type) { int mid_channle = 0; - if (gw == 0.25 || gw == 0.5) { - mid_channle = 32; - } else if (gw == 0.75) { - mid_channle = 48; - } else if (gw == 1.00) { - mid_channle = 64; - } else if (gw == 1.25) { - mid_channle = 80; + int output_channel = 0; + + if (algo_type == "seg") { + if (gw == 0.25 || gw == 0.5) { + mid_channle = 32; + } else if (gw == 0.75) { + mid_channle = 48; + } else if (gw == 1.00) { + mid_channle = 64; + } else if (gw == 1.25) { + mid_channle = 80; + } + + output_channel = 32; + + } else if (algo_type == "pose") { + if (gw == 0.25 || gw == 0.5 || gw == 0.75) { + mid_channle = 51; + } else if (gw == 1.00) { + mid_channle = 64; + } else if (gw == 1.25) { + mid_channle = 80; + } + + output_channel = 51; } + auto cv0 = convBnSiLU(network, weightMap, input, mid_channle, 3, 1, 1, lname + ".0"); auto cv1 = convBnSiLU(network, weightMap, *cv0->getOutput(0), mid_channle, 3, 1, 1, lname + ".1"); float* cv2_bais_value = (float*)weightMap[lname + ".2" + ".bias"].values; int cv2_bais_len = weightMap[lname + ".2" + ".bias"].count; nvinfer1::Weights cv2_bais{nvinfer1::DataType::kFLOAT, cv2_bais_value, cv2_bais_len}; - auto cv2 = network->addConvolutionNd(*cv1->getOutput(0), 32, nvinfer1::DimsHW{1, 1}, + auto cv2 = network->addConvolutionNd(*cv1->getOutput(0), output_channel, nvinfer1::DimsHW{1, 1}, weightMap[lname + ".2" + ".weight"], cv2_bais); cv2->setStrideNd(nvinfer1::DimsHW{1, 1}); nvinfer1::IShuffleLayer* cv2_shuffle = 
network->addShuffle(*cv2->getOutput(0)); - cv2_shuffle->setReshapeDimensions(nvinfer1::Dims2{32, grid_shape}); + cv2_shuffle->setReshapeDimensions(nvinfer1::Dims2{output_channel, grid_shape}); + return cv2_shuffle; } @@ -278,7 +322,8 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, - strides, stridesLength, false); + strides, stridesLength, false, false); + yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); @@ -588,7 +633,7 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( network, std::vector{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3}, - strides, stridesLength, false); + strides, stridesLength, false, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); @@ -847,6 +892,7 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf nvinfer1::IShuffleLayer* dfl28_0 = DFL(network, weightMap, *split28_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor* inputTensor28_dfl_0[] = {dfl28_0->getOutput(0), split28_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat28_dfl_0 = network->addConcatenation(inputTensor28_dfl_0, 2); @@ -897,8 +943,7 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( network, std::vector{cat28_dfl_0, cat28_dfl_1, cat28_dfl_2, cat28_dfl_3}, - strides, stridesLength, false); - + strides, stridesLength, false, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); @@ -1205,26 +1250,29 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer 1, 0, "model.22.dfl.conv.weight"); // det0 - auto 
proto_coef_0 = ProtoCoef(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", 6400, gw); + auto proto_coef_0 = cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", + (kInputH / strides[0]) * (kInputW / strides[0]), gw, "seg"); nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0), proto_coef_0->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 3); // det1 - auto proto_coef_1 = ProtoCoef(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1", 1600, gw); + auto proto_coef_1 = cv4_conv_combined(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1", + (kInputH / strides[1]) * (kInputW / strides[1]), gw, "seg"); nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0), proto_coef_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3); // det2 - auto proto_coef_2 = ProtoCoef(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", 400, gw); + auto proto_coef_2 = cv4_conv_combined(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", + (kInputH / strides[2]) * (kInputW / strides[2]), gw, "seg"); nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0), proto_coef_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, - strides, stridesLength, true); + strides, stridesLength, true, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); From a8ea19b814ee0752adb8b1ecb5a44396566e1688 Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:40:01 +0800 Subject: [PATCH 16/24] yolov8 pose --- yolov8/src/model.cpp | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git 
a/yolov8/src/model.cpp b/yolov8/src/model.cpp index f2930e8e..4cc4088e 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -6,31 +6,6 @@ #include "config.h" #include "model.h" -// Get the dimensions of the layer's output. -nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); - -// Print the layer's name and output dimensions. -std::cout << "name: " << layerName << " Layer name: " << layer->getName() << " Output Dims: "; -for (int i = 0; i < dims.nbDims; ++i) { - std::cout << dims.d[i] << (i < dims.nbDims - 1 ? "x" : ""); -} -std::cout << std::endl; -} - -for (int t = 0; t < numTensors; ++t) { - std::cout << tensorsName << "[" << t << "]: "; - if (tensors[t] != nullptr) { - nvinfer1::Dims dims = tensors[t]->getDimensions(); - for (int i = 0; i < dims.nbDims; ++i) { - std::cout << dims.d[i] << (i < dims.nbDims - 1 ? "x" : ""); - } - std::cout << std::endl; - } else { - std::cout << "nullptr" << std::endl; - } -} -} - static int get_width(int x, float gw, int max_channels, int divisor = 8) { auto channel = int(ceil((x * gw) / divisor)) * divisor; return channel >= max_channels ? 
max_channels : channel; From 99f946c38a908701c8372088acadf661621840dd Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:42:18 +0800 Subject: [PATCH 17/24] yolov8 pose --- yolov8/yolov8_pose.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yolov8/yolov8_pose.cpp b/yolov8/yolov8_pose.cpp index 5b848a6b..4ef16598 100644 --- a/yolov8/yolov8_pose.cpp +++ b/yolov8/yolov8_pose.cpp @@ -20,9 +20,9 @@ void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p IHostMemory* serialized_engine = nullptr; if (is_p == 6) { - serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + std::cout << "p6 is not support in gpu right nowe" << std::endl; } else if (is_p == 2) { - serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + std::cout << "p2 is not support in gpu right now" << std::endl; } else { serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } From 6369d79d2d3d701b5c19575897ad64c5d54306c7 Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:43:31 +0800 Subject: [PATCH 18/24] yolov8 pose --- yolov8/yolov8_pose.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yolov8/yolov8_pose.cpp b/yolov8/yolov8_pose.cpp index 4ef16598..12aa1b13 100644 --- a/yolov8/yolov8_pose.cpp +++ b/yolov8/yolov8_pose.cpp @@ -20,9 +20,9 @@ void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p IHostMemory* serialized_engine = nullptr; if (is_p == 6) { - std::cout << "p6 is not support in gpu right nowe" << std::endl; + std::cout << "p6 is not supported right now" << std::endl; } else if (is_p == 2) { - std::cout << "p2 is not support in gpu right now" << std::endl; + std::cout << "p2 is not supported right now" << std::endl; } else { serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name,
gd, gw, max_channels); } From 47fb20eeff68b356fec0c2e0ab7d4c5bcaf4c81c Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:45:41 +0800 Subject: [PATCH 19/24] yolov8 pose --- yolov8/yolov8_pose.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yolov8/yolov8_pose.cpp b/yolov8/yolov8_pose.cpp index 12aa1b13..2b354e46 100644 --- a/yolov8/yolov8_pose.cpp +++ b/yolov8/yolov8_pose.cpp @@ -180,7 +180,8 @@ int main(int argc, char** argv) { if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw, max_channels)) { std::cerr << "Arguments not right!" << std::endl; - std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6] // serialize model to plan file" + std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to " + "plan file" << std::endl; std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl; return -1; From c1eaf4b3982170a56e9c214dc88670738da9b313 Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:50:22 +0800 Subject: [PATCH 20/24] yolov8 pose --- yolov8/README.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/yolov8/README.md b/yolov8/README.md index 8a42fe18..fadf8ce2 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -108,11 +108,19 @@ sudo ./yolov8_cls -d yolov8n-cls.engine ../samples ### Pose Estimation ``` -# Build and serialize TensorRT engine -./yolov8_pose -s yolov8n-pose.wts yolov8n-pose.engine n - -# Run inference -./yolov8_pose -d yolov8n-pose.engine ../images c +cd {tensorrtx}/yolov8/ +// update "kNumClass = 1" in config.h +mkdir build +cd build +cp {ultralytics}/ultralytics/yolov8-pose.wts {tensorrtx}/yolov8/build +cmake .. 
+make +sudo ./yolov8_pose -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file +sudo ./yolov8_pose -d [.engine] [image folder] [c/g] // deserialize and run inference, the images in [image folder] will be processed. +// For example yolov8-pose +sudo ./yolov8_pose -s yolov8n.wts yolov8.engine n +sudo ./yolov8_pose -d yolov8n.engine ../images c //cpu postprocess +sudo ./yolov8_pose -d yolov8n.engine ../images g //gpu postprocess ``` From 6ed8ce182b5b392e4edd6b6e645d7ab628c80e7e Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:51:53 +0800 Subject: [PATCH 21/24] yolov8 pose --- yolov8/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yolov8/README.md b/yolov8/README.md index fadf8ce2..c014514a 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -118,9 +118,9 @@ make sudo ./yolov8_pose -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file sudo ./yolov8_pose -d [.engine] [image folder] [c/g] // deserialize and run inference, the images in [image folder] will be processed. 
// For example yolov8-pose -sudo ./yolov8_pose -s yolov8n.wts yolov8.engine n -sudo ./yolov8_pose -d yolov8n.engine ../images c //cpu postprocess -sudo ./yolov8_pose -d yolov8n.engine ../images g //gpu postprocess +sudo ./yolov8_pose -s yolov8n-pose.wts yolov8.engine n +sudo ./yolov8_pose -d yolov8n-pose.engine ../images c //cpu postprocess +sudo ./yolov8_pose -d yolov8n-pose.engine ../images g //gpu postprocess ``` From 4b447e8b6c74b356c45a435c0bdd48dc619e6f46 Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:52:33 +0800 Subject: [PATCH 22/24] yolov8 pose --- yolov8/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yolov8/README.md b/yolov8/README.md index c014514a..e4762e2f 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -118,7 +118,7 @@ make sudo ./yolov8_pose -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file sudo ./yolov8_pose -d [.engine] [image folder] [c/g] // deserialize and run inference, the images in [image folder] will be processed. // For example yolov8-pose -sudo ./yolov8_pose -s yolov8n-pose.wts yolov8.engine n +sudo ./yolov8_pose -s yolov8n-pose.wts yolov8n-pose.engine n sudo ./yolov8_pose -d yolov8n-pose.engine ../images c //cpu postprocess sudo ./yolov8_pose -d yolov8n-pose.engine ../images g //gpu postprocess ``` From c6110dde5e728e0d5bf4b200dc7ced0f6280234e Mon Sep 17 00:00:00 2001 From: linds Date: Mon, 29 Apr 2024 10:12:32 +0800 Subject: [PATCH 23/24] yolov8_pose_trt.py --- yolov8/README.md | 7 +- yolov8/yolov8_pose_trt.py | 500 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 504 insertions(+), 3 deletions(-) create mode 100644 yolov8/yolov8_pose_trt.py diff --git a/yolov8/README.md b/yolov8/README.md index e4762e2f..0ceb1b5e 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -129,9 +129,10 @@ sudo ./yolov8_pose -d yolov8n-pose.engine ../images g //gpu postprocess ``` // install python-tensorrt, pycuda, etc. 
// ensure the yolov8n.engine and libmyplugins.so have been built -python yolov8_det.py # Detection -python yolov8_seg.py # Segmentation -python yolov8_cls.py # Classification +python yolov8_det_trt.py # Detection +python yolov8_seg_trt.py # Segmentation +python yolov8_cls_trt.py # Classification +python yolov8_pose_trt.py # Pose Estimation ``` # INT8 Quantization diff --git a/yolov8/yolov8_pose_trt.py b/yolov8/yolov8_pose_trt.py new file mode 100644 index 00000000..f56a61f2 --- /dev/null +++ b/yolov8/yolov8_pose_trt.py @@ -0,0 +1,500 @@ +""" +An example that uses TensorRT's Python api to make inferences. +""" +import ctypes +import os +import shutil +import random +import sys +import threading +import time +import cv2 +import numpy as np +import pycuda.autoinit # noqa: F401 +import pycuda.driver as cuda +import tensorrt as trt + + +CONF_THRESH = 0.5 +IOU_THRESHOLD = 0.4 + +keypoint_pairs = [ + (0, 1), (0, 2), (0, 5), (0, 6), (1, 2), + (1, 3), (2, 4), (5, 6), (5, 7), (5, 11), + (6, 8), (6, 12), (7, 9), (8, 10), (11, 12), + (11, 13), (12, 14), (13, 15), (14, 16) +] + + +def get_img_path_batches(batch_size, img_dir): + ret = [] + batch = [] + for root, dirs, files in os.walk(img_dir): + for name in files: + if len(batch) == batch_size: + ret.append(batch) + batch = [] + batch.append(os.path.join(root, name)) + if len(batch) > 0: + ret.append(batch) + return ret + + +def plot_one_box(x, img, color=None, label=None, line_thickness=None): + """ + description: Plots one bounding box on image img, + this function comes from YoLov8 project. 
+ param: + x: a box likes [x1,y1,x2,y2] + img: a opencv image object + color: color to draw rectangle, such as (0,255,0) + label: str + line_thickness: int + return: + no return + + """ + tl = ( + line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 + ) # line/font thickness + color = color or [random.randint(0, 255) for _ in range(3)] + c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) + cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) + if label: + tf = max(tl - 1, 1) # font thickness + t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] + c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 + cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled + cv2.putText( + img, + label, + (c1[0], c1[1] - 2), + 0, + tl / 3, + [225, 255, 255], + thickness=tf, + lineType=cv2.LINE_AA, + ) + + +class YoLov8TRT(object): + """ + description: A YOLOv8 class that warps TensorRT ops, preprocess and postprocess ops. + """ + + def __init__(self, engine_file_path): + # Create a Context on this device, + self.ctx = cuda.Device(0).make_context() + stream = cuda.Stream() + TRT_LOGGER = trt.Logger(trt.Logger.INFO) + runtime = trt.Runtime(TRT_LOGGER) + + # Deserialize the engine from file + with open(engine_file_path, "rb") as f: + engine = runtime.deserialize_cuda_engine(f.read()) + context = engine.create_execution_context() + + host_inputs = [] + cuda_inputs = [] + host_outputs = [] + cuda_outputs = [] + bindings = [] + + for binding in engine: + print('bingding:', binding, engine.get_binding_shape(binding)) + size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size + dtype = trt.nptype(engine.get_binding_dtype(binding)) + # Allocate host and device buffers + host_mem = cuda.pagelocked_empty(size, dtype) + cuda_mem = cuda.mem_alloc(host_mem.nbytes) + # Append the device buffer to device bindings. + bindings.append(int(cuda_mem)) + # Append to the appropriate list. 
+ if engine.binding_is_input(binding): + self.input_w = engine.get_binding_shape(binding)[-1] + self.input_h = engine.get_binding_shape(binding)[-2] + host_inputs.append(host_mem) + cuda_inputs.append(cuda_mem) + else: + host_outputs.append(host_mem) + cuda_outputs.append(cuda_mem) + + # Store + self.stream = stream + self.context = context + self.host_inputs = host_inputs + self.cuda_inputs = cuda_inputs + self.host_outputs = host_outputs + self.cuda_outputs = cuda_outputs + self.bindings = bindings + self.batch_size = engine.max_batch_size + self.det_output_size = 89001 + + def infer(self, raw_image_generator): + threading.Thread.__init__(self) + # Make self the active context, pushing it on top of the context stack. + self.ctx.push() + # Restore + stream = self.stream + context = self.context + host_inputs = self.host_inputs + cuda_inputs = self.cuda_inputs + host_outputs = self.host_outputs + cuda_outputs = self.cuda_outputs + bindings = self.bindings + # Do image preprocess + batch_image_raw = [] + batch_origin_h = [] + batch_origin_w = [] + batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w]) + for i, image_raw in enumerate(raw_image_generator): + input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw) + batch_image_raw.append(image_raw) + batch_origin_h.append(origin_h) + batch_origin_w.append(origin_w) + np.copyto(batch_input_image[i], + input_image) + batch_input_image = np.ascontiguousarray(batch_input_image) + + # Copy input image to host buffer + np.copyto(host_inputs[0], batch_input_image.ravel()) + start = time.time() + # Transfer input data to the GPU. + cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) + # Run inference. + context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle) + # Transfer predictions back from the GPU. 
+ cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) + # Synchronize the stream + stream.synchronize() + end = time.time() + # Remove any context from the top of the context stack, deactivating it. + self.ctx.pop() + # Here we use the first row of output in that batch_size = 1 + output = host_outputs[0] + # Do postprocess + for i in range(self.batch_size): + + result_boxes, result_scores, result_classid, keypoints = self.post_process( + output[i * (self.det_output_size): (i + 1) * (self.det_output_size)], + batch_origin_h[i], batch_origin_w[i] + ) + + # Draw rectangles and labels on the original image + for j in range(len(result_boxes)): + box = result_boxes[j] + plot_one_box( + box, + batch_image_raw[i], + label="{}:{:.2f}".format( + categories[int(result_classid[j])], result_scores[j] + ), + ) + + num_keypoints = len(keypoints[j]) // 3 + points = [] + for k in range(num_keypoints): + x = keypoints[j][k * 3] + y = keypoints[j][k * 3 + 1] + confidence = keypoints[j][k * 3 + 2] + if confidence > 0: + points.append((int(x), int(y))) + else: + points.append(None) + + # Draw skeleton lines between keypoints according to the keypoint pair indices + for pair in keypoint_pairs: + partA, partB = pair + if points[partA] and points[partB]: + cv2.line(batch_image_raw[i], points[partA], points[partB], (0, 255, 0), 2) + + return batch_image_raw, end - start + + def destroy(self): + # Remove any context from the top of the context stack, deactivating it. + self.ctx.pop() + + def get_raw_image(self, image_path_batch): + """ + description: Read an image from image path + """ + for img_path in image_path_batch: + yield cv2.imread(img_path) + + def get_raw_image_zeros(self, image_path_batch=None): + """ + description: Ready data for warmup + """ + for _ in range(self.batch_size): + yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8) + + def preprocess_image(self, raw_bgr_image): + """ + description: Convert BGR image to RGB, + resize and pad it to target size, normalize to [0,1], + transform to NCHW format.
+ param: + input_image_path: str, image path + return: + image: the processed image + image_raw: the original image + h: original height + w: original width + """ + image_raw = raw_bgr_image + h, w, c = image_raw.shape + image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB) + # Calculate width and height and paddings + r_w = self.input_w / w + r_h = self.input_h / h + if r_h > r_w: + tw = self.input_w + th = int(r_w * h) + tx1 = tx2 = 0 + ty1 = int((self.input_h - th) / 2) + ty2 = self.input_h - th - ty1 + else: + tw = int(r_h * w) + th = self.input_h + tx1 = int((self.input_w - tw) / 2) + tx2 = self.input_w - tw - tx1 + ty1 = ty2 = 0 + # Resize the image with long side while maintaining ratio + image = cv2.resize(image, (tw, th)) + # Pad the short side with (128,128,128) + image = cv2.copyMakeBorder( + image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128) + ) + image = image.astype(np.float32) + # Normalize to [0,1] + image /= 255.0 + # HWC to CHW format: + image = np.transpose(image, [2, 0, 1]) + # CHW to NCHW format + image = np.expand_dims(image, axis=0) + # Convert the image to row-major order, also known as "C order": + image = np.ascontiguousarray(image) + return image, image_raw, h, w + + def xywh2xyxy_with_keypoints(self, origin_h, origin_w, boxes, keypoints): + + n = len(boxes) + box_array = np.zeros_like(boxes) + keypoint_array = np.zeros_like(keypoints) + r_w = self.input_w / origin_w + r_h = self.input_h / origin_h + for i in range(n): + if r_h > r_w: + box = boxes[i] + lmk = keypoints[i] + box_array[i, 0] = box[0] / r_w + box_array[i, 2] = box[2] / r_w + box_array[i, 1] = (box[1] - (self.input_h - r_w * origin_h) / 2) / r_w + box_array[i, 3] = (box[3] - (self.input_h - r_w * origin_h) / 2) / r_w + + for j in range(0, len(lmk), 3): + keypoint_array[i, j] = lmk[j] / r_w + keypoint_array[i, j + 1] = (lmk[j + 1] - (self.input_h - r_w * origin_h) / 2) / r_w + keypoint_array[i, j + 2] = lmk[j + 2] + else: + + box = boxes[i] + lmk = keypoints[i]
+ + box_array[i, 0] = (box[0] - (self.input_w - r_h * origin_w) / 2) / r_h + box_array[i, 2] = (box[2] - (self.input_w - r_h * origin_w) / 2) / r_h + box_array[i, 1] = box[1] / r_h + box_array[i, 3] = box[3] / r_h + + for j in range(0, len(lmk), 3): + keypoint_array[i, j] = (lmk[j] - (self.input_w - r_h * origin_w) / 2) / r_h + keypoint_array[i, j + 1] = lmk[j + 1] / r_h + keypoint_array[i, j + 2] = lmk[j + 2] + + return box_array, keypoint_array + + def post_process(self, output, origin_h, origin_w): + """ + description: Post-process the prediction to include pose keypoints + param: + output: A numpy array like [num_boxes, cx, cy, w, h, conf, + cls_id, px1, py1, pconf1,...px17, py17, pconf17] where p denotes pose keypoint + origin_h: Height of original image + origin_w: Width of original image + return: + result_boxes: Final boxes, a numpy array, each row is a box [x1, y1, x2, y2] + result_scores: Final scores, a numpy array, each element is the score corresponding to box + result_classid: Final classID, a numpy array, each element is the classid corresponding to box + result_keypoints: Final keypoints, a list of numpy arrays, + each element represents keypoints for a box, shaped as (#keypoints, 3) + """ + # Number of values per detection: 38 base values + 17 keypoints * 3 values each + num_values_per_detection = 38 + 17 * 3 + # Get the number of boxes detected + num = int(output[0]) + # Reshape to a two-dimensional ndarray with the full detection shape + pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :] + + # Perform non-maximum suppression to filter the detections + boxes = self.non_max_suppression( + pred[:, :num_values_per_detection], origin_h, origin_w, + conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD) + + # Extract the bounding boxes, confidence scores, and class IDs + result_boxes = boxes[:, :4] if len(boxes) else np.array([]) + result_scores = boxes[:, 4] if len(boxes) else np.array([]) + result_classid = boxes[:, 5] if len(boxes) 
else np.array([]) + result_keypoints = boxes[:, -51:] if len(boxes) else np.array([]) + + # Return the post-processed results including keypoints + return result_boxes, result_scores, result_classid, result_keypoints + + def bbox_iou(self, box1, box2, x1y1x2y2=True): + """ + description: compute the IoU of two bounding boxes + param: + box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) + box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) + x1y1x2y2: select the coordinate format + return: + iou: computed iou + """ + if not x1y1x2y2: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + else: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + + # Get the coordinates of the intersection rectangle + inter_rect_x1 = np.maximum(b1_x1, b2_x1) + inter_rect_y1 = np.maximum(b1_y1, b2_y1) + inter_rect_x2 = np.minimum(b1_x2, b2_x2) + inter_rect_y2 = np.minimum(b1_y2, b2_y2) + # Intersection area + inter_area = np.clip( + inter_rect_x2 - inter_rect_x1 + 1, 0, None) * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None) + # Union Area + b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) + b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) + + iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) + + return iou + + def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4): + """ + description: Removes detections with lower object confidence score than 'conf_thres' and performs + Non-Maximum Suppression to further filter detections. 
+ param: + prediction: detections, (x1, y1, x2, y2, conf, cls_id) + origin_h: original image height + origin_w: original image width + conf_thres: a confidence threshold to filter detections + nms_thres: a iou threshold to filter detections + return: + boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id) + """ + # Get the boxes that score > CONF_THRESH + boxes = prediction[prediction[:, 4] >= conf_thres] + # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2] + res_array = np.copy(boxes) + box_pred_deep_copy = np.copy(boxes[:, :4]) + keypoints_pred_deep_copy = np.copy(boxes[:, -51:]) + res_box, res_keypoints = self.xywh2xyxy_with_keypoints( + origin_h, origin_w, box_pred_deep_copy, keypoints_pred_deep_copy) + res_array[:, :4] = res_box + res_array[:, -51:] = res_keypoints + # clip the coordinates + res_array[:, 0] = np.clip(res_array[:, 0], 0, origin_w - 1) + res_array[:, 2] = np.clip(res_array[:, 2], 0, origin_w - 1) + res_array[:, 1] = np.clip(res_array[:, 1], 0, origin_h - 1) + res_array[:, 3] = np.clip(res_array[:, 3], 0, origin_h - 1) + # Object confidence + confs = res_array[:, 4] + # Sort by the confs + res_array = res_array[np.argsort(-confs)] + # Perform non-maximum suppression + keep_res_array = [] + while res_array.shape[0]: + large_overlap = self.bbox_iou(np.expand_dims(res_array[0, :4], 0), res_array[:, :4]) > nms_thres + label_match = res_array[0, 5] == res_array[:, 5] + invalid = large_overlap & label_match + keep_res_array.append(res_array[0]) + res_array = res_array[~invalid] + + res_array = np.stack(keep_res_array, 0) if len(keep_res_array) else np.array([]) + return res_array + + +class inferThread(threading.Thread): + def __init__(self, yolov8_wrapper, image_path_batch): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + self.image_path_batch = image_path_batch + + def run(self): + batch_image_raw, use_time = 
self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch)) + for i, img_path in enumerate(self.image_path_batch): + parent, filename = os.path.split(img_path) + save_name = os.path.join('output', filename) + # Save image + + cv2.imwrite(save_name, batch_image_raw[i]) + print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000)) + + +class warmUpThread(threading.Thread): + def __init__(self, yolov8_wrapper): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros()) + print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000)) + + +if __name__ == "__main__": + # load custom plugin and engine + PLUGIN_LIBRARY = "./build/libmyplugins.so" + engine_file_path = "yolov8n-pose.engine" + + if len(sys.argv) > 1: + engine_file_path = sys.argv[1] + if len(sys.argv) > 2: + PLUGIN_LIBRARY = sys.argv[2] + + ctypes.CDLL(PLUGIN_LIBRARY) + + # load coco labels + + categories = ["person"] + + if os.path.exists('output/'): + shutil.rmtree('output/') + os.makedirs('output/') + # a YoLov8TRT instance + yolov8_wrapper = YoLov8TRT(engine_file_path) + try: + print('batch size is', yolov8_wrapper.batch_size) + + image_dir = "samples/" + image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir) + + for i in range(10): + # create a new thread to do warm_up + thread1 = warmUpThread(yolov8_wrapper) + thread1.start() + thread1.join() + for batch in image_path_batches: + # create a new thread to do inference + thread1 = inferThread(yolov8_wrapper, batch) + thread1.start() + thread1.join() + finally: + # destroy the instance + yolov8_wrapper.destroy() From 5172d155ffcd029d02046fb99e914f12a36700ba Mon Sep 17 00:00:00 2001 From: linds Date: Mon, 29 Apr 2024 11:15:51 +0800 Subject: [PATCH 24/24] yolov8 pose p6 --- yolov8/include/model.h | 4 + 
yolov8/src/model.cpp | 333 ++++++++++++++++++++++++++++++++++++++++- yolov8/yolov8_pose.cpp | 2 +- 3 files changed, 335 insertions(+), 4 deletions(-) diff --git a/yolov8/include/model.h b/yolov8/include/model.h index 6546aa54..82586da1 100644 --- a/yolov8/include/model.h +++ b/yolov8/include/model.h @@ -25,3 +25,7 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); + +nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, + int& max_channels); diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 4cc4088e..a5f7e8e5 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -1448,9 +1448,6 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe 1, 0, "model.22.dfl.conv.weight"); // det0 - std::cout << "conv15->getOutput(0)->getDimensions().d[0] : " << conv15->getOutput(0)->getDimensions().d[0] - << " (kInputH / strides[0]) * (kInputW / strides[0]) : " - << (kInputH / strides[0]) * (kInputW / strides[0]) << std::endl; auto shuffle_conv15 = cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose"); @@ -1530,3 +1527,333 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe } return serialized_model; } + +nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, + int& max_channels) { + std::map weightMap = loadWeights(wts_path); + nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); + 
/******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); + assert(data); + /******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer* conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer* conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + // 11233 + nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer* conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + // 22466 + nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer* conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + // 22466 + nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + + nvinfer1::IElementWiseLayer* conv7 
= + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(768, gw, max_channels), 3, 2, 1, "model.7"); + nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(768, gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + + nvinfer1::IElementWiseLayer* conv9 = + convBnSiLU(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.9"); + nvinfer1::IElementWiseLayer* conv10 = + C2F(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.10"); + + nvinfer1::IElementWiseLayer* conv11 = + SPPF(network, weightMap, *conv10->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.11"); + + /******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + // Head + float scale[] = {1.0, 2.0, 2.0}; // scale used for upsampling + + // P5 + nvinfer1::IResizeLayer* upsample12 = network->addResize(*conv11->getOutput(0)); + upsample12->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + upsample12->setScales(scale, 3); + nvinfer1::ITensor* concat13_inputs[] = {upsample12->getOutput(0), conv8->getOutput(0)}; + nvinfer1::IConcatenationLayer* concat13 = network->addConcatenation(concat13_inputs, 2); + nvinfer1::IElementWiseLayer* conv14 = + C2(network, weightMap, *concat13->getOutput(0), get_width(768, gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.14"); + + // P4 + nvinfer1::IResizeLayer* upsample15 = network->addResize(*conv14->getOutput(0)); + upsample15->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + upsample15->setScales(scale, 3); + 
nvinfer1::ITensor* concat16_inputs[] = {upsample15->getOutput(0), conv6->getOutput(0)}; + nvinfer1::IConcatenationLayer* concat16 = network->addConcatenation(concat16_inputs, 2); + nvinfer1::IElementWiseLayer* conv17 = + C2(network, weightMap, *concat16->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.17"); + + // P3 + nvinfer1::IResizeLayer* upsample18 = network->addResize(*conv17->getOutput(0)); + upsample18->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + upsample18->setScales(scale, 3); + nvinfer1::ITensor* concat19_inputs[] = {upsample18->getOutput(0), conv4->getOutput(0)}; + nvinfer1::IConcatenationLayer* concat19 = network->addConcatenation(concat19_inputs, 2); + nvinfer1::IElementWiseLayer* conv20 = + C2(network, weightMap, *concat19->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.20"); + + // Additional layers for P4, P5, P6 + // P4/16-medium + nvinfer1::IElementWiseLayer* conv21 = convBnSiLU(network, weightMap, *conv20->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.21"); + nvinfer1::ITensor* concat22_inputs[] = {conv21->getOutput(0), conv17->getOutput(0)}; + nvinfer1::IConcatenationLayer* concat22 = network->addConcatenation(concat22_inputs, 2); + nvinfer1::IElementWiseLayer* conv23 = + C2(network, weightMap, *concat22->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.23"); + + // P5/32-large + nvinfer1::IElementWiseLayer* conv24 = convBnSiLU(network, weightMap, *conv23->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.24"); + nvinfer1::ITensor* concat25_inputs[] = {conv24->getOutput(0), conv14->getOutput(0)}; + nvinfer1::IConcatenationLayer* concat25 = network->addConcatenation(concat25_inputs, 2); + nvinfer1::IElementWiseLayer* conv26 = + C2(network, weightMap, *concat25->getOutput(0), get_width(768, 
gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.26"); + + // P6/64-xlarge + nvinfer1::IElementWiseLayer* conv27 = convBnSiLU(network, weightMap, *conv26->getOutput(0), + get_width(768, gw, max_channels), 3, 2, 1, "model.27"); + nvinfer1::ITensor* concat28_inputs[] = {conv27->getOutput(0), conv11->getOutput(0)}; + nvinfer1::IConcatenationLayer* concat28 = network->addConcatenation(concat28_inputs, 2); + nvinfer1::IElementWiseLayer* conv29 = + C2(network, weightMap, *concat28->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.29"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ + int base_in_channel = (gw == 1.25) ? 80 : 64; + int base_out_channel = (gw == 0.25) ? 
std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); + + // output0 + nvinfer1::IElementWiseLayer* conv30_cv2_0_0 = + convBnSiLU(network, weightMap, *conv20->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.0"); + nvinfer1::IElementWiseLayer* conv30_cv2_0_1 = + convBnSiLU(network, weightMap, *conv30_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.1"); + nvinfer1::IConvolutionLayer* conv30_cv2_0_2 = + network->addConvolutionNd(*conv30_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.0.2.weight"], weightMap["model.30.cv2.0.2.bias"]); + conv30_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + + conv30_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer* conv30_cv3_0_0 = + convBnSiLU(network, weightMap, *conv20->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.0"); + + nvinfer1::IElementWiseLayer* conv30_cv3_0_1 = convBnSiLU(network, weightMap, *conv30_cv3_0_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.0.1"); + nvinfer1::IConvolutionLayer* conv30_cv3_0_2 = + network->addConvolutionNd(*conv30_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.0.2.weight"], weightMap["model.30.cv3.0.2.bias"]); + conv30_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); + conv30_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor30_0[] = {conv30_cv2_0_2->getOutput(0), conv30_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat30_0 = network->addConcatenation(inputTensor30_0, 2); + + // output1 + nvinfer1::IElementWiseLayer* conv30_cv2_1_0 = + convBnSiLU(network, weightMap, *conv23->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.0"); + nvinfer1::IElementWiseLayer* conv30_cv2_1_1 = + convBnSiLU(network, weightMap, *conv30_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.1"); + nvinfer1::IConvolutionLayer* conv30_cv2_1_2 = + network->addConvolutionNd(*conv30_cv2_1_1->getOutput(0), 64, 
nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.1.2.weight"], weightMap["model.30.cv2.1.2.bias"]); + conv30_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv30_cv3_1_0 = + convBnSiLU(network, weightMap, *conv23->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.1.0"); + nvinfer1::IElementWiseLayer* conv30_cv3_1_1 = convBnSiLU(network, weightMap, *conv30_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.1.1"); + nvinfer1::IConvolutionLayer* conv30_cv3_1_2 = + network->addConvolutionNd(*conv30_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.1.2.weight"], weightMap["model.30.cv3.1.2.bias"]); + conv30_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor30_1[] = {conv30_cv2_1_2->getOutput(0), conv30_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat30_1 = network->addConcatenation(inputTensor30_1, 2); + + // output2 + nvinfer1::IElementWiseLayer* conv30_cv2_2_0 = + convBnSiLU(network, weightMap, *conv26->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.0"); + nvinfer1::IElementWiseLayer* conv30_cv2_2_1 = + convBnSiLU(network, weightMap, *conv30_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.1"); + nvinfer1::IConvolutionLayer* conv30_cv2_2_2 = + network->addConvolution(*conv30_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.2.2.weight"], weightMap["model.30.cv2.2.2.bias"]); + conv30_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv30_cv3_2_0 = + convBnSiLU(network, weightMap, *conv26->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.2.0"); + nvinfer1::IElementWiseLayer* conv30_cv3_2_1 = convBnSiLU(network, weightMap, *conv30_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, 
"model.30.cv3.2.1"); + nvinfer1::IConvolutionLayer* conv30_cv3_2_2 = + network->addConvolution(*conv30_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.2.2.weight"], weightMap["model.30.cv3.2.2.bias"]); + conv30_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor30_2[] = {conv30_cv2_2_2->getOutput(0), conv30_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat30_2 = network->addConcatenation(inputTensor30_2, 2); + + // output3 + nvinfer1::IElementWiseLayer* conv30_cv2_3_0 = + convBnSiLU(network, weightMap, *conv29->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.0"); + nvinfer1::IElementWiseLayer* conv30_cv2_3_1 = + convBnSiLU(network, weightMap, *conv30_cv2_3_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.1"); + nvinfer1::IConvolutionLayer* conv30_cv2_3_2 = + network->addConvolution(*conv30_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.3.2.weight"], weightMap["model.30.cv2.3.2.bias"]); + conv30_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv30_cv3_3_0 = + convBnSiLU(network, weightMap, *conv29->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.3.0"); + nvinfer1::IElementWiseLayer* conv30_cv3_3_1 = convBnSiLU(network, weightMap, *conv30_cv3_3_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.3.1"); + nvinfer1::IConvolutionLayer* conv30_cv3_3_2 = + network->addConvolution(*conv30_cv3_3_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.3.2.weight"], weightMap["model.30.cv3.3.2.bias"]); + conv30_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor30_3[] = {conv30_cv2_3_2->getOutput(0), conv30_cv3_3_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat30_3 = 
network->addConcatenation(inputTensor30_3, 2); + + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7, conv9}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); + int stridesLength = sizeof(strides) / sizeof(int); + + // P3 processing steps (remains unchanged) + nvinfer1::IShuffleLayer* shuffle30_0 = + network->addShuffle(*cat30_0->getOutput(0)); // Reusing the previous cat30_0 as P3 concatenation layer + shuffle30_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer* split30_0_0 = network->addSlice( + *shuffle30_0->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_0_1 = network->addSlice( + *shuffle30_0->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_0 = + DFL(network, weightMap, *split30_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.30.dfl.conv.weight"); + + // det0 + auto shuffle_conv20 = cv4_conv_combined(network, weightMap, *conv20->getOutput(0), "model.30.cv4.0", + (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose"); + nvinfer1::ITensor* inputTensor30_dfl_0[] = {dfl30_0->getOutput(0), split30_0_1->getOutput(0), + shuffle_conv20->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat30_dfl_0 = network->addConcatenation(inputTensor30_dfl_0, 2); + + // P4 processing 
steps (remains unchanged) + nvinfer1::IShuffleLayer* shuffle30_1 = + network->addShuffle(*cat30_1->getOutput(0)); // Reusing the previous cat30_1 as P4 concatenation layer + shuffle30_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer* split30_1_0 = network->addSlice( + *shuffle30_1->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_1_1 = network->addSlice( + *shuffle30_1->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_1 = + DFL(network, weightMap, *split30_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.30.dfl.conv.weight"); + + // det1 + auto shuffle_conv23 = cv4_conv_combined(network, weightMap, *conv23->getOutput(0), "model.30.cv4.1", + (kInputH / strides[1]) * (kInputW / strides[1]), gw, "pose"); + nvinfer1::ITensor* inputTensor30_dfl_1[] = {dfl30_1->getOutput(0), split30_1_1->getOutput(0), + shuffle_conv23->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat30_dfl_1 = network->addConcatenation(inputTensor30_dfl_1, 2); + + // P5 processing steps (remains unchanged) + nvinfer1::IShuffleLayer* shuffle30_2 = + network->addShuffle(*cat30_2->getOutput(0)); // Reusing the previous cat30_2 as P5 concatenation layer + shuffle30_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer* split30_2_0 = network->addSlice( + *shuffle30_2->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_2_1 = network->addSlice( + *shuffle30_2->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / 
strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_2 = + DFL(network, weightMap, *split30_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.30.dfl.conv.weight"); + + // det2 + auto shuffle_conv26 = cv4_conv_combined(network, weightMap, *conv26->getOutput(0), "model.30.cv4.2", + (kInputH / strides[2]) * (kInputW / strides[2]), gw, "pose"); + nvinfer1::ITensor* inputTensor30_dfl_2[] = {dfl30_2->getOutput(0), split30_2_1->getOutput(0), + shuffle_conv26->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat30_dfl_2 = network->addConcatenation(inputTensor30_dfl_2, 2); + + // P6 processing steps + nvinfer1::IShuffleLayer* shuffle30_3 = network->addShuffle(*cat30_3->getOutput(0)); + shuffle30_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); + nvinfer1::ISliceLayer* split30_3_0 = network->addSlice( + *shuffle30_3->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_3_1 = network->addSlice( + *shuffle30_3->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_3 = + DFL(network, weightMap, *split30_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, + 1, 0, "model.30.dfl.conv.weight"); + + // det2 + auto shuffle_conv29 = cv4_conv_combined(network, weightMap, *conv29->getOutput(0), "model.30.cv4.3", + (kInputH / strides[3]) * (kInputW / strides[3]), gw, "pose"); + nvinfer1::ITensor* inputTensor30_dfl_3[] = {dfl30_3->getOutput(0), split30_3_1->getOutput(0), + shuffle_conv29->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat30_dfl_3 = network->addConcatenation(inputTensor30_dfl_3, 2); + + nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( + network, std::vector{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3}, + 
strides, stridesLength, false, false); + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + builder->setMaxBatchSize(kBatchSize); + config->setMaxWorkspaceSize(16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto* calibrator = + new Int8EntropyCalibrator2(1, kInputW, kInputH, "../coco_calib/", "int8calib.table", kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" << std::endl; + + delete network; + + for (auto& mem : weightMap) { + free((void*)(mem.second.values)); + } + return serialized_model; +} diff --git a/yolov8/yolov8_pose.cpp b/yolov8/yolov8_pose.cpp index 2b354e46..84d35aea 100644 --- a/yolov8/yolov8_pose.cpp +++ b/yolov8/yolov8_pose.cpp @@ -20,7 +20,7 @@ void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p IHostMemory* serialized_engine = nullptr; if (is_p == 6) { - std::cout << "p6 is not supported right nowe" << std::endl; + serialized_engine = buildEngineYolov8PoseP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } else if (is_p == 2) { std::cout << "p2 is not supported right now" << std::endl; } else {