diff --git a/yolov8/CMakeLists.txt b/yolov8/CMakeLists.txt
index a702b341..e0c3f8ea 100644
--- a/yolov8/CMakeLists.txt
+++ b/yolov8/CMakeLists.txt
@@ -57,3 +57,6 @@ target_link_libraries(yolov8_pose nvinfer cudart myplugins ${OpenCV_LIBS})
 add_executable(yolov8_cls ${PROJECT_SOURCE_DIR}/yolov8_cls.cpp ${SRCS})
 target_link_libraries(yolov8_cls nvinfer cudart myplugins ${OpenCV_LIBS})
+
+add_executable(yolov8_5u_det ${PROJECT_SOURCE_DIR}/yolov8_5u_det.cpp ${SRCS})
+target_link_libraries(yolov8_5u_det nvinfer cudart myplugins ${OpenCV_LIBS})
diff --git a/yolov8/README.md b/yolov8/README.md
index 84868bc5..8bada0d4 100644
--- a/yolov8/README.md
+++ b/yolov8/README.md
@@ -43,8 +43,16 @@ python gen_wts.py -w yolov8n.pt -o yolov8n.wts -t detect
 // For p2 model
 // download https://github.com/lindsayshuo/yolov8_p2_tensorrtx/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt (only for 10 cls p2 model)
+cd {ultralytics}/ultralytics
 python gen_wts.py -w VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt -o VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts -t detect (only for 10 cls p2 model)
 // a file 'VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts' will be generated.
+
+// For the yolov8_5u_det (YOLOv5u) model
+// download https://github.com/ultralytics/assets/releases/yolov5nu.pt
+cd {ultralytics}/ultralytics
+python gen_wts.py -w yolov5nu.pt -o yolov5nu.wts -t detect
+// a file 'yolov5nu.wts' will be generated.
+
 ```

 2. build tensorrtx/yolov8 and run
@@ -74,6 +82,11 @@ wget https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_y
 cp -r 0000008_01999_d_0000040.jpg ../images
 sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images c //cpu postprocess
 sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images g //gpu postprocess
+
+// For the yolov8_5u_det model (YOLOv5u: YOLOv5 backbones with YOLOv8's anchor-free, objectness-free split head):
+sudo ./yolov8_5u_det -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6]
+sudo ./yolov8_5u_det -d yolov5xu.engine ../images c //cpu postprocess
+sudo ./yolov8_5u_det -d yolov5xu.engine ../images g //gpu postprocess
 ```

 ### Instance Segmentation
@@ -141,6 +154,7 @@ python yolov8_det_trt.py  # Detection
 python yolov8_seg_trt.py  # Segmentation
 python yolov8_cls_trt.py  # Classification
 python yolov8_pose_trt.py  # Pose Estimation
+python yolov8_5u_det_trt.py  # yolov8_5u_det (YOLOv5u: YOLOv5 backbones with YOLOv8's anchor-free, objectness-free split head)
 ```

 # INT8 Quantization
diff --git a/yolov8/include/block.h b/yolov8/include/block.h
index 650cacda..ae8ec993 100644
--- a/yolov8/include/block.h
+++ b/yolov8/include/block.h
@@ -4,6 +4,8 @@
 #include <map>
 #include <string>
 #include "NvInfer.h"

+int calculateP(int ksize);
+
 std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);

 nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network,
@@ -18,6 +20,10 @@ nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network,
                                 std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input,
                                 int c1, int c2, int n, bool shortcut, float e, std::string lname);

+nvinfer1::IElementWiseLayer* C3(nvinfer1::INetworkDefinition* network,
+                                std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
+                                int c2, int n, bool shortcut, float e, std::string lname);
+
 nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
                                   std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                   int c1, int c2, int k, std::string lname);
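A note before the model.h changes: `calculateP`, declared above and defined in the block.cpp hunk below, is not a general autopad. Integer division by 3 happens to produce the correct "same"-style padding only for the three kernel sizes this port uses (1, 3, and 6, the last being YOLOv5's 6x6/stride-2 stem conv with padding 2). A minimal standalone self-check of that mapping; the `main` and assertions here are mine, not part of the patch:

```cpp
#include <cassert>

// Mirrors the helper added in block.cpp. Valid only for the kernel
// sizes used in this port (1, 3, 6); it is NOT a general autopad.
int calculateP(int ksize) {
    return ksize / 3;
}

int main() {
    assert(calculateP(1) == 0);  // 1x1 conv: no padding
    assert(calculateP(3) == 1);  // 3x3 conv: pad 1 ("same")
    assert(calculateP(6) == 2);  // 6x6 stride-2 stem conv: pad 2, as in YOLOv5
    return 0;
}
```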
diff --git a/yolov8/include/model.h b/yolov8/include/model.h
index 82586da1..8f30e029 100644
--- a/yolov8/include/model.h
+++ b/yolov8/include/model.h
@@ -29,3 +29,11 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe
 nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                                                nvinfer1::DataType dt, const std::string& wts_path, float& gd,
                                                float& gw, int& max_channels);
+
+nvinfer1::IHostMemory* buildEngineYolov8_5uDet(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
+                                               nvinfer1::DataType dt, const std::string& wts_path, float& gd,
+                                               float& gw, int& max_channels);
+
+nvinfer1::IHostMemory* buildEngineYolov8_5uDetP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
+                                                 nvinfer1::DataType dt, const std::string& wts_path, float& gd,
+                                                 float& gw, int& max_channels);
diff --git a/yolov8/src/block.cpp b/yolov8/src/block.cpp
index 75d280e7..caf395f4 100644
--- a/yolov8/src/block.cpp
+++ b/yolov8/src/block.cpp
@@ -6,12 +6,18 @@
 #include "config.h"
 #include "yololayer.h"

+int calculateP(int ksize) {
+    return ksize / 3;
+}
+
 std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file) {
     std::cout << "Loading weights: " << file << std::endl;
     std::map<std::string, nvinfer1::Weights> WeightMap;

     std::ifstream input(file);
-    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");
+    assert(input.is_open() &&
+           "Unable to load weight file. Please check that "
+           "the .wts file path is correct!");

     int32_t count;
     input >> count;
@@ -103,6 +109,20 @@ nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network,
 
+nvinfer1::ILayer* bottleneck_c3(nvinfer1::INetworkDefinition* network,
+                                std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input,
+                                int c1, int c2, bool shortcut, float e, std::string lname) {
+    nvinfer1::IElementWiseLayer* cv1 =
+            convBnSiLU(network, weightMap, input, (int)((float)c2 * e), 1, 1, calculateP(1), lname + ".cv1");
+    nvinfer1::IElementWiseLayer* cv2 =
+            convBnSiLU(network, weightMap, *cv1->getOutput(0), c2, 3, 1, calculateP(3), lname + ".cv2");
+    if (shortcut && c1 == c2) {
+        auto ew = network->addElementWise(input, *cv2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM);
+        return ew;
+    }
+    return cv2;
+}
+
 nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network,
                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                  int c1, int c2, int n, bool shortcut, float e, std::string lname) {
@@ -173,6 +193,24 @@ nvinfer1::IElementWiseLayer* C2(nvinfer1::INetworkDefinition* network,
     return conv2;
 }

+nvinfer1::IElementWiseLayer* C3(nvinfer1::INetworkDefinition* network,
+                                std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input, int c1,
+                                int c2, int n, bool shortcut, float e, std::string lname) {
+    int c_ = (float)c2 * e;
+    nvinfer1::IElementWiseLayer* cv1 = convBnSiLU(network, weightMap, input, c_, 1, 1, calculateP(1), lname + ".cv1");
+    nvinfer1::IElementWiseLayer* cv2 = convBnSiLU(network, weightMap, input, c_, 1, 1, calculateP(1), lname + ".cv2");
+    nvinfer1::ITensor* y1 = cv1->getOutput(0);
+    for (int i = 0; i < n; i++) {
+        auto b = bottleneck_c3(network, weightMap, *y1, c_, c_, shortcut, 1.0, lname + ".m." + std::to_string(i));
+        y1 = b->getOutput(0);
+    }
+
+    nvinfer1::ITensor* inputTensors[] = {y1, cv2->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensors, 2);
+    nvinfer1::IElementWiseLayer* conv3 =
+            convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, calculateP(1), lname + ".cv3");
+    return conv3;
+}
+
 nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network,
                                   std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor& input,
                                   int c1, int c2, int k, std::string lname) {
@@ -236,7 +274,8 @@ nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition* network,
     combinedInfo[6] = is_segmentation;
     combinedInfo[7] = is_pose;
-    // Copy the contents of px_arry into the combinedInfo vector after the initial 5 elements.
+    // Copy the contents of px_arry into the combinedInfo vector after the
+    // initial netinfo_count elements.
     std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count);

     // Now let's create the PluginField object to hold this combined information.
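Two helpers worth flagging here: `C3` above rebuilds YOLOv5's CSP bottleneck block (two 1x1 projections, `n` bottlenecks on one branch, concat, 1x1 fuse), and `get_width_5u` in the model.cpp hunk that follows rounds scaled channel counts up to a multiple of 8 but, unlike the existing `get_width`, applies no `max_channels` cap. A standalone sketch of that rounding; the example values below are mine, not part of the patch:

```cpp
#include <cmath>
#include <cstdio>

// Same rounding as the get_width_5u added in model.cpp: scale the nominal
// channel count by gw, then round up to a multiple of `divisor` (8).
static int get_width_5u(int x, float gw, int divisor = 8) {
    return int(std::ceil((x * gw) / divisor)) * divisor;
}

int main() {
    // gw = 0.25 corresponds to the "n" models, gw = 1.25 to "x".
    std::printf("%d\n", get_width_5u(64, 0.25f));    // 16  (yolov5nu stem width)
    std::printf("%d\n", get_width_5u(1024, 0.25f));  // 256
    std::printf("%d\n", get_width_5u(1024, 1.25f));  // 1280
    return 0;
}
```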
diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp
index 9901ab32..8fb524e7 100644
--- a/yolov8/src/model.cpp
+++ b/yolov8/src/model.cpp
@@ -6,6 +6,10 @@
 #include "config.h"
 #include "model.h"

+static int get_width_5u(int x, float gw, int divisor = 8) {
+    return int(ceil((x * gw) / divisor)) * divisor;
+}
+
 static int get_width(int x, float gw, int max_channels, int divisor = 8) {
     auto channel = int(ceil((x * gw) / divisor)) * divisor;
     return channel >= max_channels ? max_channels : channel;
@@ -94,14 +98,16 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer
     nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);

     /*******************************************************************************************************
-    ****************************************** YOLOV8 INPUT **********************************************
-    *******************************************************************************************************/
+    ****************************************** YOLOV8 INPUT
+    ***********************************************
+    *******************************************************************************************************/
     nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW});
     assert(data);

     /*******************************************************************************************************
-    ***************************************** YOLOV8 BACKBONE ********************************************
-    *******************************************************************************************************/
+    ***************************************** YOLOV8 BACKBONE
+    *********************************************
+    *******************************************************************************************************/
     nvinfer1::IElementWiseLayer* conv0 =
             convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0");
     nvinfer1::IElementWiseLayer* conv1 =
@@ -129,8 +135,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer
         SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels),
              get_width(1024, gw, max_channels), 5, "model.9");
     /*******************************************************************************************************
-    ********************************************* YOLOV8 HEAD ********************************************
-    
*******************************************************************************************************/ + ********************************************* YOLOV8 HEAD + ********************************************* + *******************************************************************************************************/ float scale[] = {1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0)); assert(upsample10); @@ -171,8 +178,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); /******************************************************************************************************* - ********************************************* YOLOV8 OUTPUT ****************************************** - *******************************************************************************************************/ + ********************************************* YOLOV8 OUTPUT + ******************************************* + *******************************************************************************************************/ int base_in_channel = (gw == 1.25) ? 80 : 64; int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); @@ -239,8 +247,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer nvinfer1::IConcatenationLayer* cat22_2 = network->addConcatenation(inputTensor22_2, 2); /******************************************************************************************************* - ********************************************* YOLOV8 DETECT ****************************************** - *******************************************************************************************************/ + ********************************************* YOLOV8 DETECT + ******************************************* + *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; @@ -328,13 +337,15 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); /******************************************************************************************************* - ****************************************** YOLOV8 INPUT ********************************************** - *******************************************************************************************************/ + ****************************************** YOLOV8 INPUT + *********************************************** + *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); assert(data); /******************************************************************************************************* - ***************************************** YOLOV8 BACKBONE ******************************************** - *******************************************************************************************************/ + ***************************************** YOLOV8 BACKBONE + ********************************************* + 
*******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); nvinfer1::IElementWiseLayer* conv1 = @@ -369,8 +380,9 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf get_width(1024, gw, max_channels), 5, "model.11"); /******************************************************************************************************* - ********************************************* YOLOV8 HEAD ******************************************** - *******************************************************************************************************/ + ********************************************* YOLOV8 HEAD + ********************************************* + *******************************************************************************************************/ // Head float scale[] = {1.0, 2.0, 2.0}; // scale used for upsampling @@ -433,8 +445,9 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.29"); /******************************************************************************************************* - ********************************************* YOLOV8 OUTPUT ****************************************** - *******************************************************************************************************/ + ********************************************* YOLOV8 OUTPUT + ******************************************* + *******************************************************************************************************/ int base_in_channel = (gw == 1.25) ? 80 : 64; int base_out_channel = (gw == 0.25) ? 
std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); @@ -530,8 +543,9 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf nvinfer1::IConcatenationLayer* cat30_3 = network->addConcatenation(inputTensor30_3, 2); /******************************************************************************************************* - ********************************************* YOLOV8 DETECT ****************************************** - *******************************************************************************************************/ + ********************************************* YOLOV8 DETECT + ******************************************* + *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7, conv9}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); @@ -639,14 +653,16 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); /******************************************************************************************************* - ****************************************** YOLOV8 INPUT ********************************************** - *******************************************************************************************************/ + ****************************************** YOLOV8 INPUT + *********************************************** + *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); assert(data); /******************************************************************************************************* - ***************************************** YOLOV8 BACKBONE ******************************************** - *******************************************************************************************************/ + ***************************************** YOLOV8 BACKBONE + ********************************************* + *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); nvinfer1::IElementWiseLayer* conv1 = @@ -675,19 +691,21 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf get_width(1024, gw, max_channels), 5, "model.9"); /******************************************************************************************************* - ********************************************* YOLOV8 HEAD ******************************************** - *******************************************************************************************************/ + ********************************************* YOLOV8 HEAD + ********************************************* + *******************************************************************************************************/ // Head float scale[] = {1.0, 2.0, 2.0}; // scale used for upsampling // P4 - nvinfer1::IResizeLayer* upsample10 = network->addResize( - *conv9->getOutput(0)); // Assuming conv9 is the last layer of the backbone as per P5 in your first section. 
+ nvinfer1::IResizeLayer* upsample10 = + network->addResize(*conv9->getOutput(0)); // Assuming conv9 is the last layer of the backbone + // as per P5 in your first section. upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST); upsample10->setScales(scale, 3); - nvinfer1::ITensor* concat11_inputs[] = { - upsample10->getOutput(0), - conv6->getOutput(0)}; // Assuming conv6 corresponds to "backbone P4" as per your pseudocode + nvinfer1::ITensor* concat11_inputs[] = {upsample10->getOutput(0), + conv6->getOutput(0)}; // Assuming conv6 corresponds to "backbone P4" as + // per your pseudocode nvinfer1::IConcatenationLayer* concat11 = network->addConcatenation(concat11_inputs, 2); nvinfer1::IElementWiseLayer* conv12 = C2F(network, weightMap, *concat11->getOutput(0), get_width(512, gw, max_channels), @@ -747,8 +765,9 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.27"); /******************************************************************************************************* - ********************************************* YOLOV8 OUTPUT ****************************************** - *******************************************************************************************************/ + ********************************************* YOLOV8 OUTPUT + ******************************************* + *******************************************************************************************************/ int base_in_channel = 64; int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(128, gw, max_channels); @@ -841,8 +860,9 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf nvinfer1::IConcatenationLayer* cat28_3 = network->addConcatenation(inputTensor28_3, 2); /******************************************************************************************************* - ********************************************* YOLOV8 DETECT ****************************************** - *******************************************************************************************************/ + ********************************************* YOLOV8 DETECT + ******************************************* + *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv1, conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; @@ -947,11 +967,13 @@ nvinfer1::IHostMemory* buildEngineYolov8Cls(nvinfer1::IBuilder* builder, nvinfer std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); int max_channels = 1280; - // ****************************************** YOLOV8 INPUT ********************************************** + // ****************************************** YOLOV8 INPUT + // ********************************************** nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kClsInputH, kClsInputW}); assert(data); - // ***************************************** YOLOV8 BACKBONE ******************************************** + // ***************************************** YOLOV8 BACKBONE + // ******************************************** nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); nvinfer1::IElementWiseLayer* conv1 = @@ -976,12 +998,13 @@ nvinfer1::IHostMemory* 
buildEngineYolov8Cls(nvinfer1::IBuilder* builder, nvinfer C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); - // ********************************************* YOLOV8 HEAD ********************************************* + // ********************************************* YOLOV8 HEAD + // ********************************************* auto conv_class = convBnSiLU(network, weightMap, *conv8->getOutput(0), 1280, 1, 1, 1, "model.9.conv"); // Adjusted code - nvinfer1::Dims dims = - conv_class->getOutput(0)->getDimensions(); // Obtain the dimensions of the output of conv_class + nvinfer1::Dims dims = conv_class->getOutput(0)->getDimensions(); // Obtain the dimensions of the + // output of conv_class assert(dims.nbDims == 3); // Make sure there are exactly 3 dimensions (channels, height, width) nvinfer1::IPoolingLayer* pool2 = network->addPoolingNd(*conv_class->getOutput(0), nvinfer1::PoolingType::kAVERAGE, @@ -1034,14 +1057,16 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); /******************************************************************************************************* - ****************************************** YOLOV8 INPUT ********************************************** - *******************************************************************************************************/ + ****************************************** YOLOV8 INPUT + *********************************************** + *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); assert(data); /******************************************************************************************************* - ***************************************** YOLOV8 BACKBONE ******************************************** - *******************************************************************************************************/ + ***************************************** YOLOV8 BACKBONE + ********************************************* + *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); nvinfer1::IElementWiseLayer* conv1 = @@ -1066,8 +1091,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer get_width(1024, gw, max_channels), 5, "model.9"); /******************************************************************************************************* - ********************************************* YOLOV8 HEAD ******************************************** - *******************************************************************************************************/ + ********************************************* YOLOV8 HEAD + ********************************************* + *******************************************************************************************************/ float scale[] = {1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0)); assert(upsample10); @@ -1106,8 +1132,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); 
/******************************************************************************************************* - ********************************************* YOLOV8 OUTPUT ****************************************** - *******************************************************************************************************/ + ********************************************* YOLOV8 OUTPUT + ******************************************* + *******************************************************************************************************/ int base_in_channel = (gw == 1.25) ? 80 : 64; int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); @@ -1174,8 +1201,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer nvinfer1::IConcatenationLayer* cat22_2 = network->addConcatenation(inputTensor22_2, 2); /******************************************************************************************************* - ********************************************* YOLOV8 DETECT ****************************************** - *******************************************************************************************************/ + ********************************************* YOLOV8 DETECT + ******************************************* + *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; @@ -1282,14 +1310,16 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); /******************************************************************************************************* - ****************************************** YOLOV8 INPUT ********************************************** - *******************************************************************************************************/ + ****************************************** YOLOV8 INPUT + *********************************************** + *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); assert(data); /******************************************************************************************************* - ***************************************** YOLOV8 BACKBONE ******************************************** - *******************************************************************************************************/ + ***************************************** YOLOV8 BACKBONE + ********************************************* + *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); nvinfer1::IElementWiseLayer* conv1 = @@ -1313,8 +1343,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); /******************************************************************************************************* - ********************************************* YOLOV8 HEAD ******************************************** - 
*******************************************************************************************************/ + ********************************************* YOLOV8 HEAD + ********************************************* + *******************************************************************************************************/ float scale[] = {1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0)); assert(upsample10); @@ -1353,8 +1384,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); /******************************************************************************************************* - ********************************************* YOLOV8 OUTPUT ****************************************** - *******************************************************************************************************/ + ********************************************* YOLOV8 OUTPUT + ******************************************* + *******************************************************************************************************/ int base_in_channel = (gw == 1.25) ? 80 : 64; int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); @@ -1420,8 +1452,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe nvinfer1::ITensor* inputTensor22_2[] = {conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_2 = network->addConcatenation(inputTensor22_2, 2); /******************************************************************************************************* - ********************************************* YOLOV8 DETECT ****************************************** - *******************************************************************************************************/ + ********************************************* YOLOV8 DETECT + ******************************************* + *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; @@ -1528,13 +1561,15 @@ nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvin std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); /******************************************************************************************************* - ****************************************** YOLOV8 INPUT ********************************************** - *******************************************************************************************************/ + ****************************************** YOLOV8 INPUT + *********************************************** + *******************************************************************************************************/ nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); assert(data); /******************************************************************************************************* - ***************************************** YOLOV8 BACKBONE ******************************************** - *******************************************************************************************************/ + ***************************************** YOLOV8 BACKBONE + 
********************************************* + *******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); nvinfer1::IElementWiseLayer* conv1 = @@ -1569,8 +1604,9 @@ nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvin get_width(1024, gw, max_channels), 5, "model.11"); /******************************************************************************************************* - ********************************************* YOLOV8 HEAD ******************************************** - *******************************************************************************************************/ + ********************************************* YOLOV8 HEAD + ********************************************* + *******************************************************************************************************/ // Head float scale[] = {1.0, 2.0, 2.0}; // scale used for upsampling @@ -1633,8 +1669,9 @@ nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvin get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.29"); /******************************************************************************************************* - ********************************************* YOLOV8 OUTPUT ****************************************** - *******************************************************************************************************/ + ********************************************* YOLOV8 OUTPUT + ******************************************* + *******************************************************************************************************/ int base_in_channel = (gw == 1.25) ? 80 : 64; int base_out_channel = (gw == 0.25) ? 
std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels);
@@ -1730,8 +1767,9 @@ nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvin
     nvinfer1::IConcatenationLayer* cat30_3 = network->addConcatenation(inputTensor30_3, 2);

     /*******************************************************************************************************
-    ********************************************* YOLOV8 DETECT ******************************************
-    *******************************************************************************************************/
+    ********************************************* YOLOV8 DETECT
+    *******************************************
+    *******************************************************************************************************/
     nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7, conv9};
     int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
     calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
@@ -1851,3 +1889,598 @@ nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvin
     }
     return serialized_model;
 }
+
+nvinfer1::IHostMemory* buildEngineYolov8_5uDet(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
+                                               nvinfer1::DataType dt, const std::string& wts_path, float& gd,
+                                               float& gw, int& max_channels) {
+    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
+    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);
+
+    /*******************************************************************************************************
+    ****************************************** YOLOV5U INPUT *********************************************
+    *******************************************************************************************************/
+    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW});
+    assert(data);
+
+    /*******************************************************************************************************
+    ***************************************** YOLOV5U BACKBONE *******************************************
+    *******************************************************************************************************/
+    nvinfer1::IElementWiseLayer* conv0 =
+            convBnSiLU(network, weightMap, *data, get_width_5u(64, gw), 6, 2, calculateP(6), "model.0");
+    nvinfer1::IElementWiseLayer* conv1 =
+            convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width_5u(128, gw), 3, 2, calculateP(3), "model.1");
+    // 11233
+    nvinfer1::IElementWiseLayer* conv2 = C3(network, weightMap, *conv1->getOutput(0), get_width_5u(128, gw),
+                                            get_width_5u(128, gw), get_depth(3, gd), true, 0.5, "model.2");
+    nvinfer1::IElementWiseLayer* conv3 =
+            convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width_5u(256, gw), 3, 2, calculateP(3), "model.3");
+    // 22466
+    nvinfer1::IElementWiseLayer* conv4 = C3(network, weightMap, *conv3->getOutput(0), get_width_5u(256, gw),
+                                            get_width_5u(256, gw), get_depth(6, gd), true, 0.5, "model.4");
+    nvinfer1::IElementWiseLayer* conv5 =
+            convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width_5u(512, gw), 3, 2, calculateP(3), "model.5");
+    // 22466
+    nvinfer1::IElementWiseLayer* conv6 = C3(network, weightMap, *conv5->getOutput(0), get_width_5u(512, gw),
+                                            get_width_5u(512, gw), get_depth(6, gd), true, 0.5, "model.6");
+    nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width_5u(1024, gw),
+                                                    3, 2, calculateP(3), "model.7");
+    // 11233
+    nvinfer1::IElementWiseLayer* conv8 = C3(network, weightMap, *conv7->getOutput(0), get_width_5u(1024, gw),
+                                            get_width_5u(1024, gw), get_depth(3, gd), true, 0.5, "model.8");
+    nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width_5u(1024, gw),
+                                              get_width_5u(1024, gw), 5, "model.9");
+
+    /*******************************************************************************************************
+    ********************************************* YOLOV5U HEAD *******************************************
+    *******************************************************************************************************/
+
+    //********************************************* cat backbone P4 ***************************************
+    nvinfer1::IElementWiseLayer* conv10 = convBnSiLU(network, weightMap, *conv9->getOutput(0), get_width_5u(512, gw),
+                                                     1, 1, calculateP(1), "model.10");
+    nvinfer1::IResizeLayer* upsample11 = network->addResize(*conv10->getOutput(0));
+    assert(upsample11);
+    upsample11->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
+    upsample11->setOutputDimensions(conv6->getOutput(0)->getDimensions());
+    nvinfer1::ITensor* inputTensor12[] = {upsample11->getOutput(0), conv6->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat12 = network->addConcatenation(inputTensor12, 2);
+    nvinfer1::IElementWiseLayer* conv13 = C3(network, weightMap, *cat12->getOutput(0), get_width_5u(512, gw),
+                                             get_width_5u(512, gw), get_depth(3, gd), false, 0.5, "model.13");
+
+    //********************************************* cat backbone P3 ***************************************
+    nvinfer1::IElementWiseLayer* conv14 = convBnSiLU(network, weightMap, *conv13->getOutput(0), get_width_5u(256, gw),
+                                                     1, 1, calculateP(1), "model.14");
+    nvinfer1::IResizeLayer* upsample15 = network->addResize(*conv14->getOutput(0));
+    assert(upsample15);
+    upsample15->setResizeMode(nvinfer1::ResizeMode::kNEAREST);
+    upsample15->setOutputDimensions(conv4->getOutput(0)->getDimensions());
+    nvinfer1::ITensor* inputTensor16[] = {upsample15->getOutput(0), conv4->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat16 = network->addConcatenation(inputTensor16, 2);
+    nvinfer1::IElementWiseLayer* conv17 = C3(network, weightMap, *cat16->getOutput(0), get_width_5u(256, gw),
+                                             get_width_5u(256, gw), get_depth(3, gd), false, 0.5, "model.17");
+
+    //********************************************* cat head P4 *******************************************
+    nvinfer1::IElementWiseLayer* conv18 = convBnSiLU(network, weightMap, *conv17->getOutput(0), get_width_5u(256, gw),
+                                                     3, 2, calculateP(3), "model.18");
+    nvinfer1::ITensor* inputTensor19[] = {conv18->getOutput(0), conv14->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat19 = network->addConcatenation(inputTensor19, 2);
+    nvinfer1::IElementWiseLayer* conv20 = C3(network, weightMap, *cat19->getOutput(0), get_width_5u(512, gw),
+                                             get_width_5u(512, gw), get_depth(3, gd), false, 0.5, "model.20");
+    //********************************************* cat head P5 *******************************************
+    nvinfer1::IElementWiseLayer* conv21 = convBnSiLU(network, weightMap, *conv20->getOutput(0), get_width_5u(512, gw),
+                                                     3, 2, calculateP(3), "model.21");
+    nvinfer1::ITensor* inputTensor22[] = {conv21->getOutput(0), conv10->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat22 = network->addConcatenation(inputTensor22, 2);
+    nvinfer1::IElementWiseLayer* conv23 = C3(network, weightMap, *cat22->getOutput(0), get_width_5u(1024, gw),
+                                             get_width_5u(1024, gw), get_depth(3, gd), false, 0.5, "model.23");
+
+    /*******************************************************************************************************
+    ********************************************* YOLOV5U OUTPUT *****************************************
+    *******************************************************************************************************/
+    int base_in_channel = (gw == 1.25) ? 80 : 64;
+    int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width_5u(256, gw);
+
+    // output0
+    nvinfer1::IElementWiseLayer* conv24_cv2_0_0 = convBnSiLU(network, weightMap, *conv17->getOutput(0),
+                                                             base_in_channel, 3, 1, calculateP(3), "model.24.cv2.0.0");
+    nvinfer1::IElementWiseLayer* conv24_cv2_0_1 = convBnSiLU(network, weightMap, *conv24_cv2_0_0->getOutput(0),
+                                                             base_in_channel, 3, 1, calculateP(3), "model.24.cv2.0.1");
+    nvinfer1::IConvolutionLayer* conv24_cv2_0_2 =
+            network->addConvolutionNd(*conv24_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
+                                      weightMap["model.24.cv2.0.2.weight"], weightMap["model.24.cv2.0.2.bias"]);
+    conv24_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
+    conv24_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
+    nvinfer1::IElementWiseLayer* conv24_cv3_0_0 = convBnSiLU(network, weightMap, *conv17->getOutput(0),
+                                                             base_out_channel, 3, 1, calculateP(3), "model.24.cv3.0.0");
+    nvinfer1::IElementWiseLayer* conv24_cv3_0_1 = convBnSiLU(network, weightMap, *conv24_cv3_0_0->getOutput(0),
+                                                             base_out_channel, 3, 1, calculateP(3), "model.24.cv3.0.1");
+    nvinfer1::IConvolutionLayer* conv24_cv3_0_2 =
+            network->addConvolutionNd(*conv24_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
+                                      weightMap["model.24.cv3.0.2.weight"], weightMap["model.24.cv3.0.2.bias"]);
+    conv24_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
+    conv24_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
+    nvinfer1::ITensor* inputTensor24_0[] = {conv24_cv2_0_2->getOutput(0), conv24_cv3_0_2->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat24_0 = network->addConcatenation(inputTensor24_0, 2);
+
+    // output1
+    nvinfer1::IElementWiseLayer* conv24_cv2_1_0 = convBnSiLU(network, weightMap, *conv20->getOutput(0),
+                                                             base_in_channel, 3, 1, calculateP(3), "model.24.cv2.1.0");
+    nvinfer1::IElementWiseLayer* conv24_cv2_1_1 = convBnSiLU(network, weightMap, *conv24_cv2_1_0->getOutput(0),
+                                                             base_in_channel, 3, 1, calculateP(3), "model.24.cv2.1.1");
+    nvinfer1::IConvolutionLayer* conv24_cv2_1_2 =
+            network->addConvolutionNd(*conv24_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
+                                      weightMap["model.24.cv2.1.2.weight"], weightMap["model.24.cv2.1.2.bias"]);
+    conv24_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
+    conv24_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
+    nvinfer1::IElementWiseLayer* conv24_cv3_1_0 = convBnSiLU(network, weightMap, *conv20->getOutput(0),
+                                                             base_out_channel, 3, 1, calculateP(3), "model.24.cv3.1.0");
+    nvinfer1::IElementWiseLayer* conv24_cv3_1_1 = convBnSiLU(network, weightMap, *conv24_cv3_1_0->getOutput(0),
+                                                             base_out_channel, 3, 1, calculateP(3), "model.24.cv3.1.1");
+    nvinfer1::IConvolutionLayer* conv24_cv3_1_2 =
+            network->addConvolutionNd(*conv24_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
+                                      weightMap["model.24.cv3.1.2.weight"], weightMap["model.24.cv3.1.2.bias"]);
+    conv24_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
+    conv24_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
+    nvinfer1::ITensor* inputTensor24_1[] = {conv24_cv2_1_2->getOutput(0), conv24_cv3_1_2->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat24_1 = network->addConcatenation(inputTensor24_1, 2);
+    // output2
+    nvinfer1::IElementWiseLayer* conv24_cv2_2_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0),
+                                                             base_in_channel, 3, 1, calculateP(3), "model.24.cv2.2.0");
+    nvinfer1::IElementWiseLayer* conv24_cv2_2_1 = convBnSiLU(network, weightMap, *conv24_cv2_2_0->getOutput(0),
+                                                             base_in_channel, 3, 1, calculateP(3), "model.24.cv2.2.1");
+    nvinfer1::IConvolutionLayer* conv24_cv2_2_2 =
+            network->addConvolutionNd(*conv24_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
+                                      weightMap["model.24.cv2.2.2.weight"], weightMap["model.24.cv2.2.2.bias"]);
+    conv24_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
+    conv24_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
+    nvinfer1::IElementWiseLayer* conv24_cv3_2_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0),
+                                                             base_out_channel, 3, 1, calculateP(3), "model.24.cv3.2.0");
+    nvinfer1::IElementWiseLayer* conv24_cv3_2_1 = convBnSiLU(network, weightMap, *conv24_cv3_2_0->getOutput(0),
+                                                             base_out_channel, 3, 1, calculateP(3), "model.24.cv3.2.1");
+    nvinfer1::IConvolutionLayer* conv24_cv3_2_2 =
+            network->addConvolutionNd(*conv24_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
+                                      weightMap["model.24.cv3.2.2.weight"], weightMap["model.24.cv3.2.2.bias"]);
+    conv24_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
+    conv24_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
+    nvinfer1::ITensor* inputTensor24_2[] = {conv24_cv2_2_2->getOutput(0), conv24_cv3_2_2->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat24_2 = network->addConcatenation(inputTensor24_2, 2);
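A decoding note before the DETECT wiring below: the 64 box channels each cv2 branch emits are 4 box sides x reg_max = 16 distribution bins, and the DFL shuffle collapses each 16-bin distribution into one scalar edge offset via a softmax-weighted expectation (the fixed 1x1 conv loaded from "model.24.dfl.conv.weight" holds the bin indices 0..15). A standalone sketch of that reduction; the function name and toy input are mine, not part of the patch:

```cpp
#include <cmath>
#include <cstdio>

// Softmax expectation over reg_max = 16 bins -> one box-edge offset,
// the same reduction the DFL 1x1 conv performs inside the engine.
float dfl_decode(const float logits[16]) {
    float maxv = logits[0], sum = 0.f, expect = 0.f;
    for (int i = 1; i < 16; ++i) maxv = std::fmax(maxv, logits[i]);
    float p[16];
    for (int i = 0; i < 16; ++i) { p[i] = std::exp(logits[i] - maxv); sum += p[i]; }
    for (int i = 0; i < 16; ++i) expect += i * (p[i] / sum);  // weights are the bin indices 0..15
    return expect;  // offset in stride units for one of the 4 box sides
}

int main() {
    float logits[16] = {0};
    logits[3] = 8.f;                          // distribution sharply peaked at bin 3
    std::printf("%.3f\n", dfl_decode(logits));  // ~3.0
    return 0;
}
```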
+    /*******************************************************************************************************
+    ********************************************* YOLOV5U DETECT *****************************************
+    *******************************************************************************************************/
+    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7};
+    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
+    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
+    int stridesLength = sizeof(strides) / sizeof(int);
+
+    // det0
+    nvinfer1::IShuffleLayer* shuffle24_0 = network->addShuffle(*cat24_0->getOutput(0));
+    shuffle24_0->setReshapeDimensions(
+            nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});
+    nvinfer1::ISliceLayer* split24_0_0 = network->addSlice(
+            *shuffle24_0->getOutput(0), nvinfer1::Dims2{0, 0},
+            nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1});
+    nvinfer1::ISliceLayer* split24_0_1 = network->addSlice(
+            *shuffle24_0->getOutput(0), nvinfer1::Dims2{64, 0},
+            nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1});
+    nvinfer1::IShuffleLayer* dfl24_0 =
+            DFL(network, weightMap, *split24_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]),
+                1, 1, 0, "model.24.dfl.conv.weight");
+    nvinfer1::ITensor* inputTensor24_dfl_0[] = {dfl24_0->getOutput(0), split24_0_1->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat24_dfl_0 = network->addConcatenation(inputTensor24_dfl_0, 2);
+
+    // det1
+    nvinfer1::IShuffleLayer* shuffle24_1 = network->addShuffle(*cat24_1->getOutput(0));
+    shuffle24_1->setReshapeDimensions(
+            nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
+    nvinfer1::ISliceLayer* split24_1_0 = network->addSlice(
+            *shuffle24_1->getOutput(0), nvinfer1::Dims2{0, 0},
+            nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1});
+    nvinfer1::ISliceLayer* split24_1_1 = network->addSlice(
+            *shuffle24_1->getOutput(0), nvinfer1::Dims2{64, 0},
+            nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1});
+    nvinfer1::IShuffleLayer* dfl24_1 =
+            DFL(network, weightMap, *split24_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]),
+                1, 1, 0, "model.24.dfl.conv.weight");
+    nvinfer1::ITensor* inputTensor24_dfl_1[] = {dfl24_1->getOutput(0), split24_1_1->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat24_dfl_1 = network->addConcatenation(inputTensor24_dfl_1, 2);
+
+    // det2
+    nvinfer1::IShuffleLayer* shuffle24_2 = network->addShuffle(*cat24_2->getOutput(0));
+    shuffle24_2->setReshapeDimensions(
+            nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
+    nvinfer1::ISliceLayer* split24_2_0 = network->addSlice(
+            *shuffle24_2->getOutput(0), nvinfer1::Dims2{0, 0},
+            nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1});
+    nvinfer1::ISliceLayer* split24_2_1 = network->addSlice(
+            *shuffle24_2->getOutput(0), nvinfer1::Dims2{64, 0},
+            nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1});
+    nvinfer1::IShuffleLayer* dfl24_2 =
+            DFL(network, weightMap, *split24_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]),
+                1, 1, 0, "model.24.dfl.conv.weight");
+    nvinfer1::ITensor* inputTensor24_dfl_2[] = {dfl24_2->getOutput(0), split24_2_1->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat24_dfl_2 = network->addConcatenation(inputTensor24_dfl_2, 2);
+
+    nvinfer1::IPluginV2Layer* yolo =
+            addYoLoLayer(network, std::vector<nvinfer1::IConcatenationLayer*>{cat24_dfl_0, cat24_dfl_1, cat24_dfl_2},
+                         strides, stridesLength, false, false);
+
+    yolo->getOutput(0)->setName(kOutputTensorName);
+    network->markOutput(*yolo->getOutput(0));
+
+    builder->setMaxBatchSize(kBatchSize);
+    config->setMaxWorkspaceSize(16 * (1 << 20));
+
+#if defined(USE_FP16)
+    config->setFlag(nvinfer1::BuilderFlag::kFP16);
+#elif defined(USE_INT8)
+    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
+    assert(builder->platformHasFastInt8());
+    config->setFlag(nvinfer1::BuilderFlag::kINT8);
+    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
+                                                  kInputTensorName);
+    config->setInt8Calibrator(calibrator);
+#endif
+
+    std::cout << "Building engine, please wait for a while..." << std::endl;
+    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
+    std::cout << "Build engine successfully!" << std::endl;
+
+    delete network;
+
+    for (auto& mem : weightMap) {
+        free((void*)(mem.second.values));
+    }
+    return serialized_model;
+}
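Between the P5 and P6 builders, a note on consumption: both functions return a serialized engine blob that the demo app writes to disk and later reloads through the stock TensorRT runtime. A minimal sketch of that reload path (the helper is mine, not part of the patch; error handling elided, and `logger` is whatever `ILogger` the caller already uses):

```cpp
#include <fstream>
#include <string>
#include <vector>
#include "NvInfer.h"

// Deserialize an engine previously produced by buildEngineYolov8_5uDet
// (or the P6 variant) and written to disk; returns nullptr on failure.
nvinfer1::ICudaEngine* loadEngine(const std::string& path, nvinfer1::ILogger& logger) {
    std::ifstream file(path, std::ios::binary);
    if (!file.good())
        return nullptr;
    std::vector<char> blob((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    return runtime->deserializeCudaEngine(blob.data(), blob.size());
}
```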
+
+nvinfer1::IHostMemory* buildEngineYolov8_5uDetP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
+                                                 nvinfer1::DataType dt, const std::string& wts_path, float& gd,
+                                                 float& gw, int& max_channels) {
+    std::map<std::string, nvinfer1::Weights> weightMap = loadWeights(wts_path);
+    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U);
+
+    /*******************************************************************************************************
+    ****************************************** YOLOV5U-P6 INPUT ******************************************
+    *******************************************************************************************************/
+    nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW});
+    assert(data);
+
+    /*******************************************************************************************************
+    ***************************************** YOLOV5U-P6 BACKBONE ****************************************
+    *******************************************************************************************************/
+    nvinfer1::IElementWiseLayer* conv0 =
+            convBnSiLU(network, weightMap, *data, get_width_5u(64, gw), 6, 2, calculateP(6), "model.0");
+    nvinfer1::IElementWiseLayer* conv1 =
+            convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width_5u(128, gw), 3, 2, calculateP(3), "model.1");
+    // 11233
+    nvinfer1::IElementWiseLayer* conv2 = C3(network, weightMap, *conv1->getOutput(0), get_width_5u(128, gw),
+                                            get_width_5u(128, gw), get_depth(3, gd), true, 0.5, "model.2");
+    nvinfer1::IElementWiseLayer* conv3 =
+            convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width_5u(256, gw), 3, 2, calculateP(3), "model.3");
+    // 22466
+    nvinfer1::IElementWiseLayer* conv4 = C3(network, weightMap, *conv3->getOutput(0), get_width_5u(256, gw),
+                                            get_width_5u(256, gw), get_depth(6, gd), true, 0.5, "model.4");
+    nvinfer1::IElementWiseLayer* conv5 =
+            convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width_5u(512, gw), 3, 2, calculateP(3), "model.5");
+    // 22466
+    nvinfer1::IElementWiseLayer* conv6 = C3(network, weightMap, *conv5->getOutput(0), get_width_5u(512, gw),
+                                            get_width_5u(512, gw), get_depth(6, gd), true, 0.5, "model.6");
+    nvinfer1::IElementWiseLayer* conv7 =
+            convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width_5u(768, gw), 3, 2, calculateP(3), "model.7");
+    // 11233
+    nvinfer1::IElementWiseLayer* conv8 = C3(network, weightMap, *conv7->getOutput(0), get_width_5u(768, gw),
+                                            get_width_5u(768, gw), get_depth(3, gd), true, 0.5, "model.8");
+    nvinfer1::IElementWiseLayer* conv9 = convBnSiLU(network, weightMap, *conv8->getOutput(0), get_width_5u(1024, gw),
+                                                    3, 2, calculateP(3), "model.9");
+    // 11233
+    nvinfer1::IElementWiseLayer* conv10 = C3(network, weightMap, *conv9->getOutput(0), get_width_5u(1024, gw),
+                                             get_width_5u(1024, gw), get_depth(3, gd), true, 0.5, "model.10");
+    nvinfer1::IElementWiseLayer* conv11 = SPPF(network, weightMap, *conv10->getOutput(0), get_width_5u(1024, gw),
+                                               get_width_5u(1024, gw), 5, "model.11");
+
+    /*******************************************************************************************************
+    ********************************************* YOLOV5U-P6 HEAD ****************************************
+    *******************************************************************************************************/
+
+    
//********************************************* cat backbone P5 + //******************************************** + nvinfer1::IElementWiseLayer* conv12 = convBnSiLU(network, weightMap, *conv11->getOutput(0), get_width_5u(768, gw), + 1, 1, calculateP(1), "model.12"); + nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0)); + assert(upsample13); + upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + upsample13->setOutputDimensions(conv8->getOutput(0)->getDimensions()); + nvinfer1::ITensor* inputTensor14[] = {upsample13->getOutput(0), conv8->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat14 = network->addConcatenation(inputTensor14, 2); + nvinfer1::IElementWiseLayer* conv15 = C3(network, weightMap, *cat14->getOutput(0), get_width_5u(768, gw), + get_width_5u(768, gw), get_depth(3, gd), false, 0.5, "model.15"); + //********************************************* cat backbone P5 + //******************************************** + + //********************************************* cat backbone P4 + //******************************************** + nvinfer1::IElementWiseLayer* conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), get_width_5u(512, gw), + 1, 1, calculateP(1), "model.16"); + nvinfer1::IResizeLayer* upsample17 = network->addResize(*conv16->getOutput(0)); + assert(upsample17); + upsample17->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + upsample17->setOutputDimensions(conv6->getOutput(0)->getDimensions()); + nvinfer1::ITensor* inputTensor18[] = {upsample17->getOutput(0), conv6->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat18 = network->addConcatenation(inputTensor18, 2); + nvinfer1::IElementWiseLayer* conv19 = C3(network, weightMap, *cat18->getOutput(0), get_width_5u(512, gw), + get_width_5u(512, gw), get_depth(3, gd), false, 0.5, "model.19"); + //********************************************* cat backbone P4 + //******************************************** + + //********************************************* cat backbone P3 + //******************************************** + nvinfer1::IElementWiseLayer* conv20 = convBnSiLU(network, weightMap, *conv19->getOutput(0), get_width_5u(256, gw), + 1, 1, calculateP(1), "model.20"); + nvinfer1::IResizeLayer* upsample21 = network->addResize(*conv20->getOutput(0)); + assert(upsample21); + upsample21->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + upsample21->setOutputDimensions(conv4->getOutput(0)->getDimensions()); + nvinfer1::ITensor* inputTensor22[] = {upsample21->getOutput(0), conv4->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat22 = network->addConcatenation(inputTensor22, 2); + nvinfer1::IElementWiseLayer* conv23 = C3(network, weightMap, *cat22->getOutput(0), get_width_5u(256, gw), + get_width_5u(256, gw), get_depth(3, gd), false, 0.5, "model.23"); + //********************************************* cat backbone P3 + //******************************************** + + //********************************************* cat head P4 + //******************************************** + nvinfer1::IElementWiseLayer* conv24 = convBnSiLU(network, weightMap, *conv23->getOutput(0), get_width_5u(256, gw), + 3, 2, calculateP(3), "model.24"); + nvinfer1::ITensor* inputTensor25[] = {conv24->getOutput(0), conv20->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat25 = network->addConcatenation(inputTensor25, 2); + nvinfer1::IElementWiseLayer* conv26 = C3(network, weightMap, *cat25->getOutput(0), get_width_5u(512, gw), + get_width_5u(512, gw), get_depth(3, gd), false, 0.5, "model.26"); + 
+
+    //*********************************************** cat head P5 **********************************************
+    nvinfer1::IElementWiseLayer* conv27 = convBnSiLU(network, weightMap, *conv26->getOutput(0), get_width_5u(512, gw),
+                                                     3, 2, calculateP(3), "model.27");
+    nvinfer1::ITensor* inputTensor28[] = {conv27->getOutput(0), conv16->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat28 = network->addConcatenation(inputTensor28, 2);
+    nvinfer1::IElementWiseLayer* conv29 = C3(network, weightMap, *cat28->getOutput(0), get_width_5u(768, gw),
+                                             get_width_5u(768, gw), get_depth(3, gd), false, 0.5, "model.29");
+
+    //*********************************************** cat head P6 **********************************************
+    nvinfer1::IElementWiseLayer* conv30 = convBnSiLU(network, weightMap, *conv29->getOutput(0), get_width_5u(768, gw),
+                                                     3, 2, calculateP(3), "model.30");
+    nvinfer1::ITensor* inputTensor31[] = {conv30->getOutput(0), conv12->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat31 = network->addConcatenation(inputTensor31, 2);
+    nvinfer1::IElementWiseLayer* conv32 = C3(network, weightMap, *cat31->getOutput(0), get_width_5u(768, gw),
+                                             get_width_5u(1024, gw), get_depth(3, gd), false, 0.5, "model.32");
+
+    /*******************************************************************************************************
+    ******************************************  YOLOV5U-P6 OUTPUT  *****************************************
+    *******************************************************************************************************/
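+    // Hidden widths of the decoupled detect head: base_in_channel feeds the
+    // cv2 box branch (80 only for the x scale, gw == 1.25), base_out_channel
+    // feeds the cv3 class branch (clamped for the n scale, gw == 0.25).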
+    int base_in_channel = (gw == 1.25) ? 80 : 64;
+    int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width_5u(256, gw);
+
+    // output0
+    nvinfer1::IElementWiseLayer* conv33_cv2_0_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0),
+                                                             base_in_channel, 3, 1, calculateP(3), "model.33.cv2.0.0");
+    nvinfer1::IElementWiseLayer* conv33_cv2_0_1 = convBnSiLU(network, weightMap, *conv33_cv2_0_0->getOutput(0),
+                                                             base_in_channel, 3, 1, calculateP(3), "model.33.cv2.0.1");
+    nvinfer1::IConvolutionLayer* conv33_cv2_0_2 =
+            network->addConvolutionNd(*conv33_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
+                                      weightMap["model.33.cv2.0.2.weight"], weightMap["model.33.cv2.0.2.bias"]);
+    conv33_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
+    conv33_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
+    nvinfer1::IElementWiseLayer* conv33_cv3_0_0 = convBnSiLU(network, weightMap, *conv23->getOutput(0),
+                                                             base_out_channel, 3, 1, calculateP(3), "model.33.cv3.0.0");
+    nvinfer1::IElementWiseLayer* conv33_cv3_0_1 = convBnSiLU(network, weightMap, *conv33_cv3_0_0->getOutput(0),
+                                                             base_out_channel, 3, 1, calculateP(3), "model.33.cv3.0.1");
+    nvinfer1::IConvolutionLayer* conv33_cv3_0_2 =
+            network->addConvolutionNd(*conv33_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
+                                      weightMap["model.33.cv3.0.2.weight"], weightMap["model.33.cv3.0.2.bias"]);
+    conv33_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1});
+    conv33_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
+    nvinfer1::ITensor* inputTensor33_0[] = {conv33_cv2_0_2->getOutput(0), conv33_cv3_0_2->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat33_0 = network->addConcatenation(inputTensor33_0, 2);
+
+    // output1
+    nvinfer1::IElementWiseLayer* conv33_cv2_1_0 = convBnSiLU(network, weightMap, *conv26->getOutput(0),
+                                                             base_in_channel, 3, 1, calculateP(3), "model.33.cv2.1.0");
+    nvinfer1::IElementWiseLayer* conv33_cv2_1_1 = convBnSiLU(network, weightMap, *conv33_cv2_1_0->getOutput(0),
+                                                             base_in_channel, 3, 1, calculateP(3), "model.33.cv2.1.1");
+    nvinfer1::IConvolutionLayer* conv33_cv2_1_2 =
+            network->addConvolutionNd(*conv33_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
+                                      weightMap["model.33.cv2.1.2.weight"], weightMap["model.33.cv2.1.2.bias"]);
+    conv33_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
+    conv33_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
+    nvinfer1::IElementWiseLayer* conv33_cv3_1_0 = convBnSiLU(network, weightMap, *conv26->getOutput(0),
+                                                             base_out_channel, 3, 1, calculateP(3), "model.33.cv3.1.0");
+    nvinfer1::IElementWiseLayer* conv33_cv3_1_1 = convBnSiLU(network, weightMap, *conv33_cv3_1_0->getOutput(0),
+                                                             base_out_channel, 3, 1, calculateP(3), "model.33.cv3.1.1");
+    nvinfer1::IConvolutionLayer* conv33_cv3_1_2 =
+            network->addConvolutionNd(*conv33_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
+                                      weightMap["model.33.cv3.1.2.weight"], weightMap["model.33.cv3.1.2.bias"]);
+    conv33_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1});
+    conv33_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
+    nvinfer1::ITensor* inputTensor33_1[] = {conv33_cv2_1_2->getOutput(0), conv33_cv3_1_2->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat33_1 = network->addConcatenation(inputTensor33_1, 2);
+
+    // output2
+    nvinfer1::IElementWiseLayer* conv33_cv2_2_0 = convBnSiLU(network, weightMap, *conv29->getOutput(0),
+                                                             base_in_channel, 3, 1, calculateP(3), "model.33.cv2.2.0");
+    nvinfer1::IElementWiseLayer* conv33_cv2_2_1 = convBnSiLU(network, weightMap, *conv33_cv2_2_0->getOutput(0),
+                                                             base_in_channel, 3, 1, calculateP(3), "model.33.cv2.2.1");
+    nvinfer1::IConvolutionLayer* conv33_cv2_2_2 =
+            network->addConvolutionNd(*conv33_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
+                                      weightMap["model.33.cv2.2.2.weight"], weightMap["model.33.cv2.2.2.bias"]);
+    conv33_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
+    conv33_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
+    nvinfer1::IElementWiseLayer* conv33_cv3_2_0 = convBnSiLU(network, weightMap, *conv29->getOutput(0),
+                                                             base_out_channel, 3, 1, calculateP(3), "model.33.cv3.2.0");
+    nvinfer1::IElementWiseLayer* conv33_cv3_2_1 = convBnSiLU(network, weightMap, *conv33_cv3_2_0->getOutput(0),
+                                                             base_out_channel, 3, 1, calculateP(3), "model.33.cv3.2.1");
+    nvinfer1::IConvolutionLayer* conv33_cv3_2_2 =
+            network->addConvolutionNd(*conv33_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
+                                      weightMap["model.33.cv3.2.2.weight"], weightMap["model.33.cv3.2.2.bias"]);
+    conv33_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1});
+    conv33_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
+    nvinfer1::ITensor* inputTensor33_2[] = {conv33_cv2_2_2->getOutput(0), conv33_cv3_2_2->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat33_2 = network->addConcatenation(inputTensor33_2, 2);
+
+    // output3
+    nvinfer1::IElementWiseLayer* conv33_cv2_3_0 = convBnSiLU(network, weightMap, *conv32->getOutput(0),
+                                                             base_in_channel, 3, 1, calculateP(3), "model.33.cv2.3.0");
+    nvinfer1::IElementWiseLayer* conv33_cv2_3_1 = convBnSiLU(network, weightMap, *conv33_cv2_3_0->getOutput(0),
+                                                             base_in_channel, 3, 1, calculateP(3), "model.33.cv2.3.1");
+    nvinfer1::IConvolutionLayer* conv33_cv2_3_2 =
+            network->addConvolutionNd(*conv33_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1},
+                                      weightMap["model.33.cv2.3.2.weight"], weightMap["model.33.cv2.3.2.bias"]);
+    conv33_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1});
+    conv33_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
+    nvinfer1::IElementWiseLayer* conv33_cv3_3_0 = convBnSiLU(network, weightMap, *conv32->getOutput(0),
+                                                             base_out_channel, 3, 1, calculateP(3), "model.33.cv3.3.0");
+    nvinfer1::IElementWiseLayer* conv33_cv3_3_1 = convBnSiLU(network, weightMap, *conv33_cv3_3_0->getOutput(0),
+                                                             base_out_channel, 3, 1, calculateP(3), "model.33.cv3.3.1");
+    nvinfer1::IConvolutionLayer* conv33_cv3_3_2 =
+            network->addConvolutionNd(*conv33_cv3_3_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1},
+                                      weightMap["model.33.cv3.3.2.weight"], weightMap["model.33.cv3.3.2.bias"]);
+    conv33_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1});
+    conv33_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0});
+    nvinfer1::ITensor* inputTensor33_3[] = {conv33_cv2_3_2->getOutput(0), conv33_cv3_3_2->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat33_3 = network->addConcatenation(inputTensor33_3, 2);
+
+    /*******************************************************************************************************
+    ******************************************  YOLOV5U-P6 DETECT  *****************************************
+    *******************************************************************************************************/
+    nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7, conv9};
+    int strides[sizeof(conv_layers) / sizeof(conv_layers[0])];
+    calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides);
+    int stridesLength = sizeof(strides) / sizeof(int);
+
+    // det0
+    nvinfer1::IShuffleLayer* shuffle33_0 = network->addShuffle(*cat33_0->getOutput(0));
+    shuffle33_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])});
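+    // Each flattened head is split below into a 64-row box branch (4 sides x 16
+    // DFL bins, collapsed to 4 distances by the shared model.33.dfl conv) and a
+    // kNumClass-row class-score branch; the two are re-concatenated per level.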
+    nvinfer1::ISliceLayer* split33_0_0 = network->addSlice(
+            *shuffle33_0->getOutput(0), nvinfer1::Dims2{0, 0},
+            nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1});
+    nvinfer1::ISliceLayer* split33_0_1 = network->addSlice(
+            *shuffle33_0->getOutput(0), nvinfer1::Dims2{64, 0},
+            nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1});
+    nvinfer1::IShuffleLayer* dfl33_0 =
+            DFL(network, weightMap, *split33_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1,
+                1, 0, "model.33.dfl.conv.weight");
+    nvinfer1::ITensor* inputTensor33_dfl_0[] = {dfl33_0->getOutput(0), split33_0_1->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat33_dfl_0 = network->addConcatenation(inputTensor33_dfl_0, 2);
+
+    // det1
+    nvinfer1::IShuffleLayer* shuffle33_1 = network->addShuffle(*cat33_1->getOutput(0));
+    shuffle33_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])});
+    nvinfer1::ISliceLayer* split33_1_0 = network->addSlice(
+            *shuffle33_1->getOutput(0), nvinfer1::Dims2{0, 0},
+            nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1});
+    nvinfer1::ISliceLayer* split33_1_1 = network->addSlice(
+            *shuffle33_1->getOutput(0), nvinfer1::Dims2{64, 0},
+            nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1});
+    nvinfer1::IShuffleLayer* dfl33_1 =
+            DFL(network, weightMap, *split33_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1,
+                1, 0, "model.33.dfl.conv.weight");
+    nvinfer1::ITensor* inputTensor33_dfl_1[] = {dfl33_1->getOutput(0), split33_1_1->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat33_dfl_1 = network->addConcatenation(inputTensor33_dfl_1, 2);
+
+    // det2
+    nvinfer1::IShuffleLayer* shuffle33_2 = network->addShuffle(*cat33_2->getOutput(0));
+    shuffle33_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])});
+    nvinfer1::ISliceLayer* split33_2_0 = network->addSlice(
+            *shuffle33_2->getOutput(0), nvinfer1::Dims2{0, 0},
+            nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1});
+    nvinfer1::ISliceLayer* split33_2_1 = network->addSlice(
+            *shuffle33_2->getOutput(0), nvinfer1::Dims2{64, 0},
+            nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1});
+    nvinfer1::IShuffleLayer* dfl33_2 =
+            DFL(network, weightMap, *split33_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1,
+                1, 0, "model.33.dfl.conv.weight");
+    nvinfer1::ITensor* inputTensor33_dfl_2[] = {dfl33_2->getOutput(0), split33_2_1->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat33_dfl_2 = network->addConcatenation(inputTensor33_dfl_2, 2);
+
+    // det3
+    nvinfer1::IShuffleLayer* shuffle33_3 = network->addShuffle(*cat33_3->getOutput(0));
+    shuffle33_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])});
+    nvinfer1::ISliceLayer* split33_3_0 = network->addSlice(
+            *shuffle33_3->getOutput(0), nvinfer1::Dims2{0, 0},
+            nvinfer1::Dims2{64, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1});
+    nvinfer1::ISliceLayer* split33_3_1 = network->addSlice(
+            *shuffle33_3->getOutput(0), nvinfer1::Dims2{64, 0},
+            nvinfer1::Dims2{kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1});
+    nvinfer1::IShuffleLayer* dfl33_3 =
+            DFL(network, weightMap, *split33_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1,
+                1, 0, "model.33.dfl.conv.weight");
+    nvinfer1::ITensor* inputTensor33_dfl_3[] = {dfl33_3->getOutput(0), split33_3_1->getOutput(0)};
+    nvinfer1::IConcatenationLayer* cat33_dfl_3 = network->addConcatenation(inputTensor33_dfl_3, 2);
+
+    nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(
+            network, std::vector<nvinfer1::IConcatenationLayer*>{cat33_dfl_0, cat33_dfl_1, cat33_dfl_2, cat33_dfl_3},
+            strides, stridesLength, false, false);
+    yolo->getOutput(0)->setName(kOutputTensorName);
+    network->markOutput(*yolo->getOutput(0));
+
+    builder->setMaxBatchSize(kBatchSize);
+    config->setMaxWorkspaceSize(16 * (1 << 20));
+
+#if defined(USE_FP16)
+    config->setFlag(nvinfer1::BuilderFlag::kFP16);
+#elif defined(USE_INT8)
+    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
+    assert(builder->platformHasFastInt8());
+    config->setFlag(nvinfer1::BuilderFlag::kINT8);
+    auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table",
+                                                  kInputTensorName);
+    config->setInt8Calibrator(calibrator);
+#endif
+
+    std::cout << "Building engine, please wait for a while..." << std::endl;
+    nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
+    std::cout << "Build engine successfully!" << std::endl;
+
+    delete network;
+
+    for (auto& mem : weightMap) {
+        free((void*)(mem.second.values));
+    }
+    return serialized_model;
+}
diff --git a/yolov8/yolov8_5u_det.cpp b/yolov8/yolov8_5u_det.cpp
new file mode 100644
index 00000000..6813c4d7
--- /dev/null
+++ b/yolov8/yolov8_5u_det.cpp
@@ -0,0 +1,278 @@
+
+#include <fstream>
+#include <iostream>
+#include <opencv2/opencv.hpp>
+#include "cuda_utils.h"
+#include "logging.h"
+#include "model.h"
+#include "postprocess.h"
+#include "preprocess.h"
+#include "utils.h"
+
+Logger gLogger;
+using namespace nvinfer1;
+const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
+
+void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd,
+                      float& gw, int& max_channels) {
+    IBuilder* builder = createInferBuilder(gLogger);
+    IBuilderConfig* config = builder->createBuilderConfig();
+    IHostMemory* serialized_engine = nullptr;
+
+    if (is_p == 6) {
+        serialized_engine =
+                buildEngineYolov8_5uDetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
+    } else {
+        serialized_engine = buildEngineYolov8_5uDet(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels);
+    }
+
+    assert(serialized_engine);
+    std::ofstream p(engine_name, std::ios::binary);
+    if (!p) {
+        std::cout << "could not open plan output file" << std::endl;
+        assert(false);
+    }
+    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());
+
+    delete serialized_engine;
+    delete config;
+    delete builder;
+}
+
+void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
+                        IExecutionContext** context) {
+    std::ifstream file(engine_name, std::ios::binary);
+    if (!file.good()) {
+        std::cerr << "read " << engine_name << " error!" << std::endl;
+        assert(false);
+    }
+    size_t size = 0;
+    file.seekg(0, file.end);
+    size = file.tellg();
+    file.seekg(0, file.beg);
+    char* serialized_engine = new char[size];
+    assert(serialized_engine);
+    file.read(serialized_engine, size);
+    file.close();
+
+    *runtime = createInferRuntime(gLogger);
+    assert(*runtime);
+    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
+    assert(*engine);
+    *context = (*engine)->createExecutionContext();
+    assert(*context);
+    delete[] serialized_engine;
+}
+
+void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device,
+                    float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device,
+                    std::string cuda_post_process) {
+    assert(engine->getNbBindings() == 2);
+    // In order to bind the buffers, we need to know the names of the input and
+    // output tensors. Note that indices are guaranteed to be less than
+    // IEngine::getNbBindings()
+    const int inputIndex = engine->getBindingIndex(kInputTensorName);
+    const int outputIndex = engine->getBindingIndex(kOutputTensorName);
+    assert(inputIndex == 0);
+    assert(outputIndex == 1);
+    // Create GPU buffers on device
+    CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
+    CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float)));
+    if (cuda_post_process == "c") {
+        *output_buffer_host = new float[kBatchSize * kOutputSize];
+    } else if (cuda_post_process == "g") {
+        if (kBatchSize > 1) {
+            std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl;
+            exit(0);
+        }
+        // Allocate memory for decode_ptr_host and copy to device
+        *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element];
+        CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element)));
+    }
+}
+
+void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize,
+           float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) {
+    // infer on the batch asynchronously, and DMA output back to host
+    auto start = std::chrono::system_clock::now();
+    context.enqueue(batchsize, buffers, stream, nullptr);
+    if (cuda_post_process == "c") {
+        CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
+                                   stream));
+        auto end = std::chrono::system_clock::now();
+        std::cout << "inference time: "
+                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
+    } else if (cuda_post_process == "g") {
+        CUDA_CHECK(
+                cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream));
+        cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream);
+        cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);  // cuda nms
+        CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device,
+                                   sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost,
+                                   stream));
+        auto end = std::chrono::system_clock::now();
+        std::cout << "inference and gpu postprocess time: "
+                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
+    }
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+}
+
+bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir,
+                std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) {
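+    // -s <.wts> <.engine> <sub_type>: serialize; sub_type picks the depth/width
+    //    multiples (n/s/m/l/x, optionally suffixed with 6 for the P6 variants).
+    // -d <.engine> <image_dir> <c|g>: deserialize and run CPU or GPU postprocess.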
+    if (argc < 4)
+        return false;
+    if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) {
+        wts = std::string(argv[2]);
+        engine = std::string(argv[3]);
+        sub_type = std::string(argv[4]);
+
+        if (sub_type[0] == 'n') {
+            gd = 0.33;
+            gw = 0.25;
+            max_channels = 1024;
+        } else if (sub_type[0] == 's') {
+            gd = 0.33;
+            gw = 0.50;
+            max_channels = 1024;
+        } else if (sub_type[0] == 'm') {
+            gd = 0.67;
+            gw = 0.75;
+            max_channels = 576;
+        } else if (sub_type[0] == 'l') {
+            gd = 1.0;
+            gw = 1.0;
+            max_channels = 512;
+        } else if (sub_type[0] == 'x') {
+            gd = 1.33;
+            gw = 1.25;
+            max_channels = 640;
+        } else {
+            return false;
+        }
+        if (sub_type.size() == 2 && sub_type[1] == '6') {
+            is_p = 6;
+        }
+    } else if (std::string(argv[1]) == "-d" && argc == 5) {
+        engine = std::string(argv[2]);
+        img_dir = std::string(argv[3]);
+        cuda_post_process = std::string(argv[4]);
+    } else {
+        return false;
+    }
+    return true;
+}
+
+int main(int argc, char** argv) {
+    cudaSetDevice(kGpuId);
+    std::string wts_name = "";
+    std::string engine_name = "";
+    std::string img_dir;
+    std::string sub_type = "";
+    std::string cuda_post_process = "";
+    int model_bboxes;
+    int is_p = 0;
+    float gd = 0.0f, gw = 0.0f;
+    int max_channels = 0;
+
+    if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw,
+                    max_channels)) {
+        std::cerr << "Arguments not right!" << std::endl;
+        std::cerr << "./yolov8_5u_det -s [.wts] [.engine] "
+                     "[n/s/m/l/x/n6/s6/m6/l6/x6]  // serialize model to "
+                     "plan file"
+                  << std::endl;
+        std::cerr << "./yolov8_5u_det -d [.engine] ../samples [c/g]  // deserialize "
+                     "plan file and run inference"
+                  << std::endl;
+        return -1;
+    }
+
+    // Create a model using the API directly and serialize it to a file
+    if (!wts_name.empty()) {
+        serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
+        return 0;
+    }
+
+    // Deserialize the engine from file
+    IRuntime* runtime = nullptr;
+    ICudaEngine* engine = nullptr;
+    IExecutionContext* context = nullptr;
+    deserialize_engine(engine_name, &runtime, &engine, &context);
+    cudaStream_t stream;
+    CUDA_CHECK(cudaStreamCreate(&stream));
+    cuda_preprocess_init(kMaxInputImageSize);
+    auto out_dims = engine->getBindingDimensions(1);
+    model_bboxes = out_dims.d[0];
+    // Prepare cpu and gpu buffers
+    float* device_buffers[2];
+    float* output_buffer_host = nullptr;
+    float* decode_ptr_host = nullptr;
+    float* decode_ptr_device = nullptr;
+
+    // Read images from directory
+    std::vector<std::string> file_names;
+    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
+        std::cerr << "read_files_in_dir failed." << std::endl;
+        return -1;
+    }
+
+    prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host,
+                   &decode_ptr_device, cuda_post_process);
+
+    // batch predict
+    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
+        // Get a batch of images
+        std::vector<cv::Mat> img_batch;
+        std::vector<std::string> img_name_batch;
+        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
+            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
+            img_batch.push_back(img);
+            img_name_batch.push_back(file_names[j]);
+        }
+        // Preprocess
+        cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
+        // Run inference
+        infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host,
+              decode_ptr_device, model_bboxes, cuda_post_process);
+        std::vector<std::vector<Detection>> res_batch;
+        if (cuda_post_process == "c") {
+            // NMS
+            batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
+        } else if (cuda_post_process == "g") {
+            // Process gpu decode and nms results
+            batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
+        }
+        // Draw bounding boxes
+        draw_bbox(img_batch, res_batch);
+        // Save images
+        for (size_t j = 0; j < img_batch.size(); j++) {
+            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
+        }
+    }
+
+    // Release stream and buffers
+    cudaStreamDestroy(stream);
+    CUDA_CHECK(cudaFree(device_buffers[0]));
+    CUDA_CHECK(cudaFree(device_buffers[1]));
+    CUDA_CHECK(cudaFree(decode_ptr_device));
+    delete[] decode_ptr_host;
+    delete[] output_buffer_host;
+    cuda_preprocess_destroy();
+    // Destroy the engine
+    delete context;
+    delete engine;
+    delete runtime;
+
+    return 0;
+}
diff --git a/yolov8/yolov8_5u_det_trt.py b/yolov8/yolov8_5u_det_trt.py
new file mode 100644
index 00000000..252fe767
--- /dev/null
+++ b/yolov8/yolov8_5u_det_trt.py
@@ -0,0 +1,461 @@
+"""
+An example that uses TensorRT's Python API to make inferences.
+"""
+import ctypes
+import os
+import shutil
+import random
+import sys
+import threading
+import time
+import cv2
+import numpy as np
+import pycuda.autoinit  # noqa: F401
+import pycuda.driver as cuda
+import tensorrt as trt
+
+CONF_THRESH = 0.5
+IOU_THRESHOLD = 0.4
+POSE_NUM = 17 * 3
+DET_NUM = 6
+SEG_NUM = 32
+
+
+def get_img_path_batches(batch_size, img_dir):
+    ret = []
+    batch = []
+    for root, dirs, files in os.walk(img_dir):
+        for name in files:
+            if len(batch) == batch_size:
+                ret.append(batch)
+                batch = []
+            batch.append(os.path.join(root, name))
+    if len(batch) > 0:
+        ret.append(batch)
+    return ret
+
+
+def plot_one_box(x, img, color=None, label=None, line_thickness=None):
+    """
+    description: Plots one bounding box on image img,
+                 this function comes from the YoLov8 project.
+    param:
+        x:     a box like [x1,y1,x2,y2]
+        img:   an OpenCV image object
+        color: color to draw rectangle, such as (0,255,0)
+        label: str
+        line_thickness: int
+    return:
+        no return
+    """
+    tl = (
+        line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
+    )  # line/font thickness
+    color = color or [random.randint(0, 255) for _ in range(3)]
+    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
+    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
+    if label:
+        tf = max(tl - 1, 1)  # font thickness
+        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
+        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
+        cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled
+        cv2.putText(
+            img,
+            label,
+            (c1[0], c1[1] - 2),
+            0,
+            tl / 3,
+            [225, 255, 255],
+            thickness=tf,
+            lineType=cv2.LINE_AA,
+        )
+
+
+class YoLov8TRT(object):
+    """
+    description: A YOLOv8 class that wraps TensorRT ops, preprocess and postprocess ops.
+    """
+
+    def __init__(self, engine_file_path):
+        # Create a Context on this device,
+        self.ctx = cuda.Device(0).make_context()
+        stream = cuda.Stream()
+        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
+        runtime = trt.Runtime(TRT_LOGGER)
+
+        # Deserialize the engine from file
+        with open(engine_file_path, "rb") as f:
+            engine = runtime.deserialize_cuda_engine(f.read())
+        context = engine.create_execution_context()
+
+        host_inputs = []
+        cuda_inputs = []
+        host_outputs = []
+        cuda_outputs = []
+        bindings = []
+
+        for binding in engine:
+            print('binding:', binding, engine.get_binding_shape(binding))
+            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
+            dtype = trt.nptype(engine.get_binding_dtype(binding))
+            # Allocate host and device buffers
+            host_mem = cuda.pagelocked_empty(size, dtype)
+            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
+            # Append the device buffer to device bindings.
+            bindings.append(int(cuda_mem))
+            # Append to the appropriate list.
+            if engine.binding_is_input(binding):
+                self.input_w = engine.get_binding_shape(binding)[-1]
+                self.input_h = engine.get_binding_shape(binding)[-2]
+                host_inputs.append(host_mem)
+                cuda_inputs.append(cuda_mem)
+            else:
+                host_outputs.append(host_mem)
+                cuda_outputs.append(cuda_mem)
+
+        # Store
+        self.stream = stream
+        self.context = context
+        self.engine = engine
+        self.host_inputs = host_inputs
+        self.cuda_inputs = cuda_inputs
+        self.host_outputs = host_outputs
+        self.cuda_outputs = cuda_outputs
+        self.bindings = bindings
+        self.batch_size = engine.max_batch_size
+        self.det_output_length = host_outputs[0].shape[0]
+
+    def infer(self, raw_image_generator):
+        # Make self the active context, pushing it on top of the context stack.
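+        # PyCUDA contexts are per-thread: infer() may be called from a worker
+        # thread (see inferThread below), so the context created in __init__
+        # must be pushed before any GPU work here and popped afterwards.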
+        self.ctx.push()
+        # Restore
+        stream = self.stream
+        context = self.context
+        host_inputs = self.host_inputs
+        cuda_inputs = self.cuda_inputs
+        host_outputs = self.host_outputs
+        cuda_outputs = self.cuda_outputs
+        bindings = self.bindings
+        # Do image preprocess
+        batch_image_raw = []
+        batch_origin_h = []
+        batch_origin_w = []
+        batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
+        for i, image_raw in enumerate(raw_image_generator):
+            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
+            batch_image_raw.append(image_raw)
+            batch_origin_h.append(origin_h)
+            batch_origin_w.append(origin_w)
+            np.copyto(batch_input_image[i], input_image)
+        batch_input_image = np.ascontiguousarray(batch_input_image)
+
+        # Copy input image to host buffer
+        np.copyto(host_inputs[0], batch_input_image.ravel())
+        start = time.time()
+        # Transfer input data to the GPU.
+        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
+        # Run inference.
+        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
+        # Transfer predictions back from the GPU.
+        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
+        # Synchronize the stream
+        stream.synchronize()
+        end = time.time()
+        # Remove any context from the top of the context stack, deactivating it.
+        self.ctx.pop()
+        # The flat output buffer holds det_output_length values per image
+        output = host_outputs[0]
+        # Do postprocess per image in the batch
+        for i in range(self.batch_size):
+            result_boxes, result_scores, result_classid = self.post_process(
+                output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i],
+                batch_origin_w[i]
+            )
+            # Draw rectangles and labels on the original image
+            for j in range(len(result_boxes)):
+                box = result_boxes[j]
+                plot_one_box(
+                    box,
+                    batch_image_raw[i],
+                    label="{}:{:.2f}".format(
+                        categories[int(result_classid[j])], result_scores[j]
+                    ),
+                )
+        return batch_image_raw, end - start
+
+    def destroy(self):
+        # Remove any context from the top of the context stack, deactivating it.
+        self.ctx.pop()
+
+    def get_raw_image(self, image_path_batch):
+        """
+        description: Read an image from image path
+        """
+        for img_path in image_path_batch:
+            yield cv2.imread(img_path)
+
+    def get_raw_image_zeros(self, image_path_batch=None):
+        """
+        description: Prepare dummy data for warmup
+        """
+        for _ in range(self.batch_size):
+            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)
+
+    def preprocess_image(self, raw_bgr_image):
+        """
+        description: Convert BGR image to RGB,
+                     resize and pad it to target size, normalize to [0,1],
+                     transform to NCHW format.
+        param:
+            raw_bgr_image: numpy array, a raw BGR image read by OpenCV
+        return:
+            image: the processed image
+            image_raw: the original image
+            h: original height
+            w: original width
+        """
+        image_raw = raw_bgr_image
+        h, w, c = image_raw.shape
+        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
+        # Calculate width, height and paddings
+        r_w = self.input_w / w
+        r_h = self.input_h / h
+        if r_h > r_w:
+            tw = self.input_w
+            th = int(r_w * h)
+            tx1 = tx2 = 0
+            ty1 = int((self.input_h - th) / 2)
+            ty2 = self.input_h - th - ty1
+        else:
+            tw = int(r_h * w)
+            th = self.input_h
+            tx1 = int((self.input_w - tw) / 2)
+            tx2 = self.input_w - tw - tx1
+            ty1 = ty2 = 0
+        # Resize on the long side while maintaining the aspect ratio
+        image = cv2.resize(image, (tw, th))
+        # Pad the short side with (128,128,128)
+        image = cv2.copyMakeBorder(
+            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
+        )
+        image = image.astype(np.float32)
+        # Normalize to [0,1]
+        image /= 255.0
+        # HWC to CHW format:
+        image = np.transpose(image, [2, 0, 1])
+        # CHW to NCHW format
+        image = np.expand_dims(image, axis=0)
+        # Convert the image to row-major order, also known as "C order":
+        image = np.ascontiguousarray(image)
+        return image, image_raw, h, w
+
+    def xywh2xyxy(self, origin_h, origin_w, x):
+        """
+        description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
+        param:
+            origin_h: height of original image
+            origin_w: width of original image
+            x: a boxes numpy array, each row is a box [center_x, center_y, w, h]
+        return:
+            y: a boxes numpy array, each row is a box [x1, y1, x2, y2]
+        """
+        y = np.zeros_like(x)
+        r_w = self.input_w / origin_w
+        r_h = self.input_h / origin_h
+        if r_h > r_w:
+            y[:, 0] = x[:, 0]
+            y[:, 2] = x[:, 2]
+            y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2
+            y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2
+            y /= r_w
+        else:
+            y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2
+            y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2
+            y[:, 1] = x[:, 1]
+            y[:, 3] = x[:, 3]
+            y /= r_h
+
+        return y
+
+    def post_process(self, output, origin_h, origin_w):
+        """
+        description: postprocess the prediction
+        param:
+            output: a numpy array like [num_boxes, cx, cy, w, h, conf, cls_id, cx, cy, w, h, conf, cls_id, ...]
+            origin_h: height of original image
+            origin_w: width of original image
+        return:
+            result_boxes: final boxes, a boxes numpy array, each row is a box [x1, y1, x2, y2]
+            result_scores: final scores, a numpy array, each element is the score corresponding to a box
+            result_classid: final classid, a numpy array, each element is the classid corresponding to a box
+        """
+        num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM
+        # Get the num of boxes detected
+        num = int(output[0])
+        # Reshape to a two-dimensional ndarray
+        pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :]
+        # Do nms
+        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)
+        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
+        result_scores = boxes[:, 4] if len(boxes) else np.array([])
+        result_classid = boxes[:, 5] if len(boxes) else np.array([])
+        return result_boxes, result_scores, result_classid
+
+    def bbox_iou(self, box1, box2, x1y1x2y2=True):
+        """
+        description: compute the IoU of two bounding boxes
+        param:
+            box1: a box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
+            box2: a box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
+            x1y1x2y2: select the coordinate format
+        return:
+            iou: the computed IoU
+        """
+        if not x1y1x2y2:
+            # Transform from center and width to exact coordinates
+            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
+            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
+            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
+            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
+        else:
+            # Get the coordinates of bounding boxes
+            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
+            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
+
+        # Get the coordinates of the intersection rectangle
+        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
+        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
+        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
+        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
+        # Intersection area
+        inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None)
+                      * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None))
+        # Union Area
+        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
+        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
+
+        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
+
+        return iou
+
+    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
+        """
+        description: Removes detections with lower object confidence score than 'conf_thres' and performs
+                     Non-Maximum Suppression to further filter detections.
+        param:
+            prediction: detections, (x1, y1, x2, y2, conf, cls_id)
+            origin_h: original image height
+            origin_w: original image width
+            conf_thres: a confidence threshold to filter detections
+            nms_thres: an IoU threshold to filter detections
+        return:
+            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
+        """
+        # Get the boxes whose score > conf_thres
+        boxes = prediction[prediction[:, 4] >= conf_thres]
+        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
+        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
+        # clip the coordinates
+        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
+        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
+        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
+        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
+        # Object confidence
+        confs = boxes[:, 4]
+        # Sort by the confs
+        boxes = boxes[np.argsort(-confs)]
+        # Perform non-maximum suppression
+        keep_boxes = []
+        while boxes.shape[0]:
+            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
+            label_match = boxes[0, -1] == boxes[:, -1]
+            # Indices of boxes with lower confidence scores, large IOUs and matching labels
+            invalid = large_overlap & label_match
+            keep_boxes += [boxes[0]]
+            boxes = boxes[~invalid]
+        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
+        return boxes
+
+
+class inferThread(threading.Thread):
+    def __init__(self, yolov8_wrapper, image_path_batch):
+        threading.Thread.__init__(self)
+        self.yolov8_wrapper = yolov8_wrapper
+        self.image_path_batch = image_path_batch
+
+    def run(self):
+        batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch))
+        for i, img_path in enumerate(self.image_path_batch):
+            parent, filename = os.path.split(img_path)
+            save_name = os.path.join('output', filename)
+            # Save image
+            cv2.imwrite(save_name, batch_image_raw[i])
+        print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))
+
+
+class warmUpThread(threading.Thread):
+    def __init__(self, yolov8_wrapper):
+        threading.Thread.__init__(self)
+        self.yolov8_wrapper = yolov8_wrapper
+
+    def run(self):
+        batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros())
+        print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))
+
+
+if __name__ == "__main__":
+    # load custom plugin and engine
+    PLUGIN_LIBRARY = "build/libmyplugins.so"
+    engine_file_path = "yolov5xu.engine"
+
+    if len(sys.argv) > 1:
+        engine_file_path = sys.argv[1]
+    if len(sys.argv) > 2:
+        PLUGIN_LIBRARY = sys.argv[2]
+
+    ctypes.CDLL(PLUGIN_LIBRARY)
+
+    # load coco labels
+    categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
+                  "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog",
+                  "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
+                  "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
+                  "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
+                  "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
+                  "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant",
+                  "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+                  "microwave", "oven", "toaster", "sink", "refrigerator", "book",
"clock", "vase", "scissors", + "teddy bear", + "hair drier", "toothbrush"] + + if os.path.exists('output/'): + shutil.rmtree('output/') + os.makedirs('output/') + # a YoLov8TRT instance + yolov8_wrapper = YoLov8TRT(engine_file_path) + try: + print('batch size is', yolov8_wrapper.batch_size) + + image_dir = "samples/" + image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir) + + for i in range(10): + # create a new thread to do warm_up + thread1 = warmUpThread(yolov8_wrapper) + thread1.start() + thread1.join() + for batch in image_path_batches: + # create a new thread to do inference + thread1 = inferThread(yolov8_wrapper, batch) + thread1.start() + thread1.join() + finally: + # destroy the instance + yolov8_wrapper.destroy()