From 37ddb3044f25706fa273aa55f3ddb9ede2f180e0 Mon Sep 17 00:00:00 2001 From: linds Date: Wed, 17 Apr 2024 16:01:35 +0800 Subject: [PATCH 01/24] yolov8 p2 --- yolov8/README.md | 17 +- yolov8/include/block.h | 3 +- yolov8/include/config.h | 7 +- yolov8/include/model.h | 4 + yolov8/plugin/yololayer.cu | 150 ++++++---- yolov8/plugin/yololayer.h | 130 ++++----- yolov8/src/block.cpp | 56 ++-- yolov8/src/model.cpp | 550 ++++++++++++++++++++++++++++++------- yolov8/yolov8_det.cpp | 24 +- 9 files changed, 696 insertions(+), 245 deletions(-) diff --git a/yolov8/README.md b/yolov8/README.md index 4ca978ba..6653e1f8 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -22,10 +22,10 @@ The tensorrt code is derived from [xiaocao-tian/yolov8_tensorrt](https://github. Currently, we support yolov8 - For yolov8 , download .pt from [https://github.com/ultralytics/assets/releases](https://github.com/ultralytics/assets/releases), then follow how-to-run in current page. - + ## Config -- Choose the model n/s/m/l/x/n6/s6/m6/l6/x6 from command line arguments. +- Choose the model n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 from command line arguments. - Check more configs in [include/config.h](./include/config.h) ## How to Run, yolov8n as example @@ -34,10 +34,13 @@ Currently, we support yolov8 ``` // download https://github.com/ultralytics/assets/releases/yolov8n.pt +// download https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt (only for 10 cls p2 model) cp {tensorrtx}/yolov8/gen_wts.py {ultralytics}/ultralytics cd {ultralytics}/ultralytics python gen_wts.py -w yolov8n.pt -o yolov8n.wts -t detect // a file 'yolov8n.wts' will be generated. 
+python gen_wts.py -w VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.pt -o VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts -t detect (only for 10 cls p2 model) +// a file 'VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts' will be generated. ``` 2. build tensorrtx/yolov8 and run @@ -51,12 +54,20 @@ cd build cp {ultralytics}/ultralytics/yolov8.wts {tensorrtx}/yolov8/build cmake .. make -sudo ./yolov8_det -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6] // serialize model to plan file +sudo ./yolov8_det -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file sudo ./yolov8_det -d [.engine] [image folder] [c/g] // deserialize and run inference, the images in [image folder] will be processed. // For example yolov8 sudo ./yolov8_det -s yolov8n.wts yolov8.engine n sudo ./yolov8_det -d yolov8n.engine ../images c //cpu postprocess sudo ./yolov8_det -d yolov8n.engine ../images g //gpu postprocess + +for p2 model: +change the "const static int kNumClass" in config.h to 10; +sudo ./yolov8_det -s VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.wts VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine x2 +wget https://github.com/lindsayshuo/yolov8-p2/releases/download/VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last/0000008_01999_d_0000040.jpg +cp -r 0000008_01999_d_0000040.jpg ../images +sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images c //cpu postprocess +sudo ./yolov8_det -d VisDrone_train_yolov8x_p2_bs1_epochs_100_imgsz_1280_last.engine ../images g //gpu postprocess ``` ### Instance Segmentation diff --git a/yolov8/include/block.h b/yolov8/include/block.h index 3df49c8c..ed4c6ad1 100644 --- a/yolov8/include/block.h +++ b/yolov8/include/block.h @@ -26,4 +26,5 @@ nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map dets, bool is_segmentation = false); + std::vector dets, const int* px_arry, + int px_arry_num, 
bool is_segmentation); diff --git a/yolov8/include/config.h b/yolov8/include/config.h index f124d5d9..6af9f1a9 100644 --- a/yolov8/include/config.h +++ b/yolov8/include/config.h @@ -2,9 +2,9 @@ //#define USE_FP32 //#define USE_INT8 -const static char *kInputTensorName = "images"; -const static char *kOutputTensorName = "output"; -const static int kNumClass = 80; +const static char* kInputTensorName = "images"; +const static char* kOutputTensorName = "output"; +const static int kNumClass = 10; const static int kBatchSize = 1; const static int kGpuId = 0; const static int kInputH = 640; @@ -14,7 +14,6 @@ const static float kConfThresh = 0.5f; const static int kMaxInputImageSize = 3000 * 3000; const static int kMaxNumOutputBbox = 1000; - // Classfication model's number of classes constexpr static int kClsNumClass = 1000; // Classfication model's input shape diff --git a/yolov8/include/model.h b/yolov8/include/model.h index c8b30e98..0a2a40a3 100644 --- a/yolov8/include/model.h +++ b/yolov8/include/model.h @@ -11,6 +11,10 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); +nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, + int& max_channels); + nvinfer1::IHostMemory* buildEngineYolov8Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw); diff --git a/yolov8/plugin/yololayer.cu b/yolov8/plugin/yololayer.cu index bdc073cc..9b786de7 100755 --- a/yolov8/plugin/yololayer.cu +++ b/yolov8/plugin/yololayer.cu @@ -1,45 +1,58 @@ -#include "yololayer.h" -#include "types.h" #include #include -#include "cuda_utils.h" -#include #include +#include +#include "cuda_utils.h" +#include "types.h" +#include "yololayer.h" namespace Tn { - template 
- void write(char*& buffer, const T& val) { - *reinterpret_cast(buffer) = val; - buffer += sizeof(T); - } +template +void write(char*& buffer, const T& val) { + *reinterpret_cast(buffer) = val; + buffer += sizeof(T); +} - template - void read(const char*& buffer, T& val) { - val = *reinterpret_cast(buffer); - buffer += sizeof(T); - } +template +void read(const char*& buffer, T& val) { + val = *reinterpret_cast(buffer); + buffer += sizeof(T); +} } // namespace Tn - namespace nvinfer1 { -YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation) { +YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, + const int* strides, int stridesLength) { mClassCount = classCount; mYoloV8NetWidth = netWidth; mYoloV8netHeight = netHeight; mMaxOutObject = maxOut; + mStridesLength = stridesLength; + mStrides = new int[stridesLength]; + memcpy(mStrides, strides, stridesLength * sizeof(int)); is_segmentation_ = is_segmentation; } -YoloLayerPlugin::~YoloLayerPlugin() {} +YoloLayerPlugin::~YoloLayerPlugin() { + if (mStrides != nullptr) { + delete[] mStrides; + mStrides = nullptr; + } +} YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; - const char* d = reinterpret_cast(data), * a = d; + const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); read(d, mThreadCount); read(d, mYoloV8NetWidth); read(d, mYoloV8netHeight); read(d, mMaxOutObject); + read(d, mStridesLength); + mStrides = new int[mStridesLength]; + for (int i = 0; i < mStridesLength; ++i) { + read(d, mStrides[i]); + } read(d, is_segmentation_); assert(d == a + length); @@ -48,26 +61,32 @@ YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; - char* d = static_cast(buffer), * a = d; + char *d = static_cast(buffer), *a = d; write(d, mClassCount); write(d, mThreadCount); 
write(d, mYoloV8NetWidth); write(d, mYoloV8netHeight); write(d, mMaxOutObject); + write(d, mStridesLength); + for (int i = 0; i < mStridesLength; ++i) { + write(d, mStrides[i]); + } write(d, is_segmentation_); assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { - return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(is_segmentation_); + return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + + sizeof(mMaxOutObject) + sizeof(mStridesLength) + sizeof(int) * mStridesLength + sizeof(is_segmentation_); } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { return 0; } -nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT { +nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nbInputDims) TRT_NOEXCEPT { int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float); return nvinfer1::Dims3(total_size + 1, 1, 1); } @@ -80,11 +99,13 @@ const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { return mPluginNamespace; } -nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { +nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT { return nvinfer1::DataType::kFLOAT; } -bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { +bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, + int nbInputs) const TRT_NOEXCEPT { return false; } @@ -94,9 +115,11 @@ bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOE return false; } -void 
YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {}; +void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, + nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT{}; -void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}; +void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, + IGpuAllocator* gpuAllocator) TRT_NOEXCEPT{}; void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} @@ -116,28 +139,33 @@ void YoloLayerPlugin::destroy() TRT_NOEXCEPT { nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { - YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject, is_segmentation_); + YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject, + is_segmentation_, mStrides, mStridesLength); p->setPluginNamespace(mPluginNamespace); return p; } -int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { +int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs, + void* workspace, cudaStream_t stream) TRT_NOEXCEPT { forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize); return 0; } +__device__ float Logist(float data) { + return 1.0f / (1.0f + expf(-data)); +}; -__device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }; - -__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, - const int grid_h, int grid_w, const int stride, int classes, int outputElem, bool is_segmentation) { +__global__ void CalDetection(const float* input, float* 
output, int numElements, int maxoutobject, const int grid_h, + int grid_w, const int stride, int classes, int outputElem, bool is_segmentation) { int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx >= numElements) return; + if (idx >= numElements) + return; int total_grid = grid_h * grid_w; int info_len = 4 + classes; - if (is_segmentation) info_len += 32; + if (is_segmentation) + info_len += 32; int batchIdx = idx / total_grid; int elemIdx = idx % total_grid; const float* curInput = input + batchIdx * total_grid * info_len; @@ -153,10 +181,12 @@ __global__ void CalDetection(const float* input, float* output, int numElements, } } - if (max_cls_prob < 0.1) return; + if (max_cls_prob < 0.1) + return; int count = (int)atomicAdd(output + outputIdx, 1); - if (count >= maxoutobject) return; + if (count >= maxoutobject) + return; char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection); Detection* det = (Detection*)(data); @@ -175,24 +205,33 @@ __global__ void CalDetection(const float* input, float* output, int numElements, } } -void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,int mYoloV8NetWidth, int batchSize) { +void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, + int mYoloV8NetWidth, int batchSize) { int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float); cudaMemsetAsync(output, 0, sizeof(float), stream); for (int idx = 0; idx < batchSize; ++idx) { CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream)); } int numElem = 0; - int grids[3][2] = { {mYoloV8netHeight / 8, mYoloV8NetWidth / 8}, {mYoloV8netHeight / 16, mYoloV8NetWidth / 16}, {mYoloV8netHeight / 32, mYoloV8NetWidth / 32} }; - int strides[] = { 8, 16, 32 }; - for (unsigned int i = 0; i < 3; i++) { + + const int maxGrids = mStridesLength; + int grids[maxGrids][2]; + for (int i = 0; i < maxGrids; ++i) { 
+ grids[i][0] = mYoloV8netHeight / mStrides[i]; + grids[i][1] = mYoloV8NetWidth / mStrides[i]; + } + + for (unsigned int i = 0; i < maxGrids; i++) { int grid_h = grids[i][0]; int grid_w = grids[i][1]; - int stride = strides[i]; + int stride = mStrides[i]; numElem = grid_h * grid_w * batchSize; - if (numElem < mThreadCount) mThreadCount = numElem; + if (numElem < mThreadCount) + mThreadCount = numElem; - CalDetection << <(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream >> > - (inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, outputElem, is_segmentation_); + CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>( + inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, outputElem, + is_segmentation_); } } @@ -219,19 +258,24 @@ const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT { IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { assert(fc->nbFields == 1); - assert(strcmp(fc->fields[0].name, "netinfo") == 0); - int* p_netinfo = (int*)(fc->fields[0].data); - int class_count = p_netinfo[0]; - int input_w = p_netinfo[1]; - int input_h = p_netinfo[2]; - int max_output_object_count = p_netinfo[3]; - bool is_segmentation = p_netinfo[4]; - YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, is_segmentation); + assert(strcmp(fc->fields[0].name, "combinedInfo") == 0); + const int* combinedInfo = static_cast(fc->fields[0].data); + int netinfo_count = 5; + int class_count = combinedInfo[0]; + int input_w = combinedInfo[1]; + int input_h = combinedInfo[2]; + int max_output_object_count = combinedInfo[3]; + bool is_segmentation = combinedInfo[4]; + const int* px_arry = combinedInfo + netinfo_count; + int px_arry_length = fc->fields[0].length - netinfo_count; + YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, 
max_output_object_count, is_segmentation, + px_arry, px_arry_length); obj->setPluginNamespace(mNamespace.c_str()); return obj; } -IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { +IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, + size_t serialLength) TRT_NOEXCEPT { // This object will be deleted when the network is destroyed, which will // call YoloLayerPlugin::destroy() YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); @@ -239,4 +283,4 @@ IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const voi return obj; } -} // namespace nvinfer1 +} // namespace nvinfer1 diff --git a/yolov8/plugin/yololayer.h b/yolov8/plugin/yololayer.h index 514c1f12..9496e7fc 100644 --- a/yolov8/plugin/yololayer.h +++ b/yolov8/plugin/yololayer.h @@ -1,102 +1,106 @@ #pragma once -#include "macros.h" -#include "NvInfer.h" #include #include +#include "NvInfer.h" #include "macros.h" namespace nvinfer1 { class API YoloLayerPlugin : public IPluginV2IOExt { -public: - YoloLayerPlugin(int classCount, int netWdith, int netHeight, int maxOut, bool is_segmentation); - YoloLayerPlugin(const void* data, size_t length); - ~YoloLayerPlugin(); + public: + YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, const int* strides, + int stridesLength); - int getNbOutputs() const TRT_NOEXCEPT override { - return 1; - } + YoloLayerPlugin(const void* data, size_t length); + ~YoloLayerPlugin(); - nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } - int initialize() TRT_NOEXCEPT override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; - virtual void terminate() TRT_NOEXCEPT override {} + int initialize() 
TRT_NOEXCEPT override; - virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } + virtual void terminate() TRT_NOEXCEPT override {} - virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; + virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } - virtual size_t getSerializationSize() const TRT_NOEXCEPT override; + virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; - virtual void serialize(void* buffer) const TRT_NOEXCEPT override; + virtual size_t getSerializationSize() const TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override { - return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; - } + virtual void serialize(void* buffer) const TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) const TRT_NOEXCEPT override { + return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; + } - const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginType() const TRT_NOEXCEPT override; - const char* getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - void destroy() TRT_NOEXCEPT override; + void destroy() TRT_NOEXCEPT override; - IPluginV2IOExt* clone() const TRT_NOEXCEPT override; + IPluginV2IOExt* clone() const TRT_NOEXCEPT override; - void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; + void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; - const char* getPluginNamespace() const TRT_NOEXCEPT override; + const char* 
getPluginNamespace() const TRT_NOEXCEPT override; - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const TRT_NOEXCEPT; + nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, + int32_t nbInputs) const TRT_NOEXCEPT; - bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; + bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, + int nbInputs) const TRT_NOEXCEPT override; - bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; + bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; - void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; + void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, + IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; - void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, int32_t nbOutput) TRT_NOEXCEPT override; + void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, + int32_t nbOutput) TRT_NOEXCEPT override; - void detachFromContext() TRT_NOEXCEPT override; + void detachFromContext() TRT_NOEXCEPT override; - private: - void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, int mYoloV8NetWidth, int batchSize); - int mThreadCount = 256; - const char* mPluginNamespace; - int mClassCount; - int mYoloV8NetWidth; - int mYoloV8netHeight; - int mMaxOutObject; - bool is_segmentation_; - }; + private: + void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, + int mYoloV8NetWidth, int batchSize); + int mThreadCount = 256; + const char* mPluginNamespace; + int mClassCount; + int mYoloV8NetWidth; + int mYoloV8netHeight; + int 
mMaxOutObject; + bool is_segmentation_; + int* mStrides; + int mStridesLength; +}; class API YoloPluginCreator : public IPluginCreator { -public: - YoloPluginCreator(); - ~YoloPluginCreator() override = default; - - const char* getPluginName() const TRT_NOEXCEPT override; + public: + YoloPluginCreator(); + ~YoloPluginCreator() override = default; - const char* getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; - const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2IOExt* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; - nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; + nvinfer1::IPluginV2IOExt* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; - void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { - mNamespace = libNamespace; - } + nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; - const char* getPluginNamespace() const TRT_NOEXCEPT override { - return mNamespace.c_str(); - } + void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } - private: - std::string mNamespace; - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - }; - REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); -} // namespace nvinfer1 + const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } + private: + std::string mNamespace; + static PluginFieldCollection mFC; + static std::vector mPluginAttributes; +}; +REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); +} // namespace nvinfer1 diff 
--git a/yolov8/src/block.cpp b/yolov8/src/block.cpp index 96cf4e12..713f86ef 100644 --- a/yolov8/src/block.cpp +++ b/yolov8/src/block.cpp @@ -219,23 +219,47 @@ nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map dets, bool is_segmentation) { + std::vector dets, const int* px_arry, + int px_arry_num, bool is_segmentation) { auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); - - nvinfer1::PluginField plugin_fields[1]; - int netinfo[5] = {kNumClass, kInputW, kInputH, kMaxNumOutputBbox, is_segmentation}; - plugin_fields[0].data = netinfo; - plugin_fields[0].length = 5; - plugin_fields[0].name = "netinfo"; - plugin_fields[0].type = nvinfer1::PluginFieldType::kFLOAT32; - nvinfer1::PluginFieldCollection plugin_data; - plugin_data.nbFields = 1; - plugin_data.fields = plugin_fields; - nvinfer1::IPluginV2* plugin_obj = creator->createPlugin("yololayer", &plugin_data); - std::vector input_tensors; + const int netinfo_count = 5; // Assuming the first 5 elements are for netinfo as per existing code. + const int total_count = netinfo_count + px_arry_num; // Total number of elements for netinfo and px_arry combined. + + std::vector combinedInfo(total_count); + // Fill in the first 5 elements as per existing netinfo. + combinedInfo[0] = kNumClass; + combinedInfo[1] = kInputW; + combinedInfo[2] = kInputH; + combinedInfo[3] = kMaxNumOutputBbox; + combinedInfo[4] = is_segmentation; + + // Copy the contents of px_arry into the combinedInfo vector after the initial 5 elements. + std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count); + + // Now let's create the PluginField object to hold this combined information. 
+ nvinfer1::PluginField pluginField; + pluginField.name = "combinedInfo"; // This can be any name that the plugin will recognize + pluginField.data = combinedInfo.data(); + pluginField.type = nvinfer1::PluginFieldType::kINT32; + pluginField.length = combinedInfo.size(); + + // Create the PluginFieldCollection to hold the PluginField object. + nvinfer1::PluginFieldCollection pluginFieldCollection; + pluginFieldCollection.nbFields = 1; // We have just one field, but it's a combined array + pluginFieldCollection.fields = &pluginField; + + // Create the plugin object using the PluginFieldCollection. + nvinfer1::IPluginV2* pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection); + + // We assume that the plugin is to be added onto the network. + // Prepare input tensors for the YOLO Layer. + std::vector inputTensors; for (auto det : dets) { - input_tensors.push_back(det->getOutput(0)); + inputTensors.push_back(det->getOutput(0)); // Assuming each IConcatenationLayer has one output tensor. } - auto yolo = network->addPluginV2(&input_tensors[0], input_tensors.size(), *plugin_obj); - return yolo; + + // Add the plugin to the network using the prepared input tensors. + nvinfer1::IPluginV2Layer* yoloLayer = network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject); + + return yoloLayer; // Return the added YOLO layer. 
} diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index e0e7f088..9cfc8dd5 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -106,7 +106,6 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.9"); - /******************************************************************************************************* ********************************************* YOLOV8 HEAD ******************************************** *******************************************************************************************************/ @@ -118,6 +117,7 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer nvinfer1::ITensor* inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; nvinfer1::IConcatenationLayer* cat11 = network->addConcatenation(inputTensor11, 2); + nvinfer1::IElementWiseLayer* conv12 = C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); @@ -220,48 +220,54 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ + int strides[] = {8, 16, 32}; + int stridesLength = sizeof(strides) / sizeof(int); + nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); - shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 8) * (kInputW / 8)}); - - nvinfer1::ISliceLayer* split22_0_0 = - network->addSlice(*shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* 
split22_0_1 = - network->addSlice(*shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl22_0 = DFL(network, weightMap, *split22_0_0->getOutput(0), 4, - (kInputH / 8) * (kInputW / 8), 1, 1, 0, "model.22.dfl.conv.weight"); + shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer* split22_0_0 = network->addSlice( + *shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split22_0_1 = network->addSlice( + *shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_0 = + DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.22.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 2); nvinfer1::IShuffleLayer* shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); - shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 16) * (kInputW / 16)}); - nvinfer1::ISliceLayer* split22_1_0 = - network->addSlice(*shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split22_1_1 = - network->addSlice(*shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl22_1 = DFL(network, weightMap, *split22_1_0->getOutput(0), 4, - (kInputH / 16) * (kInputW / 16), 1, 1, 0, "model.22.dfl.conv.weight"); + 
shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer* split22_1_0 = network->addSlice( + *shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split22_1_1 = network->addSlice( + *shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_1 = + DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.22.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 2); nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); - shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 32) * (kInputW / 32)}); - nvinfer1::ISliceLayer* split22_2_0 = - network->addSlice(*shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split22_2_1 = - network->addSlice(*shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl22_2 = DFL(network, weightMap, *split22_2_0->getOutput(0), 4, - (kInputH / 32) * (kInputW / 32), 1, 1, 0, "model.22.dfl.conv.weight"); + shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer* split22_2_0 = network->addSlice( + *shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + 
nvinfer1::ISliceLayer* split22_2_1 = network->addSlice( + *shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_2 = + DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.22.dfl.conv.weight"); nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 2); nvinfer1::IPluginV2Layer* yolo = - addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}); + addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, + strides, stridesLength, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); @@ -295,7 +301,21 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); + for (const auto& kv : weightMap) { + if (kv.first.find("conv.weight") != std::string::npos || + kv.first.find("linear.weight") != std::string::npos) { // check for conv.weight or linear.weight + std::cout << "Weight name: " << kv.first << ", "; + std::cout << "Count: " << kv.second.count << ", "; + std::cout << "Type: " + << (kv.second.type == nvinfer1::DataType::kFLOAT ? "FLOAT" + : kv.second.type == nvinfer1::DataType::kHALF ? 
"HALF" + : "INT8") + << std::endl; + } + } + nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); + std::cout << "gd: " << gd << ", gw: " << gw << std::endl; /******************************************************************************************************* ****************************************** YOLOV8 INPUT ********************************************** *******************************************************************************************************/ @@ -321,15 +341,18 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf // 22466 nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(768, gw, max_channels), 3, 2, 1, "model.7"); nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(768, gw, max_channels), get_width(768, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + nvinfer1::IElementWiseLayer* conv9 = convBnSiLU(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.9"); nvinfer1::IElementWiseLayer* conv10 = C2F(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.10"); + nvinfer1::IElementWiseLayer* conv11 = SPPF(network, weightMap, *conv10->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), 5, "model.11"); @@ -413,9 +436,12 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf network->addConvolutionNd(*conv30_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.30.cv2.0.2.weight"], weightMap["model.30.cv2.0.2.bias"]); conv30_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + 
conv30_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv30_cv3_0_0 = convBnSiLU(network, weightMap, *conv20->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.0"); + nvinfer1::IElementWiseLayer* conv30_cv3_0_1 = convBnSiLU(network, weightMap, *conv30_cv3_0_0->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.1"); nvinfer1::IConvolutionLayer* conv30_cv3_0_2 = @@ -495,67 +521,391 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf /******************************************************************************************************* ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ + int strides[] = {8, 16, 32, 64}; + int stridesLength = sizeof(strides) / sizeof(int); + // P3 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle30_0 = network->addShuffle(*cat30_0->getOutput(0)); // Reusing the previous cat30_0 as P3 concatenation layer - shuffle30_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 8) * (kInputW / 8)}); - nvinfer1::ISliceLayer* split30_0_0 = - network->addSlice(*shuffle30_0->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split30_0_1 = - network->addSlice(*shuffle30_0->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl30_0 = DFL(network, weightMap, *split30_0_0->getOutput(0), 4, - (kInputH / 8) * (kInputW / 8), 1, 1, 0, "model.30.dfl.conv.weight"); + shuffle30_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer* split30_0_0 = network->addSlice( + *shuffle30_0->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, 
(kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_0_1 = network->addSlice( + *shuffle30_0->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_0 = + DFL(network, weightMap, *split30_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.30.dfl.conv.weight"); nvinfer1::ITensor* inputTensor30_dfl_0[] = {dfl30_0->getOutput(0), split30_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_0 = network->addConcatenation(inputTensor30_dfl_0, 2); // P4 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle30_1 = network->addShuffle(*cat30_1->getOutput(0)); // Reusing the previous cat30_1 as P4 concatenation layer - shuffle30_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 16) * (kInputW / 16)}); - nvinfer1::ISliceLayer* split30_1_0 = - network->addSlice(*shuffle30_1->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split30_1_1 = - network->addSlice(*shuffle30_1->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl30_1 = DFL(network, weightMap, *split30_1_0->getOutput(0), 4, - (kInputH / 16) * (kInputW / 16), 1, 1, 0, "model.30.dfl.conv.weight"); + shuffle30_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer* split30_1_0 = network->addSlice( + *shuffle30_1->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_1_1 = network->addSlice( + *shuffle30_1->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * 
(kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_1 = + DFL(network, weightMap, *split30_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.30.dfl.conv.weight"); nvinfer1::ITensor* inputTensor30_dfl_1[] = {dfl30_1->getOutput(0), split30_1_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_1 = network->addConcatenation(inputTensor30_dfl_1, 2); // P5 processing steps (remains unchanged) nvinfer1::IShuffleLayer* shuffle30_2 = network->addShuffle(*cat30_2->getOutput(0)); // Reusing the previous cat30_2 as P5 concatenation layer - shuffle30_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 32) * (kInputW / 32)}); - nvinfer1::ISliceLayer* split30_2_0 = - network->addSlice(*shuffle30_2->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split30_2_1 = - network->addSlice(*shuffle30_2->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl30_2 = DFL(network, weightMap, *split30_2_0->getOutput(0), 4, - (kInputH / 32) * (kInputW / 32), 1, 1, 0, "model.30.dfl.conv.weight"); + shuffle30_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer* split30_2_0 = network->addSlice( + *shuffle30_2->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_2_1 = network->addSlice( + *shuffle30_2->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_2 = + DFL(network, weightMap, *split30_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.30.dfl.conv.weight"); 
nvinfer1::ITensor* inputTensor30_dfl_2[] = {dfl30_2->getOutput(0), split30_2_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_2 = network->addConcatenation(inputTensor30_dfl_2, 2); // P6 processing steps nvinfer1::IShuffleLayer* shuffle30_3 = network->addShuffle(*cat30_3->getOutput(0)); - shuffle30_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 64) * (kInputW / 64)}); - nvinfer1::ISliceLayer* split30_3_0 = - network->addSlice(*shuffle30_3->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 64) * (kInputW / 64)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split30_3_1 = - network->addSlice(*shuffle30_3->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 64) * (kInputW / 64)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl30_3 = DFL(network, weightMap, *split30_3_0->getOutput(0), 4, - (kInputH / 64) * (kInputW / 64), 1, 1, 0, "model.30.dfl.conv.weight"); + shuffle30_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); + nvinfer1::ISliceLayer* split30_3_0 = network->addSlice( + *shuffle30_3->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_3_1 = network->addSlice( + *shuffle30_3->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_3 = + DFL(network, weightMap, *split30_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, + 1, 0, "model.30.dfl.conv.weight"); nvinfer1::ITensor* inputTensor30_dfl_3[] = {dfl30_3->getOutput(0), split30_3_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat30_dfl_3 = network->addConcatenation(inputTensor30_dfl_3, 2); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( - network, std::vector{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3}); + 
network, std::vector{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3}, + strides, stridesLength, false); + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + builder->setMaxBatchSize(kBatchSize); + config->setMaxWorkspaceSize(16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto* calibrator = + new Int8EntropyCalibrator2(1, kInputW, kInputH, "../coco_calib/", "int8calib.table", kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" << std::endl; + + delete network; + + for (auto& mem : weightMap) { + free((void*)(mem.second.values)); + } + return serialized_model; +} + +nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, + int& max_channels) { + + std::cout << "buildEngineYolov8DetP2 " << std::endl; + + std::map weightMap = loadWeights(wts_path); + nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); + + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); + assert(data); + + 
/******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer* conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer* conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + // 11233 + nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer* conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + // 22466 + nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer* conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + // 22466 + nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + nvinfer1::IElementWiseLayer* conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); + // 11233 + nvinfer1::IElementWiseLayer* conv8 = + C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + nvinfer1::IElementWiseLayer* conv9 = + SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, 
max_channels), + get_width(1024, gw, max_channels), 5, "model.9"); + + /******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + // Head + float scale[] = {1.0, 2.0, 2.0}; // scale used for upsampling + + // P4 + nvinfer1::IResizeLayer* upsample10 = network->addResize( + *conv9->getOutput(0)); // Assuming conv9 is the last layer of the backbone as per P5 in your first section. + upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + upsample10->setScales(scale, 3); + nvinfer1::ITensor* concat11_inputs[] = { + upsample10->getOutput(0), + conv6->getOutput(0)}; // Assuming conv6 corresponds to "backbone P4" as per your pseudocode + nvinfer1::IConcatenationLayer* concat11 = network->addConcatenation(concat11_inputs, 2); + nvinfer1::IElementWiseLayer* conv12 = + C2F(network, weightMap, *concat11->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); + + // P3 + nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0)); + upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + upsample13->setScales(scale, 3); + nvinfer1::ITensor* concat14_inputs[] = {upsample13->getOutput(0), + conv4->getOutput(0)}; // Assuming conv4 corresponds to "backbone P3" + nvinfer1::IConcatenationLayer* concat14 = network->addConcatenation(concat14_inputs, 2); + nvinfer1::IElementWiseLayer* conv15 = + C2F(network, weightMap, *concat14->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); + + // P2 + nvinfer1::IResizeLayer* upsample16 = network->addResize(*conv15->getOutput(0)); + upsample16->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + 
upsample16->setScales(scale, 3); + nvinfer1::ITensor* concat17_inputs[] = {upsample16->getOutput(0), + conv2->getOutput(0)}; // Assuming conv2 corresponds to "backbone P2" + nvinfer1::IConcatenationLayer* concat17 = network->addConcatenation(concat17_inputs, 2); + nvinfer1::IElementWiseLayer* conv18 = + C2F(network, weightMap, *concat17->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); + + // Additional layers for P3, P4, P5 + // Downsample and concatenate for P3 + nvinfer1::IElementWiseLayer* conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), + get_width(128, gw, max_channels), 3, 2, 1, "model.19"); + nvinfer1::ITensor* concat20_inputs[] = { + conv19->getOutput(0), conv15->getOutput(0)}; // concatenate with higher-resolution feature map from P3 + nvinfer1::IConcatenationLayer* concat20 = network->addConcatenation(concat20_inputs, 2); + nvinfer1::IElementWiseLayer* conv21 = + C2F(network, weightMap, *concat20->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); + + // Downsample and concatenate for P4 + nvinfer1::IElementWiseLayer* conv22 = convBnSiLU(network, weightMap, *conv21->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.22"); + nvinfer1::ITensor* concat23_inputs[] = { + conv22->getOutput(0), conv12->getOutput(0)}; // concatenate with higher-resolution feature map from P4 + nvinfer1::IConcatenationLayer* concat23 = network->addConcatenation(concat23_inputs, 2); + nvinfer1::IElementWiseLayer* conv24 = + C2F(network, weightMap, *concat23->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.24"); + + // Downsample and concatenate for P5 + nvinfer1::IElementWiseLayer* conv25 = convBnSiLU(network, weightMap, *conv24->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.25"); + nvinfer1::ITensor* 
concat26_inputs[] = { + conv25->getOutput(0), conv9->getOutput(0)}; // concatenate with higher-resolution feature map from P5 + nvinfer1::IConcatenationLayer* concat26 = network->addConcatenation(concat26_inputs, 2); + nvinfer1::IElementWiseLayer* conv27 = + C2F(network, weightMap, *concat26->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.27"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ + int base_in_channel = 64; + int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(128, gw, max_channels); + + std::cout << "base_in_channel is : " << base_in_channel << std::endl; + std::cout << "base_out_channel is : " << base_out_channel << std::endl; + + // output0 + nvinfer1::IElementWiseLayer* conv28_cv2_0_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.0.0"); + nvinfer1::IElementWiseLayer* conv28_cv2_0_1 = + convBnSiLU(network, weightMap, *conv28_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.0.1"); + nvinfer1::IConvolutionLayer* conv28_cv2_0_2 = + network->addConvolutionNd(*conv28_cv2_0_1->getOutput(0), base_in_channel, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.0.2.weight"], weightMap["model.28.cv2.0.2.bias"]); + conv28_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv28_cv3_0_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.0.0"); + nvinfer1::IElementWiseLayer* conv28_cv3_0_1 = convBnSiLU(network, weightMap, *conv28_cv3_0_0->getOutput(0), + base_out_channel, 3, 1, 1, 
"model.28.cv3.0.1"); + nvinfer1::IConvolutionLayer* conv28_cv3_0_2 = + network->addConvolutionNd(*conv28_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.0.2.weight"], weightMap["model.28.cv3.0.2.bias"]); + conv28_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); + conv28_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor28_0[] = {conv28_cv2_0_2->getOutput(0), conv28_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_0 = network->addConcatenation(inputTensor28_0, 2); + + // output1 + nvinfer1::IElementWiseLayer* conv28_cv2_1_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.1.0"); + nvinfer1::IElementWiseLayer* conv28_cv2_1_1 = + convBnSiLU(network, weightMap, *conv28_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.1.1"); + nvinfer1::IConvolutionLayer* conv28_cv2_1_2 = + network->addConvolutionNd(*conv28_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.1.2.weight"], weightMap["model.28.cv2.1.2.bias"]); + conv28_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv28_cv3_1_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.1.0"); + nvinfer1::IElementWiseLayer* conv28_cv3_1_1 = convBnSiLU(network, weightMap, *conv28_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.28.cv3.1.1"); + nvinfer1::IConvolutionLayer* conv28_cv3_1_2 = + network->addConvolutionNd(*conv28_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.1.2.weight"], weightMap["model.28.cv3.1.2.bias"]); + conv28_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor28_1[] = {conv28_cv2_1_2->getOutput(0), conv28_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_1 = 
network->addConcatenation(inputTensor28_1, 2); + + // output2 + nvinfer1::IElementWiseLayer* conv28_cv2_2_0 = + convBnSiLU(network, weightMap, *conv24->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.2.0"); + nvinfer1::IElementWiseLayer* conv28_cv2_2_1 = + convBnSiLU(network, weightMap, *conv28_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.2.1"); + nvinfer1::IConvolutionLayer* conv28_cv2_2_2 = + network->addConvolution(*conv28_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.2.2.weight"], weightMap["model.28.cv2.2.2.bias"]); + conv28_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv28_cv3_2_0 = + convBnSiLU(network, weightMap, *conv24->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.2.0"); + nvinfer1::IElementWiseLayer* conv28_cv3_2_1 = convBnSiLU(network, weightMap, *conv28_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.28.cv3.2.1"); + nvinfer1::IConvolutionLayer* conv28_cv3_2_2 = + network->addConvolution(*conv28_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.2.2.weight"], weightMap["model.28.cv3.2.2.bias"]); + conv28_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor28_2[] = {conv28_cv2_2_2->getOutput(0), conv28_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_2 = network->addConcatenation(inputTensor28_2, 2); + + // output3 + nvinfer1::IElementWiseLayer* conv28_cv2_3_0 = + convBnSiLU(network, weightMap, *conv27->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.3.0"); + nvinfer1::IElementWiseLayer* conv28_cv2_3_1 = + convBnSiLU(network, weightMap, *conv28_cv2_3_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.3.1"); + nvinfer1::IConvolutionLayer* conv28_cv2_3_2 = + network->addConvolution(*conv28_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + 
weightMap["model.28.cv2.3.2.weight"], weightMap["model.28.cv2.3.2.bias"]); + conv28_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv28_cv3_3_0 = + convBnSiLU(network, weightMap, *conv27->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.3.0"); + nvinfer1::IElementWiseLayer* conv28_cv3_3_1 = convBnSiLU(network, weightMap, *conv28_cv3_3_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.28.cv3.3.1"); + nvinfer1::IConvolutionLayer* conv28_cv3_3_2 = + network->addConvolution(*conv28_cv3_3_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.3.2.weight"], weightMap["model.28.cv3.3.2.bias"]); + conv28_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor28_3[] = {conv28_cv2_3_2->getOutput(0), conv28_cv3_3_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_3 = network->addConcatenation(inputTensor28_3, 2); + + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + + int strides[] = {4, 8, 16, 32}; + int stridesLength = sizeof(strides) / sizeof(int); + + // P2 processing steps (remains unchanged) + std::cout << "kNumClass is : " << kNumClass << std::endl; + std::cout << "kInputH is : " << kInputH << std::endl; + nvinfer1::IShuffleLayer* shuffle28_0 = network->addShuffle(*cat28_0->getOutput(0)); + shuffle28_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer* split28_0_0 = network->addSlice( + *shuffle28_0->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 
1}); + nvinfer1::ISliceLayer* split28_0_1 = network->addSlice( + *shuffle28_0->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl28_0 = + DFL(network, weightMap, *split28_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor* inputTensor28_dfl_0[] = {dfl28_0->getOutput(0), split28_0_1->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_dfl_0 = network->addConcatenation(inputTensor28_dfl_0, 2); + + // P3 processing steps (remains unchanged) + nvinfer1::IShuffleLayer* shuffle28_1 = network->addShuffle(*cat28_1->getOutput(0)); + shuffle28_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer* split28_1_0 = network->addSlice( + *shuffle28_1->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split28_1_1 = network->addSlice( + *shuffle28_1->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl28_1 = + DFL(network, weightMap, *split28_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor* inputTensor28_dfl_1[] = {dfl28_1->getOutput(0), split28_1_1->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_dfl_1 = network->addConcatenation(inputTensor28_dfl_1, 2); + + // P4 processing steps (remains unchanged) + nvinfer1::IShuffleLayer* shuffle28_2 = network->addShuffle(*cat28_2->getOutput(0)); + shuffle28_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer* split28_2_0 = network->addSlice( + *shuffle28_2->getOutput(0), 
nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split28_2_1 = network->addSlice( + *shuffle28_2->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl28_2 = + DFL(network, weightMap, *split28_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor* inputTensor28_dfl_2[] = {dfl28_2->getOutput(0), split28_2_1->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_dfl_2 = network->addConcatenation(inputTensor28_dfl_2, 2); + + // P5 processing steps + nvinfer1::IShuffleLayer* shuffle28_3 = network->addShuffle(*cat28_3->getOutput(0)); + shuffle28_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); + nvinfer1::ISliceLayer* split28_3_0 = network->addSlice( + *shuffle28_3->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split28_3_1 = network->addSlice( + *shuffle28_3->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl28_3 = + DFL(network, weightMap, *split28_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor* inputTensor28_dfl_3[] = {dfl28_3->getOutput(0), split28_3_1->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat28_dfl_3 = network->addConcatenation(inputTensor28_dfl_3, 2); + + nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( + network, std::vector{cat28_dfl_0, cat28_dfl_1, cat28_dfl_2, cat28_dfl_3}, + strides, stridesLength, false); + yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); @@ 
-820,39 +1170,44 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ - nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); - shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 8) * (kInputW / 8)}); + int strides[] = {8, 16, 32}; + int stridesLength = sizeof(strides) / sizeof(int); - nvinfer1::ISliceLayer* split22_0_0 = - network->addSlice(*shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split22_0_1 = - network->addSlice(*shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 8) * (kInputW / 8)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl22_0 = DFL(network, weightMap, *split22_0_0->getOutput(0), 4, - (kInputH / 8) * (kInputW / 8), 1, 1, 0, "model.22.dfl.conv.weight"); + nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); + shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer* split22_0_0 = network->addSlice( + *shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split22_0_1 = network->addSlice( + *shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_0 = + DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.22.dfl.conv.weight"); nvinfer1::IShuffleLayer* shuffle22_1 = 
network->addShuffle(*cat22_1->getOutput(0)); - shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 16) * (kInputW / 16)}); - nvinfer1::ISliceLayer* split22_1_0 = - network->addSlice(*shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split22_1_1 = - network->addSlice(*shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 16) * (kInputW / 16)}, nvinfer1::Dims2{1, 1}); - nvinfer1::IShuffleLayer* dfl22_1 = DFL(network, weightMap, *split22_1_0->getOutput(0), 4, - (kInputH / 16) * (kInputW / 16), 1, 1, 0, "model.22.dfl.conv.weight"); + shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer* split22_1_0 = network->addSlice( + *shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split22_1_1 = network->addSlice( + *shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_1 = + DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.22.dfl.conv.weight"); nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); - shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / 32) * (kInputW / 32)}); - nvinfer1::ISliceLayer* split22_2_0 = - network->addSlice(*shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, - nvinfer1::Dims2{64, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); - nvinfer1::ISliceLayer* split22_2_1 = - network->addSlice(*shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, - nvinfer1::Dims2{kNumClass, (kInputH / 32) * (kInputW / 32)}, nvinfer1::Dims2{1, 1}); 
- nvinfer1::IShuffleLayer* dfl22_2 = DFL(network, weightMap, *split22_2_0->getOutput(0), 4, - (kInputH / 32) * (kInputW / 32), 1, 1, 0, "model.22.dfl.conv.weight"); + shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer* split22_2_0 = network->addSlice( + *shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split22_2_1 = network->addSlice( + *shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl22_2 = + DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.22.dfl.conv.weight"); // det0 auto proto_coef_0 = ProtoCoef(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", 6400, gw); @@ -872,8 +1227,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer proto_coef_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3); - nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( - network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, true); + nvinfer1::IPluginV2Layer* yolo = + addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, + strides, stridesLength, true); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); diff --git a/yolov8/yolov8_det.cpp b/yolov8/yolov8_det.cpp index 9fb55c87..b3fee0f9 100644 --- a/yolov8/yolov8_det.cpp +++ b/yolov8/yolov8_det.cpp @@ -13,14 +13,17 @@ Logger gLogger; using namespace nvinfer1; const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; -void serialize_engine(std::string& wts_name, std::string& engine_name, bool& is_p6, std::string& sub_type, float& gd, +void 
serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd, float& gw, int& max_channels) { IBuilder* builder = createInferBuilder(gLogger); IBuilderConfig* config = builder->createBuilderConfig(); IHostMemory* serialized_engine = nullptr; - if (is_p6) { + if (is_p == 6) { + std::cout << "Subtype: " << is_p << std::endl; serialized_engine = buildEngineYolov8DetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } else if (is_p == 2) { + serialized_engine = buildEngineYolov8DetP2(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } else { serialized_engine = buildEngineYolov8Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } @@ -116,7 +119,7 @@ void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, flo CUDA_CHECK(cudaStreamSynchronize(stream)); } -bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, bool& is_p6, std::string& img_dir, +bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir, std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) { if (argc < 4) return false; @@ -149,7 +152,11 @@ bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, bo return false; } if (sub_type.size() == 2 && sub_type[1] == '6') { - is_p6 = true; + std::cout << "Subtype: " << sub_type[1] << std::endl; + is_p = 6; + } else if (sub_type.size() == 2 && sub_type[1] == '2') { + std::cout << "Subtype: " << sub_type[1] << std::endl; + is_p = 2; } } else if (std::string(argv[1]) == "-d" && argc == 5) { engine = std::string(argv[2]); @@ -169,14 +176,15 @@ int main(int argc, char** argv) { std::string sub_type = ""; std::string cuda_post_process = ""; int model_bboxes; - bool is_p6 = false; + int is_p = 0; float gd = 0.0f, gw = 0.0f; int max_channels = 0; - if (!parse_args(argc, argv, wts_name, engine_name, is_p6, 
img_dir, sub_type, cuda_post_process, gd, gw, + if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw, max_channels)) { std::cerr << "Arguments not right!" << std::endl; - std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6] // serialize model to plan file" + std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to " + "plan file" << std::endl; std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl; return -1; @@ -184,7 +192,7 @@ int main(int argc, char** argv) { // Create a model using the API directly and serialize it to a file if (!wts_name.empty()) { - serialize_engine(wts_name, engine_name, is_p6, sub_type, gd, gw, max_channels); + serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels); return 0; } From df11eab9056201646d729588f54f80f12bd50e46 Mon Sep 17 00:00:00 2001 From: linds Date: Wed, 17 Apr 2024 16:08:31 +0800 Subject: [PATCH 02/24] yolov8 p2 --- yolov8/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/yolov8/README.md b/yolov8/README.md index 6653e1f8..1ec527b3 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -21,11 +21,10 @@ The tensorrt code is derived from [xiaocao-tian/yolov8_tensorrt](https://github. Currently, we support yolov8 -- For yolov8 , download .pt from [https://github.com/ultralytics/assets/releases](https://github.com/ultralytics/assets/releases), then follow how-to-run in current page. -[README.md](..%2FREADME.md) +- For yolov8 , download .pt from https://github.com/ultralytics/assets/releases, then follow how-to-run in current page. ## Config -- Choose the model n/s/m/l/x/n6/s6/m6/l6/[README.md](..%2FREADME.md)x6 from command line arguments. +- Choose the model n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 from command line arguments. 
- Check more configs in [include/config.h](./include/config.h) ## How to Run, yolov8n as example From ee640a87e876e5f4b26d33ee2be2a7e0f24fae33 Mon Sep 17 00:00:00 2001 From: linds Date: Wed, 17 Apr 2024 16:09:54 +0800 Subject: [PATCH 03/24] yolov8 p2 --- yolov8/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yolov8/README.md b/yolov8/README.md index 1ec527b3..e77af8d5 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -21,7 +21,7 @@ The tensorrt code is derived from [xiaocao-tian/yolov8_tensorrt](https://github. Currently, we support yolov8 -- For yolov8 , download .pt from https://github.com/ultralytics/assets/releases, then follow how-to-run in current page. +- For yolov8 , download .pt from [https://github.com/ultralytics/assets/releases](https://github.com/ultralytics/assets/releases), then follow how-to-run in current page. ## Config - Choose the model n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6 from command line arguments. From cea11955f9167845bd0d88c7c790849e2f3638d5 Mon Sep 17 00:00:00 2001 From: linds Date: Wed, 17 Apr 2024 16:10:57 +0800 Subject: [PATCH 04/24] yolov8 p2 --- yolov8/include/config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yolov8/include/config.h b/yolov8/include/config.h index 6af9f1a9..d283860d 100644 --- a/yolov8/include/config.h +++ b/yolov8/include/config.h @@ -4,7 +4,7 @@ const static char* kInputTensorName = "images"; const static char* kOutputTensorName = "output"; -const static int kNumClass = 10; +const static int kNumClass = 80; const static int kBatchSize = 1; const static int kGpuId = 0; const static int kInputH = 640; From 20ae8f4ea44083e0aa618c143a4adf3182d97fa1 Mon Sep 17 00:00:00 2001 From: lindsayshuo <932695342@qq.com> Date: Wed, 17 Apr 2024 17:19:04 +0800 Subject: [PATCH 05/24] Update yolov8_det.cpp --- yolov8/yolov8_det.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/yolov8/yolov8_det.cpp b/yolov8/yolov8_det.cpp index b3fee0f9..b5f478bb 100644 --- 
a/yolov8/yolov8_det.cpp +++ b/yolov8/yolov8_det.cpp @@ -20,7 +20,6 @@ void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p IHostMemory* serialized_engine = nullptr; if (is_p == 6) { - std::cout << "Subtype: " << is_p << std::endl; serialized_engine = buildEngineYolov8DetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } else if (is_p == 2) { serialized_engine = buildEngineYolov8DetP2(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); @@ -152,10 +151,8 @@ bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, in return false; } if (sub_type.size() == 2 && sub_type[1] == '6') { - std::cout << "Subtype: " << sub_type[1] << std::endl; is_p = 6; } else if (sub_type.size() == 2 && sub_type[1] == '2') { - std::cout << "Subtype: " << sub_type[1] << std::endl; is_p = 2; } } else if (std::string(argv[1]) == "-d" && argc == 5) { From 6ce06d5383f8163c9eff0a2eefd2e8bbac1033eb Mon Sep 17 00:00:00 2001 From: lindsayshuo <932695342@qq.com> Date: Wed, 17 Apr 2024 17:20:22 +0800 Subject: [PATCH 06/24] Update model.cpp --- yolov8/src/model.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 9cfc8dd5..56c5d6e7 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -622,9 +622,6 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { - - std::cout << "buildEngineYolov8DetP2 " << std::endl; - std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); From 246cb7509a37eeef84b26b35b5f63570b612e66f Mon Sep 17 00:00:00 2001 From: lindsayshuo <932695342@qq.com> Date: Wed, 17 Apr 2024 17:21:36 +0800 Subject: [PATCH 07/24] Update model.cpp --- 
yolov8/src/model.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 56c5d6e7..5e85a7a5 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -739,9 +739,6 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf int base_in_channel = 64; int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(128, gw, max_channels); - std::cout << "base_in_channel is : " << base_in_channel << std::endl; - std::cout << "base_out_channel is : " << base_out_channel << std::endl; - // output0 nvinfer1::IElementWiseLayer* conv28_cv2_0_0 = convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.0.0"); @@ -838,8 +835,6 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf int stridesLength = sizeof(strides) / sizeof(int); // P2 processing steps (remains unchanged) - std::cout << "kNumClass is : " << kNumClass << std::endl; - std::cout << "kInputH is : " << kInputH << std::endl; nvinfer1::IShuffleLayer* shuffle28_0 = network->addShuffle(*cat28_0->getOutput(0)); shuffle28_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); nvinfer1::ISliceLayer* split28_0_0 = network->addSlice( From ebba1f9396f9454cb3fe0b4365ed4d396587eb55 Mon Sep 17 00:00:00 2001 From: lindsayshuo <932695342@qq.com> Date: Wed, 17 Apr 2024 17:22:22 +0800 Subject: [PATCH 08/24] Update model.cpp --- yolov8/src/model.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 5e85a7a5..08a7b8fd 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -301,19 +301,6 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels) { std::map weightMap = loadWeights(wts_path); - for (const auto& kv : weightMap) 
{ - if (kv.first.find("conv.weight") != std::string::npos || - kv.first.find("linear.weight") != std::string::npos) { // 检查 conv.weight 或 linear.weight - std::cout << "Weight name: " << kv.first << ", "; - std::cout << "Count: " << kv.second.count << ", "; - std::cout << "Type: " - << (kv.second.type == nvinfer1::DataType::kFLOAT ? "FLOAT" - : kv.second.type == nvinfer1::DataType::kHALF ? "HALF" - : "INT8") - << std::endl; - } - } - nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); std::cout << "gd: " << gd << ", gw: " << gw << std::endl; /******************************************************************************************************* From a149c50f3d957a790b5b41c7d037c72ba018e474 Mon Sep 17 00:00:00 2001 From: lindsayshuo <932695342@qq.com> Date: Wed, 17 Apr 2024 17:23:12 +0800 Subject: [PATCH 09/24] Update model.cpp --- yolov8/src/model.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 08a7b8fd..316b1301 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -302,7 +302,6 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf int& max_channels) { std::map weightMap = loadWeights(wts_path); nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); - std::cout << "gd: " << gd << ", gw: " << gw << std::endl; /******************************************************************************************************* ****************************************** YOLOV8 INPUT ********************************************** *******************************************************************************************************/ From 917210a4f75e881850fdc726d556ef23f6a17b22 Mon Sep 17 00:00:00 2001 From: linds Date: Tue, 23 Apr 2024 10:08:22 +0800 Subject: [PATCH 10/24] fix strides --- yolov8/src/model.cpp | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 
316b1301..0b105c97 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -20,6 +20,15 @@ static int get_depth(int x, float gd) { return std::max(r, 1); } +void calculateStrides(nvinfer1::IElementWiseLayer* conv_layers[], int size, int reference_size, int strides[]) { + for (int i = 0; i < size; ++i) { + nvinfer1::ILayer* layer = conv_layers[i]; + nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); + int feature_map_size = dims.d[1]; + strides[i] = reference_size / feature_map_size; + } +} + static nvinfer1::IElementWiseLayer* Proto(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, std::string lname, float gw, int max_channels) { @@ -220,7 +229,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ - int strides[] = {8, 16, 32}; + nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); @@ -507,7 +518,9 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf /******************************************************************************************************* ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ - int strides[] = {8, 16, 32, 64}; + nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7, conv9}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + 
calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); // P3 processing steps (remains unchanged) @@ -817,7 +830,9 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ - int strides[] = {4, 8, 16, 32}; + nvinfer1::IElementWiseLayer* conv_layers[] = {conv1, conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); // P2 processing steps (remains unchanged) @@ -1148,7 +1163,9 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ - int strides[] = {8, 16, 32}; + nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); int stridesLength = sizeof(strides) / sizeof(int); nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); From cfb1d32b207bf6a460ccd0f609ea4cb769eefeb2 Mon Sep 17 00:00:00 2001 From: linds Date: Fri, 26 Apr 2024 17:20:43 +0800 Subject: [PATCH 11/24] yolov8 pose --- yolov8/CMakeLists.txt | 10 +- yolov8/README.md | 11 ++ yolov8/include/block.h | 2 +- yolov8/include/config.h | 2 + yolov8/include/model.h | 4 + yolov8/include/postprocess.h | 25 ++- yolov8/include/types.h | 14 +- yolov8/plugin/yololayer.cu | 92 +++++++--- 
yolov8/plugin/yololayer.h | 7 +- yolov8/src/block.cpp | 15 +- yolov8/src/model.cpp | 344 +++++++++++++++++++++++++++++++++++ yolov8/src/postprocess.cpp | 62 +++++++ yolov8/yolov8_pose.cpp | 276 ++++++++++++++++++++++++++++ 13 files changed, 815 insertions(+), 49 deletions(-) create mode 100644 yolov8/yolov8_pose.cpp diff --git a/yolov8/CMakeLists.txt b/yolov8/CMakeLists.txt index d5332a57..a702b341 100644 --- a/yolov8/CMakeLists.txt +++ b/yolov8/CMakeLists.txt @@ -27,8 +27,8 @@ else() # tensorrt include_directories(/home/lindsay/TensorRT-8.4.1.5/include) link_directories(/home/lindsay/TensorRT-8.4.1.5/lib) -# include_directories(/home/lindsay/TensorRT-7.2.3.4/include) -# link_directories(/home/lindsay/TensorRT-7.2.3.4/lib) + # include_directories(/home/lindsay/TensorRT-7.2.3.4/include) + # link_directories(/home/lindsay/TensorRT-7.2.3.4/lib) endif() @@ -51,5 +51,9 @@ target_link_libraries(yolov8_det ${OpenCV_LIBS}) add_executable(yolov8_seg ${PROJECT_SOURCE_DIR}/yolov8_seg.cpp ${SRCS}) target_link_libraries(yolov8_seg nvinfer cudart myplugins ${OpenCV_LIBS}) + +add_executable(yolov8_pose ${PROJECT_SOURCE_DIR}/yolov8_pose.cpp ${SRCS}) +target_link_libraries(yolov8_pose nvinfer cudart myplugins ${OpenCV_LIBS}) + add_executable(yolov8_cls ${PROJECT_SOURCE_DIR}/yolov8_cls.cpp ${SRCS}) -target_link_libraries(yolov8_cls nvinfer cudart myplugins ${OpenCV_LIBS}) \ No newline at end of file +target_link_libraries(yolov8_cls nvinfer cudart myplugins ${OpenCV_LIBS}) diff --git a/yolov8/README.md b/yolov8/README.md index dad23bcb..8f198c65 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -105,6 +105,17 @@ sudo ./yolov8_cls -s yolov8n-cls.wts yolov8-cls.engine n sudo ./yolov8_cls -d yolov8n-cls.engine ../samples ``` + +### Pose Estimation +``` +# Build and serialize TensorRT engine +./yolov8_pose -s yolov8n-pose.wts yolov8n-pose.engine n + +# Run inference with labels file +./yolov8_pose -d yolov8n-pose.engine ../images c +``` + + 4. 
optional, load and run the tensorrt model in python ``` diff --git a/yolov8/include/block.h b/yolov8/include/block.h index ed4c6ad1..650cacda 100644 --- a/yolov8/include/block.h +++ b/yolov8/include/block.h @@ -27,4 +27,4 @@ nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map dets, const int* px_arry, - int px_arry_num, bool is_segmentation); + int px_arry_num, bool is_segmentation, bool is_pose); diff --git a/yolov8/include/config.h b/yolov8/include/config.h index d283860d..66ef7e4c 100644 --- a/yolov8/include/config.h +++ b/yolov8/include/config.h @@ -5,12 +5,14 @@ const static char* kInputTensorName = "images"; const static char* kOutputTensorName = "output"; const static int kNumClass = 80; +const static int kNumberOfPoints = 17; // number of keypoints total const static int kBatchSize = 1; const static int kGpuId = 0; const static int kInputH = 640; const static int kInputW = 640; const static float kNmsThresh = 0.45f; const static float kConfThresh = 0.5f; +const static float kConfThreshKeypoints = 0.5f; // keypoints confidence const static int kMaxInputImageSize = 3000 * 3000; const static int kMaxNumOutputBbox = 1000; diff --git a/yolov8/include/model.h b/yolov8/include/model.h index 0a2a40a3..6546aa54 100644 --- a/yolov8/include/model.h +++ b/yolov8/include/model.h @@ -21,3 +21,7 @@ nvinfer1::IHostMemory* buildEngineYolov8Cls(nvinfer1::IBuilder* builder, nvinfer nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); + +nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, + int& max_channels); diff --git a/yolov8/include/postprocess.h b/yolov8/include/postprocess.h index c6c8b92a..eb18d542 100644 --- a/yolov8/include/postprocess.h +++ 
b/yolov8/include/postprocess.h @@ -1,23 +1,30 @@ #pragma once -#include "types.h" -#include "NvInfer.h" #include +#include "NvInfer.h" +#include "types.h" cv::Rect get_rect(cv::Mat& img, float bbox[4]); -void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5); +void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh = 0.5); + +void batch_nms(std::vector>& batch_res, float* output, int batch_size, int output_size, + float conf_thresh, float nms_thresh = 0.5); -void batch_nms(std::vector>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5); +void draw_bbox(std::vector& img_batch, std::vector>& res_batch); -void draw_bbox(std::vector &img_batch, std::vector> &res_batch); +void draw_bbox_keypoints_line(std::vector& img_batch, std::vector>& res_batch); -void batch_process(std::vector> &res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch); +void batch_process(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, + int bbox_element, const std::vector& img_batch); -void process_decode_ptr_host(std::vector &res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count); +void process_decode_ptr_host(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, + int count); -void cuda_decode(float* predict, int num_bboxes, float confidence_threshold,float* parray,int max_objects, cudaStream_t stream); +void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects, + cudaStream_t stream); void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream); -void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, std::unordered_map& labels_map); +void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, + std::unordered_map& labels_map); diff --git a/yolov8/include/types.h 
b/yolov8/include/types.h index 1eac8f4b..472c7354 100644 --- a/yolov8/include/types.h +++ b/yolov8/include/types.h @@ -2,15 +2,17 @@ #include "config.h" struct alignas(float) Detection { - //center_x center_y w h - float bbox[4]; - float conf; // bbox_conf * cls_conf - float class_id; - float mask[32]; + //center_x center_y w h + float bbox[4]; + float conf; // bbox_conf * cls_conf + float class_id; + float mask[32]; + float keypoints[51]; // 17*3 keypoints }; struct AffineMatrix { float value[6]; }; -const int bbox_element = sizeof(AffineMatrix) / sizeof(float)+1; // left, top, right, bottom, confidence, class, keepflag +const int bbox_element = + sizeof(AffineMatrix) / sizeof(float) + 1; // left, top, right, bottom, confidence, class, keepflag diff --git a/yolov8/plugin/yololayer.cu b/yolov8/plugin/yololayer.cu index 9b786de7..592914fa 100755 --- a/yolov8/plugin/yololayer.cu +++ b/yolov8/plugin/yololayer.cu @@ -20,10 +20,18 @@ void read(const char*& buffer, T& val) { } } // namespace Tn +__device__ float sigmoid(float x) { + return 1.0f / (1.0f + exp(-x)); +} + namespace nvinfer1 { -YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, - const int* strides, int stridesLength) { +YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, + int netHeight, int maxOut, bool is_segmentation, bool is_pose, const int* strides, + int stridesLength) { + mClassCount = classCount; + mNumberofpoints = numberofpoints; + mConfthreshkeypoints = confthreshkeypoints; mYoloV8NetWidth = netWidth; mYoloV8netHeight = netHeight; mMaxOutObject = maxOut; @@ -31,6 +39,7 @@ YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, in mStrides = new int[stridesLength]; memcpy(mStrides, strides, stridesLength * sizeof(int)); is_segmentation_ = is_segmentation; + is_pose_ = is_pose; } YoloLayerPlugin::~YoloLayerPlugin() { @@ -44,6 +53,8 @@ 
YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { using namespace Tn; const char *d = reinterpret_cast(data), *a = d; read(d, mClassCount); + read(d, mNumberofpoints); + read(d, mConfthreshkeypoints); read(d, mThreadCount); read(d, mYoloV8NetWidth); read(d, mYoloV8netHeight); @@ -54,6 +65,7 @@ YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { read(d, mStrides[i]); } read(d, is_segmentation_); + read(d, is_pose_); assert(d == a + length); } @@ -63,6 +75,8 @@ void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { using namespace Tn; char *d = static_cast(buffer), *a = d; write(d, mClassCount); + write(d, mNumberofpoints); + write(d, mConfthreshkeypoints); write(d, mThreadCount); write(d, mYoloV8NetWidth); write(d, mYoloV8netHeight); @@ -72,13 +86,15 @@ void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { write(d, mStrides[i]); } write(d, is_segmentation_); + write(d, is_pose_); assert(d == a + getSerializationSize()); } size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { - return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + - sizeof(mMaxOutObject) + sizeof(mStridesLength) + sizeof(int) * mStridesLength + sizeof(is_segmentation_); + return sizeof(mClassCount) + sizeof(mNumberofpoints) + sizeof(mConfthreshkeypoints) + sizeof(mThreadCount) + + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(mStridesLength) + + sizeof(int) * mStridesLength + sizeof(is_segmentation_) + sizeof(is_pose_); } int YoloLayerPlugin::initialize() TRT_NOEXCEPT { @@ -133,14 +149,14 @@ const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { } void YoloLayerPlugin::destroy() TRT_NOEXCEPT { - delete this; } nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { - YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject, - is_segmentation_, mStrides, 
mStridesLength); + YoloLayerPlugin* p = + new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth, mYoloV8netHeight, + mMaxOutObject, is_segmentation_, is_pose_, mStrides, mStridesLength); p->setPluginNamespace(mPluginNamespace); return p; } @@ -157,15 +173,15 @@ __device__ float Logist(float data) { }; __global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h, - int grid_w, const int stride, int classes, int outputElem, bool is_segmentation) { + int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem, + bool is_segmentation, bool is_pose) { int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= numElements) return; + const int N_kpts = nk; int total_grid = grid_h * grid_w; - int info_len = 4 + classes; - if (is_segmentation) - info_len += 32; + int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? N_kpts * 3 : 0); int batchIdx = idx / total_grid; int elemIdx = idx % total_grid; const float* curInput = input + batchIdx * total_grid * info_len; @@ -200,8 +216,36 @@ __global__ void CalDetection(const float* input, float* output, int numElements, det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride; det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride; - for (int k = 0; is_segmentation && k < 32; k++) { - det->mask[k] = curInput[elemIdx + (k + 4 + classes) * total_grid]; + if (is_segmentation) { + for (int k = 0; k < 32; ++k) { + det->mask[k] = curInput[elemIdx + (4 + classes + k) * total_grid]; + } + } + + if (is_pose) { + for (int kpt = 0; kpt < N_kpts; kpt++) { + int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3) * total_grid; + int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3 + 1) * total_grid; + int kpt_conf_idx = (4 + classes + (is_segmentation ? 
32 : 0) + kpt * 3 + 2) * total_grid; + + float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]); + + float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride; + float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride; + + bool is_within_bbox = + kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3]; + + if (kpt_confidence < confkeypoints || !is_within_bbox) { + det->keypoints[kpt * 3] = -1; + det->keypoints[kpt * 3 + 1] = -1; + det->keypoints[kpt * 3 + 2] = -1; + } else { + det->keypoints[kpt * 3] = kpt_x; + det->keypoints[kpt * 3 + 1] = kpt_y; + det->keypoints[kpt * 3 + 2] = kpt_confidence; + } + } } } @@ -230,8 +274,8 @@ void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cuda mThreadCount = numElem; CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>( - inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, outputElem, - is_segmentation_); + inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, mNumberofpoints, + mConfthreshkeypoints, outputElem, is_segmentation_, is_pose_); } } @@ -260,16 +304,20 @@ IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFi assert(fc->nbFields == 1); assert(strcmp(fc->fields[0].name, "combinedInfo") == 0); const int* combinedInfo = static_cast(fc->fields[0].data); - int netinfo_count = 5; + int netinfo_count = 8; int class_count = combinedInfo[0]; - int input_w = combinedInfo[1]; - int input_h = combinedInfo[2]; - int max_output_object_count = combinedInfo[3]; - bool is_segmentation = combinedInfo[4]; + int numberofpoints = combinedInfo[1]; + float confthreshkeypoints = combinedInfo[2]; + int input_w = combinedInfo[3]; + int input_h = combinedInfo[4]; + int max_output_object_count = combinedInfo[5]; + bool is_segmentation = combinedInfo[6]; + bool is_pose = combinedInfo[7]; const int* px_arry = combinedInfo + netinfo_count; 
int px_arry_length = fc->fields[0].length - netinfo_count; - YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, is_segmentation, - px_arry, px_arry_length); + YoloLayerPlugin* obj = + new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h, + max_output_object_count, is_segmentation, is_pose, px_arry, px_arry_length); obj->setPluginNamespace(mNamespace.c_str()); return obj; } diff --git a/yolov8/plugin/yololayer.h b/yolov8/plugin/yololayer.h index 9496e7fc..b516ad87 100644 --- a/yolov8/plugin/yololayer.h +++ b/yolov8/plugin/yololayer.h @@ -6,8 +6,8 @@ namespace nvinfer1 { class API YoloLayerPlugin : public IPluginV2IOExt { public: - YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation, const int* strides, - int stridesLength); + YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight, + int maxOut, bool is_segmentation, bool is_pose, const int* strides, int stridesLength); YoloLayerPlugin(const void* data, size_t length); ~YoloLayerPlugin(); @@ -68,10 +68,13 @@ class API YoloLayerPlugin : public IPluginV2IOExt { int mThreadCount = 256; const char* mPluginNamespace; int mClassCount; + int mNumberofpoints; + float mConfthreshkeypoints; int mYoloV8NetWidth; int mYoloV8netHeight; int mMaxOutObject; bool is_segmentation_; + bool is_pose_; int* mStrides; int mStridesLength; }; diff --git a/yolov8/src/block.cpp b/yolov8/src/block.cpp index 713f86ef..75d280e7 100644 --- a/yolov8/src/block.cpp +++ b/yolov8/src/block.cpp @@ -220,18 +220,21 @@ nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map dets, const int* px_arry, - int px_arry_num, bool is_segmentation) { + int px_arry_num, bool is_segmentation, bool is_pose) { auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); - const int netinfo_count = 5; // Assuming the first 5 elements are for netinfo as per 
existing code. + const int netinfo_count = 8;  // Assuming the first 8 elements are for netinfo (pose fields added). const int total_count = netinfo_count + px_arry_num; // Total number of elements for netinfo and px_arry combined. std::vector combinedInfo(total_count); // Fill in the first 5 elements as per existing netinfo. combinedInfo[0] = kNumClass; - combinedInfo[1] = kInputW; - combinedInfo[2] = kInputH; - combinedInfo[3] = kMaxNumOutputBbox; - combinedInfo[4] = is_segmentation; + combinedInfo[1] = kNumberOfPoints; + combinedInfo[2] = kConfThreshKeypoints; + combinedInfo[3] = kInputW; + combinedInfo[4] = kInputH; + combinedInfo[5] = kMaxNumOutputBbox; + combinedInfo[6] = is_segmentation; + combinedInfo[7] = is_pose; // Copy the contents of px_arry into the combinedInfo vector after the initial 5 elements. std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count); diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 0b105c97..2e67bacb 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -1257,3 +1257,347 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer } return serialized_model; } + +nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, + int& max_channels) { + std::map weightMap = loadWeights(wts_path); + nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); + + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); + assert(data); + + 
/******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer* conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + printLayerDims(conv0, "conv0"); + nvinfer1::IElementWiseLayer* conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + printLayerDims(conv1, "conv1"); + nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + printLayerDims(conv2, "conv2"); + nvinfer1::IElementWiseLayer* conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + printLayerDims(conv3, "conv3"); + nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + printLayerDims(conv4, "conv4"); + nvinfer1::IElementWiseLayer* conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + printLayerDims(conv5, "conv5"); + nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + printLayerDims(conv6, "conv6"); + nvinfer1::IElementWiseLayer* conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); + printLayerDims(conv7, "conv7"); + nvinfer1::IElementWiseLayer* conv8 = + C2F(network, weightMap, *conv7->getOutput(0), 
get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + printLayerDims(conv8, "conv8"); + nvinfer1::IElementWiseLayer* conv9 = + SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.9"); + printLayerDims(conv9, "conv9"); + /******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + float scale[] = {1.0, 2.0, 2.0}; + nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0)); + printLayerDims(upsample10, "upsample10"); + assert(upsample10); + upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + printLayerDims(upsample10, "upsample10"); + upsample10->setScales(scale, 3); + printLayerDims(upsample10, "upsample10"); + + nvinfer1::ITensor* inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; + printTensorsDims(inputTensor11, 2, "inputTensor11"); + nvinfer1::IConcatenationLayer* cat11 = network->addConcatenation(inputTensor11, 2); + printLayerDims(cat11, "cat11"); + nvinfer1::IElementWiseLayer* conv12 = + C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); + printLayerDims(conv12, "conv12"); + + nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0)); + printLayerDims(upsample13, "upsample13"); + assert(upsample13); + upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + printLayerDims(upsample13, "upsample13"); + upsample13->setScales(scale, 3); + printLayerDims(upsample13, "upsample13"); + + nvinfer1::ITensor* inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)}; + 
printTensorsDims(inputTensor14, 2, "inputTensor14"); + nvinfer1::IConcatenationLayer* cat14 = network->addConcatenation(inputTensor14, 2); + printLayerDims(cat14, "cat14"); + nvinfer1::IElementWiseLayer* conv15 = + C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); + printLayerDims(conv15, "conv15"); + nvinfer1::IElementWiseLayer* conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.16"); + printLayerDims(conv16, "conv16"); + nvinfer1::ITensor* inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)}; + printTensorsDims(inputTensor17, 2, "inputTensor17"); + nvinfer1::IConcatenationLayer* cat17 = network->addConcatenation(inputTensor17, 2); + printLayerDims(cat17, "cat17"); + nvinfer1::IElementWiseLayer* conv18 = + C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); + printLayerDims(conv18, "conv18"); + nvinfer1::IElementWiseLayer* conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.19"); + printLayerDims(conv19, "conv19"); + nvinfer1::ITensor* inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)}; + printTensorsDims(inputTensor20, 2, "inputTensor20"); + nvinfer1::IConcatenationLayer* cat20 = network->addConcatenation(inputTensor20, 2); + printLayerDims(cat20, "cat20"); + nvinfer1::IElementWiseLayer* conv21 = + C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); + printLayerDims(conv21, "conv21"); + printf("\n\n"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT 
****************************************** + *******************************************************************************************************/ + int base_in_channel = (gw == 1.25) ? 80 : 64; + int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); + + // output0 + nvinfer1::IElementWiseLayer* conv22_cv2_0_0 = + convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0"); + printLayerDims(conv22_cv2_0_0, "conv22_cv2_0_0"); + nvinfer1::IElementWiseLayer* conv22_cv2_0_1 = + convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1"); + printLayerDims(conv22_cv2_0_1, "conv22_cv2_0_1"); + nvinfer1::IConvolutionLayer* conv22_cv2_0_2 = + network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]); + printLayerDims(conv22_cv2_0_2, "conv22_cv2_0_2"); + conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + printLayerDims(conv22_cv2_0_2, "conv22_cv2_0_2"); + conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + printLayerDims(conv22_cv2_0_2, "conv22_cv2_0_2"); + nvinfer1::IElementWiseLayer* conv22_cv3_0_0 = + convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0"); + printLayerDims(conv22_cv3_0_0, "conv22_cv3_0_0"); + nvinfer1::IElementWiseLayer* conv22_cv3_0_1 = convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.0.1"); + printLayerDims(conv22_cv3_0_1, "conv22_cv3_0_1"); + nvinfer1::IConvolutionLayer* conv22_cv3_0_2 = + network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]); + printLayerDims(conv22_cv3_0_2, "conv22_cv3_0_2"); + conv22_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); + printLayerDims(conv22_cv3_0_2, 
"conv22_cv3_0_2"); + conv22_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); + printLayerDims(conv22_cv3_0_2, "conv22_cv3_0_2"); + nvinfer1::ITensor* inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)}; + printTensorsDims(inputTensor22_0, 2, "inputTensor22_0"); + nvinfer1::IConcatenationLayer* cat22_0 = network->addConcatenation(inputTensor22_0, 2); + printLayerDims(cat22_0, "cat22_0"); + printf("\n\n"); + + // output1 + nvinfer1::IElementWiseLayer* conv22_cv2_1_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0"); + printLayerDims(conv22_cv2_1_0, "conv22_cv2_1_0"); + nvinfer1::IElementWiseLayer* conv22_cv2_1_1 = + convBnSiLU(network, weightMap, *conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1"); + printLayerDims(conv22_cv2_1_1, "conv22_cv2_1_1"); + nvinfer1::IConvolutionLayer* conv22_cv2_1_2 = + network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.1.2.weight"], weightMap["model.22.cv2.1.2.bias"]); + printLayerDims(conv22_cv2_1_2, "conv22_cv2_1_2"); + conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + printLayerDims(conv22_cv2_1_2, "conv22_cv2_1_2"); + conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + printLayerDims(conv22_cv2_1_2, "conv22_cv2_1_2"); + nvinfer1::IElementWiseLayer* conv22_cv3_1_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0"); + printLayerDims(conv22_cv3_1_0, "conv22_cv3_1_0"); + nvinfer1::IElementWiseLayer* conv22_cv3_1_1 = convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.1.1"); + printLayerDims(conv22_cv3_1_1, "conv22_cv3_1_1"); + nvinfer1::IConvolutionLayer* conv22_cv3_1_2 = + network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]); + 
printLayerDims(conv22_cv3_1_2, "conv22_cv3_1_2"); + conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + printLayerDims(conv22_cv3_1_2, "conv22_cv3_1_2"); + conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + printLayerDims(conv22_cv3_1_2, "conv22_cv3_1_2"); + nvinfer1::ITensor* inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)}; + printTensorsDims(inputTensor22_1, 2, "inputTensor22_1"); + nvinfer1::IConcatenationLayer* cat22_1 = network->addConcatenation(inputTensor22_1, 2); + printLayerDims(cat22_1, "cat22_1"); + printf("\n\n"); + + // output2 + nvinfer1::IElementWiseLayer* conv22_cv2_2_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0"); + printLayerDims(conv22_cv2_2_0, "conv22_cv2_2_0"); + nvinfer1::IElementWiseLayer* conv22_cv2_2_1 = + convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1"); + printLayerDims(conv22_cv2_2_1, "conv22_cv2_2_1"); + nvinfer1::IConvolutionLayer* conv22_cv2_2_2 = + network->addConvolution(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.2.2.weight"], weightMap["model.22.cv2.2.2.bias"]); + printLayerDims(conv22_cv2_2_2, "conv22_cv2_2_2"); + nvinfer1::IElementWiseLayer* conv22_cv3_2_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.0"); + printLayerDims(conv22_cv3_2_0, "conv22_cv3_2_0"); + nvinfer1::IElementWiseLayer* conv22_cv3_2_1 = convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.2.1"); + printLayerDims(conv22_cv3_2_1, "conv22_cv3_2_1"); + nvinfer1::IConvolutionLayer* conv22_cv3_2_2 = + network->addConvolution(*conv22_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]); + printLayerDims(conv22_cv3_2_2, "conv22_cv3_2_2"); + nvinfer1::ITensor* inputTensor22_2[] = 
{conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)}; + printTensorsDims(inputTensor22_2, 2, "inputTensor22_2"); + nvinfer1::IConcatenationLayer* cat22_2 = network->addConcatenation(inputTensor22_2, 2); + printLayerDims(cat22_2, "cat22_2"); + printf("\n\n"); + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + + nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); + int stridesLength = sizeof(strides) / sizeof(int); + + /**************************************************************************************P3****************************************************************************************************************************************/ + nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); + printLayerDims(shuffle22_0, "shuffle22_0"); + shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + printLayerDims(shuffle22_0, "shuffle22_0->setReshapeDimensions"); + nvinfer1::ISliceLayer* split22_0_0 = network->addSlice( + *shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + printLayerDims(split22_0_0, "split22_0_0"); + nvinfer1::ISliceLayer* split22_0_1 = network->addSlice( + *shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + printLayerDims(split22_0_1, "split22_0_1"); + nvinfer1::IShuffleLayer* dfl22_0 = + DFL(network, weightMap, 
*split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.22.dfl.conv.weight"); + printLayerDims(dfl22_0, "dfl22_0"); + + // det0 + printLayerDims(conv15, "conv15"); + std::cout << "conv15->getOutput(0)->getDimensions().d[0] : " << conv15->getOutput(0)->getDimensions().d[0] + << " (kInputH / strides[0]) * (kInputW / strides[0]) : " + << (kInputH / strides[0]) * (kInputW / strides[0]) << std::endl; + auto shuffle_conv15 = cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", + (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose"); + + printLayerDims(shuffle_conv15, "shuffle_conv15"); + + nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0), + shuffle_conv15->getOutput(0)}; + printTensorsDims(inputTensor22_dfl_0, 3, "inputTensor22_dfl_0"); + nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 3); + printLayerDims(cat22_dfl_0, "cat22_dfl_0"); + printf("\n\n"); + + /********************************************************************************************P4**********************************************************************************************************************************/ + nvinfer1::IShuffleLayer* shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); + printLayerDims(shuffle22_1, "shuffle22_1"); + shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + printLayerDims(shuffle22_1, "shuffle22_1->setReshapeDimensions"); + nvinfer1::ISliceLayer* split22_1_0 = network->addSlice( + *shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + printLayerDims(split22_1_0, "split22_1_0"); + nvinfer1::ISliceLayer* split22_1_1 = network->addSlice( + *shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW 
/ strides[1])}, nvinfer1::Dims2{1, 1}); + printLayerDims(split22_1_1, "split22_1_1"); + nvinfer1::IShuffleLayer* dfl22_1 = + DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.22.dfl.conv.weight"); + printLayerDims(dfl22_1, "dfl22_1"); + + // det1 + auto shuffle_conv18 = cv4_conv_combined(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1", + (kInputH / strides[1]) * (kInputW / strides[1]), gw, "pose"); + + nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0), + shuffle_conv18->getOutput(0)}; + printTensorsDims(inputTensor22_dfl_1, 3, "inputTensor22_dfl_1"); + nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3); + printLayerDims(cat22_dfl_1, "cat22_dfl_1"); + printf("\n\n"); + + /********************************************************************************************P5**********************************************************************************************************************************/ + nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); + printLayerDims(shuffle22_2, "shuffle22_2"); + shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + printLayerDims(shuffle22_2, "shuffle22_2->setReshapeDimensions"); + nvinfer1::ISliceLayer* split22_2_0 = network->addSlice( + *shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + printLayerDims(split22_2_0, "split22_2_0"); + nvinfer1::ISliceLayer* split22_2_1 = network->addSlice( + *shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + printLayerDims(split22_2_1, "split22_2_1"); + nvinfer1::IShuffleLayer* dfl22_2 = + DFL(network, weightMap, 
*split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.22.dfl.conv.weight"); + printLayerDims(dfl22_2, "dfl22_2"); + + // det2 + auto shuffle_conv21 = cv4_conv_combined(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", + (kInputH / strides[2]) * (kInputW / strides[2]), gw, "pose"); + nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0), + shuffle_conv21->getOutput(0)}; + printTensorsDims(inputTensor22_dfl_2, 3, "inputTensor22_dfl_2"); + nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3); + printLayerDims(cat22_dfl_2, "cat22_dfl_2"); + printf("\n\n"); + + nvinfer1::IPluginV2Layer* yolo = + addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, + strides, stridesLength, false, true); + printLayerDims(yolo, "yolo"); + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + builder->setMaxBatchSize(kBatchSize); + config->setMaxWorkspaceSize(16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto* calibrator = + new Int8EntropyCalibrator2(1, kInputW, kInputH, "../coco_calib/", "int8calib.table", kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" 
<< std::endl; + + delete network; + + for (auto& mem : weightMap) { + free((void*)(mem.second.values)); + } + return serialized_model; +} diff --git a/yolov8/src/postprocess.cpp b/yolov8/src/postprocess.cpp index 574e15b5..25f86cd3 100644 --- a/yolov8/src/postprocess.cpp +++ b/yolov8/src/postprocess.cpp @@ -28,6 +28,34 @@ cv::Rect get_rect(cv::Mat& img, float bbox[4]) { return cv::Rect(round(l), round(t), round(r - l), round(b - t)); } +cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[51]) { + int l, r, t, b; + float r_w = kInputW / (img.cols * 1.0); + float r_h = kInputH / (img.rows * 1.0); + if (r_h > r_w) { + l = bbox[0] / r_w; + r = bbox[2] / r_w; + t = (bbox[1] - (kInputH - r_w * img.rows) / 2) / r_w; + b = (bbox[3] - (kInputH - r_w * img.rows) / 2) / r_w; + for (int i = 0; i < 51; i += 3) { + lmk[i] /= r_w; + lmk[i + 1] = (lmk[i + 1] - (kInputH - r_w * img.rows) / 2) / r_w; + // lmk[i + 2] + } + } else { + l = (bbox[0] - (kInputW - r_h * img.cols) / 2) / r_h; + r = (bbox[2] - (kInputW - r_h * img.cols) / 2) / r_h; + t = bbox[1] / r_h; + b = bbox[3] / r_h; + for (int i = 0; i < 51; i += 3) { + lmk[i] = (lmk[i] - (kInputW - r_h * img.cols) / 2) / r_h; + lmk[i + 1] /= r_h; + // lmk[i + 2] + } + } + return cv::Rect(l, t, r - l, b - t); +} + static float iou(float lbox[4], float rbox[4]) { float interBox[] = { (std::max)(lbox[0], rbox[0]), @@ -130,6 +158,40 @@ void draw_bbox(std::vector& img_batch, std::vector& img_batch, std::vector>& res_batch) { + const std::vector> skeleton_pairs = { + {0, 1}, {0, 2}, {0, 5}, {0, 6}, {1, 2}, {1, 3}, {2, 4}, {5, 6}, {5, 7}, {5, 11}, + {6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}}; + + for (size_t i = 0; i < img_batch.size(); i++) { + auto& res = res_batch[i]; + cv::Mat img = img_batch[i]; + for (size_t j = 0; j < res.size(); j++) { + cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].keypoints); + cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); + 
cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, + cv::Scalar(0xFF, 0xFF, 0xFF), 2); + + for (int k = 0; k < 51; k += 3) { + if (res[j].keypoints[k + 2] > 0.5) { + cv::circle(img, cv::Point((int)res[j].keypoints[k], (int)res[j].keypoints[k + 1]), 3, + cv::Scalar(0, 0x27, 0xC1), -1); + } + } + + for (const auto& bone : skeleton_pairs) { + int kp1_idx = bone.first * 3; + int kp2_idx = bone.second * 3; + if (res[j].keypoints[kp1_idx + 2] > 0.5 && res[j].keypoints[kp2_idx + 2] > 0.5) { + cv::Point p1((int)res[j].keypoints[kp1_idx], (int)res[j].keypoints[kp1_idx + 1]); + cv::Point p2((int)res[j].keypoints[kp2_idx], (int)res[j].keypoints[kp2_idx + 1]); + cv::line(img, p1, p2, cv::Scalar(0, 0x27, 0xC1), 2); + } + } + } + } +} + cv::Mat scale_mask(cv::Mat mask, cv::Mat img) { int x, y, w, h; float r_w = kInputW / (img.cols * 1.0); diff --git a/yolov8/yolov8_pose.cpp b/yolov8/yolov8_pose.cpp new file mode 100644 index 00000000..5b848a6b --- /dev/null +++ b/yolov8/yolov8_pose.cpp @@ -0,0 +1,276 @@ + +#include +#include +#include +#include "cuda_utils.h" +#include "logging.h" +#include "model.h" +#include "postprocess.h" +#include "preprocess.h" +#include "utils.h" + +Logger gLogger; +using namespace nvinfer1; +const int kOutputSize = kMaxNumOutputBbox * (sizeof(Detection) - sizeof(float) * 32) / sizeof(float) + 1; + +void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd, + float& gw, int& max_channels) { + IBuilder* builder = createInferBuilder(gLogger); + IBuilderConfig* config = builder->createBuilderConfig(); + IHostMemory* serialized_engine = nullptr; + + if (is_p == 6) { + serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } else if (is_p == 2) { + serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } else { + serialized_engine = 
buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } + + assert(serialized_engine); + std::ofstream p(engine_name, std::ios::binary); + if (!p) { + std::cout << "could not open plan output file" << std::endl; + assert(false); + } + p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); + + delete serialized_engine; + delete config; + delete builder; +} + +void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, + IExecutionContext** context) { + std::ifstream file(engine_name, std::ios::binary); + if (!file.good()) { + std::cerr << "read " << engine_name << " error!" << std::endl; + assert(false); + } + size_t size = 0; + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + char* serialized_engine = new char[size]; + assert(serialized_engine); + file.read(serialized_engine, size); + file.close(); + + *runtime = createInferRuntime(gLogger); + assert(*runtime); + *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); + assert(*engine); + *context = (*engine)->createExecutionContext(); + assert(*context); + delete[] serialized_engine; +} + +void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, + float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device, + std::string cuda_post_process) { + assert(engine->getNbBindings() == 2); + // In order to bind the buffers, we need to know the names of the input and output tensors. 
+ // Note that indices are guaranteed to be less than IEngine::getNbBindings() + const int inputIndex = engine->getBindingIndex(kInputTensorName); + const int outputIndex = engine->getBindingIndex(kOutputTensorName); + assert(inputIndex == 0); + assert(outputIndex == 1); + // Create GPU buffers on device + CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); + CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); + if (cuda_post_process == "c") { + *output_buffer_host = new float[kBatchSize * kOutputSize]; + } else if (cuda_post_process == "g") { + if (kBatchSize > 1) { + std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; + exit(0); + } + // Allocate memory for decode_ptr_host and copy to device + *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; + CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); + } +} + +void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, + float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { + // infer on the batch asynchronously, and DMA output back to host + auto start = std::chrono::system_clock::now(); + context.enqueue(batchsize, buffers, stream, nullptr); + if (cuda_post_process == "c") { + CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, + stream)); + auto end = std::chrono::system_clock::now(); + std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() + << "ms" << std::endl; + } else if (cuda_post_process == "g") { + CUDA_CHECK( + cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); + cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); + 
cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms + CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, + sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, + stream)); + auto end = std::chrono::system_clock::now(); + std::cout << "inference and gpu postprocess time: " + << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; + } + + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir, + std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) { + if (argc < 4) + return false; + if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) { + wts = std::string(argv[2]); + engine = std::string(argv[3]); + auto sub_type = std::string(argv[4]); + + if (sub_type[0] == 'n') { + gd = 0.33; + gw = 0.25; + max_channels = 1024; + } else if (sub_type[0] == 's') { + gd = 0.33; + gw = 0.50; + max_channels = 1024; + } else if (sub_type[0] == 'm') { + gd = 0.67; + gw = 0.75; + max_channels = 576; + } else if (sub_type[0] == 'l') { + gd = 1.0; + gw = 1.0; + max_channels = 512; + } else if (sub_type[0] == 'x') { + gd = 1.0; + gw = 1.25; + max_channels = 640; + } else { + return false; + } + if (sub_type.size() == 2 && sub_type[1] == '6') { + is_p = 6; + } else if (sub_type.size() == 2 && sub_type[1] == '2') { + is_p = 2; + } + } else if (std::string(argv[1]) == "-d" && argc == 5) { + engine = std::string(argv[2]); + img_dir = std::string(argv[3]); + cuda_post_process = std::string(argv[4]); + } else { + return false; + } + return true; +} + +int main(int argc, char** argv) { + cudaSetDevice(kGpuId); + std::string wts_name = ""; + std::string engine_name = ""; + std::string img_dir; + std::string sub_type = ""; + std::string cuda_post_process = ""; + int model_bboxes; + int is_p = 0; + float gd = 0.0f, gw = 0.0f; + int max_channels = 0; + + if 
(!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw,
+ max_channels)) {
+ std::cerr << "Arguments not right!" << std::endl;
+ std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file"
+ << std::endl;
+ std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl;
+ return -1;
+ }
+
+ // Create a model using the API directly and serialize it to a file
+ if (!wts_name.empty()) {
+ serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels);
+ return 0;
+ }
+
+ // Deserialize the engine from file
+ IRuntime* runtime = nullptr;
+ ICudaEngine* engine = nullptr;
+ IExecutionContext* context = nullptr;
+ deserialize_engine(engine_name, &runtime, &engine, &context);
+ cudaStream_t stream;
+ CUDA_CHECK(cudaStreamCreate(&stream));
+ cuda_preprocess_init(kMaxInputImageSize);
+ auto out_dims = engine->getBindingDimensions(1);
+ model_bboxes = out_dims.d[0];
+ // Prepare cpu and gpu buffers
+ float* device_buffers[2];
+ float* output_buffer_host = nullptr;
+ float* decode_ptr_host = nullptr;
+ float* decode_ptr_device = nullptr;
+
+ // Read images from directory
+ std::vector file_names;
+ if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
+ std::cerr << "read_files_in_dir failed."
<< std::endl; + return -1; + } + + prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, + &decode_ptr_device, cuda_post_process); + + // batch predict + for (size_t i = 0; i < file_names.size(); i += kBatchSize) { + // Get a batch of images + std::vector img_batch; + std::vector img_name_batch; + for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { + cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); + img_batch.push_back(img); + img_name_batch.push_back(file_names[j]); + } + // Preprocess + cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); + // Run inference + infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, + decode_ptr_device, model_bboxes, cuda_post_process); + std::vector> res_batch; + if (cuda_post_process == "c") { + // NMS + batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); + } else if (cuda_post_process == "g") { + // Process gpu decode and nms results + // todo pose in gpu + std::cerr << "pose_postprocess is not support in gpu right now" << std::endl; + } + // Draw bounding boxes + draw_bbox_keypoints_line(img_batch, res_batch); + // Save images + for (size_t j = 0; j < img_batch.size(); j++) { + cv::imwrite("_" + img_name_batch[j], img_batch[j]); + } + } + + // Release stream and buffers + cudaStreamDestroy(stream); + CUDA_CHECK(cudaFree(device_buffers[0])); + CUDA_CHECK(cudaFree(device_buffers[1])); + CUDA_CHECK(cudaFree(decode_ptr_device)); + delete[] decode_ptr_host; + delete[] output_buffer_host; + cuda_preprocess_destroy(); + // Destroy the engine + delete context; + delete engine; + delete runtime; + + // Print histogram of the output distribution + //std::cout << "\nOutput:\n\n"; + //for (unsigned int i = 0; i < kOutputSize; i++) + //{ + // std::cout << prob[i] << ", "; + // if (i % 10 == 0) std::cout << std::endl; + //} + //std::cout << std::endl; + + 
return 0; +} From 77e3f057a2c9d17a514b791a09cd348976225a38 Mon Sep 17 00:00:00 2001 From: linds Date: Fri, 26 Apr 2024 17:27:37 +0800 Subject: [PATCH 12/24] yolov8 pose --- yolov8/yolov8_seg.cpp | 227 ++++++++++++++++++++++-------------------- 1 file changed, 119 insertions(+), 108 deletions(-) diff --git a/yolov8/yolov8_seg.cpp b/yolov8/yolov8_seg.cpp index 8ba71421..5a86f974 100644 --- a/yolov8/yolov8_seg.cpp +++ b/yolov8/yolov8_seg.cpp @@ -1,90 +1,87 @@ -#include #include +#include #include -#include "model.h" -#include "utils.h" -#include "preprocess.h" -#include "postprocess.h" #include "cuda_utils.h" #include "logging.h" +#include "model.h" +#include "postprocess.h" +#include "preprocess.h" +#include "utils.h" Logger gLogger; using namespace nvinfer1; -const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; +const int kOutputSize = kMaxNumOutputBbox * (sizeof(Detection) - sizeof(float) * 51) / sizeof(float) + 1; const static int kOutputSegSize = 32 * (kInputH / 4) * (kInputW / 4); static cv::Rect get_downscale_rect(float bbox[4], float scale) { - float left = bbox[0]; - float top = bbox[1]; - float right = bbox[0] + bbox[2]; - float bottom = bbox[1] + bbox[3]; - - left = left < 0 ? 0 : left; - top = top < 0 ? 0: top; - right = right > 640 ? 640 : right; - bottom = bottom > 640 ? 640: bottom; - - left /= scale; - top /= scale; - right /= scale; - bottom /= scale; - return cv::Rect(int(left), int(top), int(right - left), int(bottom - top)); + float left = bbox[0]; + float top = bbox[1]; + float right = bbox[0] + bbox[2]; + float bottom = bbox[1] + bbox[3]; + + left = left < 0 ? 0 : left; + top = top < 0 ? 0 : top; + right = right > 640 ? 640 : right; + bottom = bottom > 640 ? 
640 : bottom; + + left /= scale; + top /= scale; + right /= scale; + bottom /= scale; + return cv::Rect(int(left), int(top), int(right - left), int(bottom - top)); } std::vector process_mask(const float* proto, int proto_size, std::vector& dets) { - std::vector masks; - for (size_t i = 0; i < dets.size(); i++) { + std::vector masks; + for (size_t i = 0; i < dets.size(); i++) { - cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1); - auto r = get_downscale_rect(dets[i].bbox, 4); + cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1); + auto r = get_downscale_rect(dets[i].bbox, 4); - for (int x = r.x; x < r.x + r.width; x++) { - for (int y = r.y; y < r.y + r.height; y++) { - float e = 0.0f; - for (int j = 0; j < 32; j++) { - e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x]; + for (int x = r.x; x < r.x + r.width; x++) { + for (int y = r.y; y < r.y + r.height; y++) { + float e = 0.0f; + for (int j = 0; j < 32; j++) { + e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x]; + } + e = 1.0f / (1.0f + expf(-e)); + mask_mat.at(y, x) = e; + } } - e = 1.0f / (1.0f + expf(-e)); - mask_mat.at(y, x) = e; - } + cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH)); + masks.push_back(mask_mat); } - cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH)); - masks.push_back(mask_mat); - } - return masks; + return masks; } - -void serialize_engine(std::string &wts_name, std::string &engine_name, std::string &sub_type, float &gd, float &gw, int &max_channels) -{ - IBuilder *builder = createInferBuilder(gLogger); - IBuilderConfig *config = builder->createBuilderConfig(); - IHostMemory *serialized_engine = nullptr; +void serialize_engine(std::string& wts_name, std::string& engine_name, std::string& sub_type, float& gd, float& gw, + int& max_channels) { + IBuilder* builder = createInferBuilder(gLogger); + IBuilderConfig* config = builder->createBuilderConfig(); + IHostMemory* serialized_engine = 
nullptr; serialized_engine = buildEngineYolov8Seg(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); assert(serialized_engine); std::ofstream p(engine_name, std::ios::binary); - if (!p) - { + if (!p) { std::cout << "could not open plan output file" << std::endl; assert(false); } - p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); + p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); delete serialized_engine; delete config; delete builder; } -void deserialize_engine(std::string &engine_name, IRuntime **runtime, ICudaEngine **engine, IExecutionContext **context) -{ +void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, + IExecutionContext** context) { std::ifstream file(engine_name, std::ios::binary); - if (!file.good()) - { + if (!file.good()) { std::cerr << "read " << engine_name << " error!" << std::endl; assert(false); } @@ -92,7 +89,7 @@ void deserialize_engine(std::string &engine_name, IRuntime **runtime, ICudaEngin file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); - char *serialized_engine = new char[size]; + char* serialized_engine = new char[size]; assert(serialized_engine); file.read(serialized_engine, size); file.close(); @@ -106,8 +103,9 @@ void deserialize_engine(std::string &engine_name, IRuntime **runtime, ICudaEngin delete[] serialized_engine; } -void prepare_buffer(ICudaEngine *engine, float **input_buffer_device, float **output_buffer_device, float **output_seg_buffer_device, - float **output_buffer_host,float **output_seg_buffer_host ,float **decode_ptr_host, float **decode_ptr_device, std::string cuda_post_process) { +void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, + float** output_seg_buffer_device, float** output_buffer_host, float** output_seg_buffer_host, + float** decode_ptr_host, float** decode_ptr_device, std::string cuda_post_process) { 
assert(engine->getNbBindings() == 3); // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings() @@ -119,10 +117,10 @@ void prepare_buffer(ICudaEngine *engine, float **input_buffer_device, float **ou assert(outputIndex == 1); assert(outputIndex_seg == 2); // Create GPU buffers on device - CUDA_CHECK(cudaMalloc((void **) input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); - CUDA_CHECK(cudaMalloc((void **) output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); - CUDA_CHECK(cudaMalloc((void **) output_seg_buffer_device, kBatchSize * kOutputSegSize * sizeof(float))); - + CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); + CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); + CUDA_CHECK(cudaMalloc((void**)output_seg_buffer_device, kBatchSize * kOutputSegSize * sizeof(float))); + if (cuda_post_process == "c") { *output_buffer_host = new float[kBatchSize * kOutputSize]; *output_seg_buffer_host = new float[kBatchSize * kOutputSegSize]; @@ -133,38 +131,47 @@ void prepare_buffer(ICudaEngine *engine, float **input_buffer_device, float **ou } // Allocate memory for decode_ptr_host and copy to device *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; - CUDA_CHECK(cudaMalloc((void **)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); + CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); } } -void infer(IExecutionContext &context, cudaStream_t &stream, void **buffers, float *output, float *output_seg,int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { +void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, float* output_seg, + int batchsize, 
float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, + std::string cuda_post_process) { // infer on the batch asynchronously, and DMA output back to host auto start = std::chrono::system_clock::now(); context.enqueue(batchsize, buffers, stream, nullptr); if (cuda_post_process == "c") { - std::cout << "kOutputSize:" << kOutputSize <(end - start).count() << "ms" << std::endl; + std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() + << "ms" << std::endl; } else if (cuda_post_process == "g") { - CUDA_CHECK(cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); - cuda_decode((float *)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); - cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);//cuda nms - CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK( + cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); + cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); + cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms + CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, + sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, + stream)); auto end = std::chrono::system_clock::now(); - std::cout << "inference and gpu postprocess time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; + std::cout << "inference and gpu postprocess time: " + << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; } CUDA_CHECK(cudaStreamSynchronize(stream)); } -bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, std::string &img_dir, std::string &sub_type, - std::string &cuda_post_process, std::string& 
labels_filename, float &gd, float &gw, int &max_channels) -{ +bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, std::string& img_dir, + std::string& sub_type, std::string& cuda_post_process, std::string& labels_filename, float& gd, + float& gw, int& max_channels) { if (argc < 4) return false; if (std::string(argv[1]) == "-s" && argc == 5) { @@ -172,40 +179,40 @@ bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, st engine = std::string(argv[3]); sub_type = std::string(argv[4]); if (sub_type == "n") { - gd = 0.33; - gw = 0.25; - max_channels = 1024; + gd = 0.33; + gw = 0.25; + max_channels = 1024; } else if (sub_type == "s") { - gd = 0.33; - gw = 0.50; - max_channels = 1024; + gd = 0.33; + gw = 0.50; + max_channels = 1024; } else if (sub_type == "m") { - gd = 0.67; - gw = 0.75; - max_channels = 576; + gd = 0.67; + gw = 0.75; + max_channels = 576; } else if (sub_type == "l") { - gd = 1.0; - gw = 1.0; - max_channels = 512; + gd = 1.0; + gw = 1.0; + max_channels = 512; } else if (sub_type == "x") { - gd = 1.0; - gw = 1.25; - max_channels = 640; - } else{ - return false; + gd = 1.0; + gw = 1.25; + max_channels = 640; + } else { + return false; } } else if (std::string(argv[1]) == "-d" && argc == 6) { - engine = std::string(argv[2]); - img_dir = std::string(argv[3]); - cuda_post_process = std::string(argv[4]); - labels_filename = std::string(argv[5]); + engine = std::string(argv[2]); + img_dir = std::string(argv[3]); + cuda_post_process = std::string(argv[4]); + labels_filename = std::string(argv[5]); } else { - return false; + return false; } return true; } -int main(int argc, char **argv) { +int main(int argc, char** argv) { cudaSetDevice(kGpuId); std::string wts_name = ""; std::string engine_name = ""; @@ -217,10 +224,12 @@ int main(int argc, char **argv) { float gd = 0.0f, gw = 0.0f; int max_channels = 0; - if (!parse_args(argc, argv, wts_name, engine_name, img_dir, sub_type, cuda_post_process, labels_filename, 
gd, gw, max_channels)) { + if (!parse_args(argc, argv, wts_name, engine_name, img_dir, sub_type, cuda_post_process, labels_filename, gd, gw, + max_channels)) { std::cerr << "Arguments not right!" << std::endl; std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x] // serialize model to plan file" << std::endl; - std::cerr << "./yolov8 -d [.engine] ../samples [c/g] coco_file// deserialize plan file and run inference" << std::endl; + std::cerr << "./yolov8 -d [.engine] ../samples [c/g] coco_file// deserialize plan file and run inference" + << std::endl; return -1; } @@ -230,10 +239,10 @@ int main(int argc, char **argv) { return 0; } - // Deserialize the engine from file - IRuntime *runtime = nullptr; - ICudaEngine *engine = nullptr; - IExecutionContext *context = nullptr; + // Deserialize the engine from file + IRuntime* runtime = nullptr; + ICudaEngine* engine = nullptr; + IExecutionContext* context = nullptr; deserialize_engine(engine_name, &runtime, &engine, &context); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -241,11 +250,11 @@ int main(int argc, char **argv) { auto out_dims = engine->getBindingDimensions(1); model_bboxes = out_dims.d[0]; // Prepare cpu and gpu buffers - float *device_buffers[3]; - float *output_buffer_host = nullptr; - float *output_seg_buffer_host = nullptr; - float *decode_ptr_host=nullptr; - float *decode_ptr_device=nullptr; + float* device_buffers[3]; + float* output_buffer_host = nullptr; + float* output_seg_buffer_host = nullptr; + float* decode_ptr_host = nullptr; + float* decode_ptr_device = nullptr; // Read images from directory std::vector file_names; @@ -258,7 +267,8 @@ int main(int argc, char **argv) { read_labels(labels_filename, labels_map); assert(kNumClass == labels_map.size()); - prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &device_buffers[2], &output_buffer_host, &output_seg_buffer_host,&decode_ptr_host, &decode_ptr_device, cuda_post_process); + prepare_buffer(engine, &device_buffers[0], 
&device_buffers[1], &device_buffers[2], &output_buffer_host, + &output_seg_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); // // batch predict for (size_t i = 0; i < file_names.size(); i += kBatchSize) { @@ -273,7 +283,8 @@ int main(int argc, char **argv) { // Preprocess cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); // Run inference - infer(*context, stream, (void **)device_buffers, output_buffer_host, output_seg_buffer_host,kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); + infer(*context, stream, (void**)device_buffers, output_buffer_host, output_seg_buffer_host, kBatchSize, + decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); std::vector> res_batch; if (cuda_post_process == "c") { // NMS From 16088b37227433bc8eaed382120f6472b793f9f2 Mon Sep 17 00:00:00 2001 From: linds Date: Fri, 26 Apr 2024 17:30:22 +0800 Subject: [PATCH 13/24] yolov8 pose --- yolov8/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yolov8/README.md b/yolov8/README.md index 8f198c65..8a42fe18 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -111,7 +111,7 @@ sudo ./yolov8_cls -d yolov8n-cls.engine ../samples # Build and serialize TensorRT engine ./yolov8_pose -s yolov8n-pose.wts yolov8n-pose.engine n -# Run inference with labels file +# Run inference ./yolov8_pose -d yolov8n-pose.engine ../images c ``` From 7e3d564d75bad6ec5a422f5927865eb10605f3e0 Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:22:51 +0800 Subject: [PATCH 14/24] yolov8 pose --- yolov8/src/model.cpp | 94 -------------------------------------------- 1 file changed, 94 deletions(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 2e67bacb..b7b14bbe 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -1275,96 +1275,65 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe 
*******************************************************************************************************/ nvinfer1::IElementWiseLayer* conv0 = convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); - printLayerDims(conv0, "conv0"); nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); - printLayerDims(conv1, "conv1"); nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); - printLayerDims(conv2, "conv2"); nvinfer1::IElementWiseLayer* conv3 = convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); - printLayerDims(conv3, "conv3"); nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); - printLayerDims(conv4, "conv4"); nvinfer1::IElementWiseLayer* conv5 = convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); - printLayerDims(conv5, "conv5"); nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); - printLayerDims(conv6, "conv6"); nvinfer1::IElementWiseLayer* conv7 = convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); - printLayerDims(conv7, "conv7"); nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); - printLayerDims(conv8, "conv8"); nvinfer1::IElementWiseLayer* conv9 = SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, 
max_channels), get_width(1024, gw, max_channels), 5, "model.9"); - printLayerDims(conv9, "conv9"); /******************************************************************************************************* ********************************************* YOLOV8 HEAD ******************************************** *******************************************************************************************************/ float scale[] = {1.0, 2.0, 2.0}; nvinfer1::IResizeLayer* upsample10 = network->addResize(*conv9->getOutput(0)); - printLayerDims(upsample10, "upsample10"); assert(upsample10); upsample10->setResizeMode(nvinfer1::ResizeMode::kNEAREST); - printLayerDims(upsample10, "upsample10"); upsample10->setScales(scale, 3); - printLayerDims(upsample10, "upsample10"); nvinfer1::ITensor* inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; - printTensorsDims(inputTensor11, 2, "inputTensor11"); nvinfer1::IConcatenationLayer* cat11 = network->addConcatenation(inputTensor11, 2); - printLayerDims(cat11, "cat11"); nvinfer1::IElementWiseLayer* conv12 = C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); - printLayerDims(conv12, "conv12"); nvinfer1::IResizeLayer* upsample13 = network->addResize(*conv12->getOutput(0)); - printLayerDims(upsample13, "upsample13"); assert(upsample13); upsample13->setResizeMode(nvinfer1::ResizeMode::kNEAREST); - printLayerDims(upsample13, "upsample13"); upsample13->setScales(scale, 3); - printLayerDims(upsample13, "upsample13"); nvinfer1::ITensor* inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)}; - printTensorsDims(inputTensor14, 2, "inputTensor14"); nvinfer1::IConcatenationLayer* cat14 = network->addConcatenation(inputTensor14, 2); - printLayerDims(cat14, "cat14"); nvinfer1::IElementWiseLayer* conv15 = C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels), get_width(256, gw, max_channels), 
get_depth(3, gd), false, 0.5, "model.15"); - printLayerDims(conv15, "conv15"); nvinfer1::IElementWiseLayer* conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.16"); - printLayerDims(conv16, "conv16"); nvinfer1::ITensor* inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)}; - printTensorsDims(inputTensor17, 2, "inputTensor17"); nvinfer1::IConcatenationLayer* cat17 = network->addConcatenation(inputTensor17, 2); - printLayerDims(cat17, "cat17"); nvinfer1::IElementWiseLayer* conv18 = C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels), get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); - printLayerDims(conv18, "conv18"); nvinfer1::IElementWiseLayer* conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.19"); - printLayerDims(conv19, "conv19"); nvinfer1::ITensor* inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)}; - printTensorsDims(inputTensor20, 2, "inputTensor20"); nvinfer1::IConcatenationLayer* cat20 = network->addConcatenation(inputTensor20, 2); - printLayerDims(cat20, "cat20"); nvinfer1::IElementWiseLayer* conv21 = C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels), get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); - printLayerDims(conv21, "conv21"); - printf("\n\n"); /******************************************************************************************************* ********************************************* YOLOV8 OUTPUT ****************************************** @@ -1375,99 +1344,64 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe // output0 nvinfer1::IElementWiseLayer* conv22_cv2_0_0 = convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0"); - printLayerDims(conv22_cv2_0_0, "conv22_cv2_0_0"); nvinfer1::IElementWiseLayer* 
conv22_cv2_0_1 = convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1"); - printLayerDims(conv22_cv2_0_1, "conv22_cv2_0_1"); nvinfer1::IConvolutionLayer* conv22_cv2_0_2 = network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]); - printLayerDims(conv22_cv2_0_2, "conv22_cv2_0_2"); conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); - printLayerDims(conv22_cv2_0_2, "conv22_cv2_0_2"); conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); - printLayerDims(conv22_cv2_0_2, "conv22_cv2_0_2"); nvinfer1::IElementWiseLayer* conv22_cv3_0_0 = convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0"); - printLayerDims(conv22_cv3_0_0, "conv22_cv3_0_0"); nvinfer1::IElementWiseLayer* conv22_cv3_0_1 = convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.1"); - printLayerDims(conv22_cv3_0_1, "conv22_cv3_0_1"); nvinfer1::IConvolutionLayer* conv22_cv3_0_2 = network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]); - printLayerDims(conv22_cv3_0_2, "conv22_cv3_0_2"); conv22_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); - printLayerDims(conv22_cv3_0_2, "conv22_cv3_0_2"); conv22_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); - printLayerDims(conv22_cv3_0_2, "conv22_cv3_0_2"); nvinfer1::ITensor* inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)}; - printTensorsDims(inputTensor22_0, 2, "inputTensor22_0"); nvinfer1::IConcatenationLayer* cat22_0 = network->addConcatenation(inputTensor22_0, 2); - printLayerDims(cat22_0, "cat22_0"); - printf("\n\n"); // output1 nvinfer1::IElementWiseLayer* conv22_cv2_1_0 = convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0"); - 
printLayerDims(conv22_cv2_1_0, "conv22_cv2_1_0"); nvinfer1::IElementWiseLayer* conv22_cv2_1_1 = convBnSiLU(network, weightMap, *conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1"); - printLayerDims(conv22_cv2_1_1, "conv22_cv2_1_1"); nvinfer1::IConvolutionLayer* conv22_cv2_1_2 = network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.1.2.weight"], weightMap["model.22.cv2.1.2.bias"]); - printLayerDims(conv22_cv2_1_2, "conv22_cv2_1_2"); conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); - printLayerDims(conv22_cv2_1_2, "conv22_cv2_1_2"); conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); - printLayerDims(conv22_cv2_1_2, "conv22_cv2_1_2"); nvinfer1::IElementWiseLayer* conv22_cv3_1_0 = convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0"); - printLayerDims(conv22_cv3_1_0, "conv22_cv3_1_0"); nvinfer1::IElementWiseLayer* conv22_cv3_1_1 = convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.1"); - printLayerDims(conv22_cv3_1_1, "conv22_cv3_1_1"); nvinfer1::IConvolutionLayer* conv22_cv3_1_2 = network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]); - printLayerDims(conv22_cv3_1_2, "conv22_cv3_1_2"); conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); - printLayerDims(conv22_cv3_1_2, "conv22_cv3_1_2"); conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); - printLayerDims(conv22_cv3_1_2, "conv22_cv3_1_2"); nvinfer1::ITensor* inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)}; - printTensorsDims(inputTensor22_1, 2, "inputTensor22_1"); nvinfer1::IConcatenationLayer* cat22_1 = network->addConcatenation(inputTensor22_1, 2); - printLayerDims(cat22_1, "cat22_1"); - printf("\n\n"); // output2 nvinfer1::IElementWiseLayer* conv22_cv2_2_0 = convBnSiLU(network, 
weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0"); - printLayerDims(conv22_cv2_2_0, "conv22_cv2_2_0"); nvinfer1::IElementWiseLayer* conv22_cv2_2_1 = convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1"); - printLayerDims(conv22_cv2_2_1, "conv22_cv2_2_1"); nvinfer1::IConvolutionLayer* conv22_cv2_2_2 = network->addConvolution(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv2.2.2.weight"], weightMap["model.22.cv2.2.2.bias"]); - printLayerDims(conv22_cv2_2_2, "conv22_cv2_2_2"); nvinfer1::IElementWiseLayer* conv22_cv3_2_0 = convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.0"); - printLayerDims(conv22_cv3_2_0, "conv22_cv3_2_0"); nvinfer1::IElementWiseLayer* conv22_cv3_2_1 = convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.1"); - printLayerDims(conv22_cv3_2_1, "conv22_cv3_2_1"); nvinfer1::IConvolutionLayer* conv22_cv3_2_2 = network->addConvolution(*conv22_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]); - printLayerDims(conv22_cv3_2_2, "conv22_cv3_2_2"); nvinfer1::ITensor* inputTensor22_2[] = {conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)}; - printTensorsDims(inputTensor22_2, 2, "inputTensor22_2"); nvinfer1::IConcatenationLayer* cat22_2 = network->addConcatenation(inputTensor22_2, 2); - printLayerDims(cat22_2, "cat22_2"); - printf("\n\n"); /******************************************************************************************************* ********************************************* YOLOV8 DETECT ****************************************** *******************************************************************************************************/ @@ -1479,56 +1413,40 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe 
/**************************************************************************************P3****************************************************************************************************************************************/ nvinfer1::IShuffleLayer* shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); - printLayerDims(shuffle22_0, "shuffle22_0"); shuffle22_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); - printLayerDims(shuffle22_0, "shuffle22_0->setReshapeDimensions"); nvinfer1::ISliceLayer* split22_0_0 = network->addSlice( *shuffle22_0->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); - printLayerDims(split22_0_0, "split22_0_0"); nvinfer1::ISliceLayer* split22_0_1 = network->addSlice( *shuffle22_0->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); - printLayerDims(split22_0_1, "split22_0_1"); nvinfer1::IShuffleLayer* dfl22_0 = DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.22.dfl.conv.weight"); - printLayerDims(dfl22_0, "dfl22_0"); // det0 - printLayerDims(conv15, "conv15"); std::cout << "conv15->getOutput(0)->getDimensions().d[0] : " << conv15->getOutput(0)->getDimensions().d[0] << " (kInputH / strides[0]) * (kInputW / strides[0]) : " << (kInputH / strides[0]) * (kInputW / strides[0]) << std::endl; auto shuffle_conv15 = cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose"); - printLayerDims(shuffle_conv15, "shuffle_conv15"); - nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0), shuffle_conv15->getOutput(0)}; - printTensorsDims(inputTensor22_dfl_0, 3, "inputTensor22_dfl_0"); nvinfer1::IConcatenationLayer* cat22_dfl_0 = 
network->addConcatenation(inputTensor22_dfl_0, 3); - printLayerDims(cat22_dfl_0, "cat22_dfl_0"); - printf("\n\n"); /********************************************************************************************P4**********************************************************************************************************************************/ nvinfer1::IShuffleLayer* shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); - printLayerDims(shuffle22_1, "shuffle22_1"); shuffle22_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); - printLayerDims(shuffle22_1, "shuffle22_1->setReshapeDimensions"); nvinfer1::ISliceLayer* split22_1_0 = network->addSlice( *shuffle22_1->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); - printLayerDims(split22_1_0, "split22_1_0"); nvinfer1::ISliceLayer* split22_1_1 = network->addSlice( *shuffle22_1->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); - printLayerDims(split22_1_1, "split22_1_1"); nvinfer1::IShuffleLayer* dfl22_1 = DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, 1, 0, "model.22.dfl.conv.weight"); - printLayerDims(dfl22_1, "dfl22_1"); // det1 auto shuffle_conv18 = cv4_conv_combined(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1", @@ -1536,43 +1454,31 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0), shuffle_conv18->getOutput(0)}; - printTensorsDims(inputTensor22_dfl_1, 3, "inputTensor22_dfl_1"); nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3); - printLayerDims(cat22_dfl_1, "cat22_dfl_1"); - printf("\n\n"); 
/********************************************************************************************P5**********************************************************************************************************************************/ nvinfer1::IShuffleLayer* shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); - printLayerDims(shuffle22_2, "shuffle22_2"); shuffle22_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); - printLayerDims(shuffle22_2, "shuffle22_2->setReshapeDimensions"); nvinfer1::ISliceLayer* split22_2_0 = network->addSlice( *shuffle22_2->getOutput(0), nvinfer1::Dims2{0, 0}, nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); - printLayerDims(split22_2_0, "split22_2_0"); nvinfer1::ISliceLayer* split22_2_1 = network->addSlice( *shuffle22_2->getOutput(0), nvinfer1::Dims2{64, 0}, nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); - printLayerDims(split22_2_1, "split22_2_1"); nvinfer1::IShuffleLayer* dfl22_2 = DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, 1, 0, "model.22.dfl.conv.weight"); - printLayerDims(dfl22_2, "dfl22_2"); // det2 auto shuffle_conv21 = cv4_conv_combined(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", (kInputH / strides[2]) * (kInputW / strides[2]), gw, "pose"); nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0), shuffle_conv21->getOutput(0)}; - printTensorsDims(inputTensor22_dfl_2, 3, "inputTensor22_dfl_2"); nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3); - printLayerDims(cat22_dfl_2, "cat22_dfl_2"); - printf("\n\n"); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, strides, stridesLength, false, true); - printLayerDims(yolo, "yolo"); 
yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); From 255c191860fcf4da4cee5c9c7d02c1e50cb10241 Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:25:46 +0800 Subject: [PATCH 15/24] yolov8 pose --- yolov8/src/model.cpp | 92 +++++++++++++++++++++++++++++++++----------- 1 file changed, 70 insertions(+), 22 deletions(-) diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index b7b14bbe..f2930e8e 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -6,6 +6,31 @@ #include "config.h" #include "model.h" +// Get the dimensions of the layer's output. +nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); + +// Print the layer's name and output dimensions. +std::cout << "name: " << layerName << " Layer name: " << layer->getName() << " Output Dims: "; +for (int i = 0; i < dims.nbDims; ++i) { + std::cout << dims.d[i] << (i < dims.nbDims - 1 ? "x" : ""); +} +std::cout << std::endl; +} + +for (int t = 0; t < numTensors; ++t) { + std::cout << tensorsName << "[" << t << "]: "; + if (tensors[t] != nullptr) { + nvinfer1::Dims dims = tensors[t]->getDimensions(); + for (int i = 0; i < dims.nbDims; ++i) { + std::cout << dims.d[i] << (i < dims.nbDims - 1 ? "x" : ""); + } + std::cout << std::endl; + } else { + std::cout << "nullptr" << std::endl; + } +} +} + static int get_width(int x, float gw, int max_channels, int divisor = 8) { auto channel = int(ceil((x * gw) / divisor)) * divisor; return channel >= max_channels ? 
max_channels : channel; @@ -47,30 +72,49 @@ static nvinfer1::IElementWiseLayer* Proto(nvinfer1::INetworkDefinition* network, return cv3; } -static nvinfer1::IShuffleLayer* ProtoCoef(nvinfer1::INetworkDefinition* network, - std::map& weightMap, nvinfer1::ITensor& input, - std::string lname, int grid_shape, float gw) { - +static nvinfer1::IShuffleLayer* cv4_conv_combined(nvinfer1::INetworkDefinition* network, + std::map& weightMap, + nvinfer1::ITensor& input, std::string lname, int grid_shape, float gw, + std::string algo_type) { int mid_channle = 0; - if (gw == 0.25 || gw == 0.5) { - mid_channle = 32; - } else if (gw == 0.75) { - mid_channle = 48; - } else if (gw == 1.00) { - mid_channle = 64; - } else if (gw == 1.25) { - mid_channle = 80; + int output_channel = 0; + + if (algo_type == "seg") { + if (gw == 0.25 || gw == 0.5) { + mid_channle = 32; + } else if (gw == 0.75) { + mid_channle = 48; + } else if (gw == 1.00) { + mid_channle = 64; + } else if (gw == 1.25) { + mid_channle = 80; + } + + output_channel = 32; + + } else if (algo_type == "pose") { + if (gw == 0.25 || gw == 0.5 || gw == 0.75) { + mid_channle = 51; + } else if (gw == 1.00) { + mid_channle = 64; + } else if (gw == 1.25) { + mid_channle = 80; + } + + output_channel = 51; } + auto cv0 = convBnSiLU(network, weightMap, input, mid_channle, 3, 1, 1, lname + ".0"); auto cv1 = convBnSiLU(network, weightMap, *cv0->getOutput(0), mid_channle, 3, 1, 1, lname + ".1"); float* cv2_bais_value = (float*)weightMap[lname + ".2" + ".bias"].values; int cv2_bais_len = weightMap[lname + ".2" + ".bias"].count; nvinfer1::Weights cv2_bais{nvinfer1::DataType::kFLOAT, cv2_bais_value, cv2_bais_len}; - auto cv2 = network->addConvolutionNd(*cv1->getOutput(0), 32, nvinfer1::DimsHW{1, 1}, + auto cv2 = network->addConvolutionNd(*cv1->getOutput(0), output_channel, nvinfer1::DimsHW{1, 1}, weightMap[lname + ".2" + ".weight"], cv2_bais); cv2->setStrideNd(nvinfer1::DimsHW{1, 1}); nvinfer1::IShuffleLayer* cv2_shuffle = 
network->addShuffle(*cv2->getOutput(0)); - cv2_shuffle->setReshapeDimensions(nvinfer1::Dims2{32, grid_shape}); + cv2_shuffle->setReshapeDimensions(nvinfer1::Dims2{output_channel, grid_shape}); + return cv2_shuffle; } @@ -278,7 +322,8 @@ nvinfer1::IHostMemory* buildEngineYolov8Det(nvinfer1::IBuilder* builder, nvinfer nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, - strides, stridesLength, false); + strides, stridesLength, false, false); + yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); @@ -588,7 +633,7 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP6(nvinfer1::IBuilder* builder, nvinf nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( network, std::vector{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3}, - strides, stridesLength, false); + strides, stridesLength, false, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); @@ -847,6 +892,7 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf nvinfer1::IShuffleLayer* dfl28_0 = DFL(network, weightMap, *split28_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor* inputTensor28_dfl_0[] = {dfl28_0->getOutput(0), split28_0_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat28_dfl_0 = network->addConcatenation(inputTensor28_dfl_0, 2); @@ -897,8 +943,7 @@ nvinfer1::IHostMemory* buildEngineYolov8DetP2(nvinfer1::IBuilder* builder, nvinf nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( network, std::vector{cat28_dfl_0, cat28_dfl_1, cat28_dfl_2, cat28_dfl_3}, - strides, stridesLength, false); - + strides, stridesLength, false, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); @@ -1205,26 +1250,29 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer 1, 0, "model.22.dfl.conv.weight"); // det0 - auto 
proto_coef_0 = ProtoCoef(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", 6400, gw); + auto proto_coef_0 = cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", + (kInputH / strides[0]) * (kInputW / strides[0]), gw, "seg"); nvinfer1::ITensor* inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0), proto_coef_0->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 3); // det1 - auto proto_coef_1 = ProtoCoef(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1", 1600, gw); + auto proto_coef_1 = cv4_conv_combined(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1", + (kInputH / strides[1]) * (kInputW / strides[1]), gw, "seg"); nvinfer1::ITensor* inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0), proto_coef_1->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3); // det2 - auto proto_coef_2 = ProtoCoef(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", 400, gw); + auto proto_coef_2 = cv4_conv_combined(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", + (kInputH / strides[2]) * (kInputW / strides[2]), gw, "seg"); nvinfer1::ITensor* inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0), proto_coef_2->getOutput(0)}; nvinfer1::IConcatenationLayer* cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3); nvinfer1::IPluginV2Layer* yolo = addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, - strides, stridesLength, true); + strides, stridesLength, true, false); yolo->getOutput(0)->setName(kOutputTensorName); network->markOutput(*yolo->getOutput(0)); From a8ea19b814ee0752adb8b1ecb5a44396566e1688 Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:40:01 +0800 Subject: [PATCH 16/24] yolov8 pose --- yolov8/src/model.cpp | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git 
a/yolov8/src/model.cpp b/yolov8/src/model.cpp index f2930e8e..4cc4088e 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -6,31 +6,6 @@ #include "config.h" #include "model.h" -// Get the dimensions of the layer's output. -nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); - -// Print the layer's name and output dimensions. -std::cout << "name: " << layerName << " Layer name: " << layer->getName() << " Output Dims: "; -for (int i = 0; i < dims.nbDims; ++i) { - std::cout << dims.d[i] << (i < dims.nbDims - 1 ? "x" : ""); -} -std::cout << std::endl; -} - -for (int t = 0; t < numTensors; ++t) { - std::cout << tensorsName << "[" << t << "]: "; - if (tensors[t] != nullptr) { - nvinfer1::Dims dims = tensors[t]->getDimensions(); - for (int i = 0; i < dims.nbDims; ++i) { - std::cout << dims.d[i] << (i < dims.nbDims - 1 ? "x" : ""); - } - std::cout << std::endl; - } else { - std::cout << "nullptr" << std::endl; - } -} -} - static int get_width(int x, float gw, int max_channels, int divisor = 8) { auto channel = int(ceil((x * gw) / divisor)) * divisor; return channel >= max_channels ? 
max_channels : channel; From 99f946c38a908701c8372088acadf661621840dd Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:42:18 +0800 Subject: [PATCH 17/24] yolov8 pose --- yolov8/yolov8_pose.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yolov8/yolov8_pose.cpp b/yolov8/yolov8_pose.cpp index 5b848a6b..4ef16598 100644 --- a/yolov8/yolov8_pose.cpp +++ b/yolov8/yolov8_pose.cpp @@ -20,9 +20,9 @@ void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p IHostMemory* serialized_engine = nullptr; if (is_p == 6) { - serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + std::cout << "p6 is not support in gpu right nowe" << std::endl; } else if (is_p == 2) { - serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + std::cout << "p2 is not support in gpu right now" << std::endl; } else { serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } From 6369d79d2d3d701b5c19575897ad64c5d54306c7 Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:43:31 +0800 Subject: [PATCH 18/24] yolov8 pose --- yolov8/yolov8_pose.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yolov8/yolov8_pose.cpp b/yolov8/yolov8_pose.cpp index 4ef16598..12aa1b13 100644 --- a/yolov8/yolov8_pose.cpp +++ b/yolov8/yolov8_pose.cpp @@ -20,9 +20,9 @@ void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p IHostMemory* serialized_engine = nullptr; if (is_p == 6) { - std::cout << "p6 is not support in gpu right nowe" << std::endl; + std::cout << "p6 is not supported right now" << std::endl; } else if (is_p == 2) { - std::cout << "p2 is not support in gpu right now" << std::endl; + std::cout << "p2 is not supported right now" << std::endl; } else { serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name,
gd, gw, max_channels); } From 47fb20eeff68b356fec0c2e0ab7d4c5bcaf4c81c Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:45:41 +0800 Subject: [PATCH 19/24] yolov8 pose --- yolov8/yolov8_pose.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yolov8/yolov8_pose.cpp b/yolov8/yolov8_pose.cpp index 12aa1b13..2b354e46 100644 --- a/yolov8/yolov8_pose.cpp +++ b/yolov8/yolov8_pose.cpp @@ -180,7 +180,8 @@ int main(int argc, char** argv) { if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw, max_channels)) { std::cerr << "Arguments not right!" << std::endl; - std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6] // serialize model to plan file" + std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to " + "plan file" << std::endl; std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl; return -1; From c1eaf4b3982170a56e9c214dc88670738da9b313 Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:50:22 +0800 Subject: [PATCH 20/24] yolov8 pose --- yolov8/README.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/yolov8/README.md b/yolov8/README.md index 8a42fe18..fadf8ce2 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -108,11 +108,19 @@ sudo ./yolov8_cls -d yolov8n-cls.engine ../samples ### Pose Estimation ``` -# Build and serialize TensorRT engine -./yolov8_pose -s yolov8n-pose.wts yolov8n-pose.engine n - -# Run inference -./yolov8_pose -d yolov8n-pose.engine ../images c +cd {tensorrtx}/yolov8/ +// update "kNumClass = 1" in config.h +mkdir build +cd build +cp {ultralytics}/ultralytics/yolov8-pose.wts {tensorrtx}/yolov8/build +cmake .. 
+make +sudo ./yolov8_pose -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file +sudo ./yolov8_pose -d [.engine] [image folder] [c/g] // deserialize and run inference, the images in [image folder] will be processed. +// For example yolov8-pose +sudo ./yolov8_pose -s yolov8n.wts yolov8.engine n +sudo ./yolov8_pose -d yolov8n.engine ../images c //cpu postprocess +sudo ./yolov8_pose -d yolov8n.engine ../images g //gpu postprocess ``` From 6ed8ce182b5b392e4edd6b6e645d7ab628c80e7e Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:51:53 +0800 Subject: [PATCH 21/24] yolov8 pose --- yolov8/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yolov8/README.md b/yolov8/README.md index fadf8ce2..c014514a 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -118,9 +118,9 @@ make sudo ./yolov8_pose -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file sudo ./yolov8_pose -d [.engine] [image folder] [c/g] // deserialize and run inference, the images in [image folder] will be processed. 
// For example yolov8-pose -sudo ./yolov8_pose -s yolov8n.wts yolov8.engine n -sudo ./yolov8_pose -d yolov8n.engine ../images c //cpu postprocess -sudo ./yolov8_pose -d yolov8n.engine ../images g //gpu postprocess +sudo ./yolov8_pose -s yolov8n-pose.wts yolov8.engine n +sudo ./yolov8_pose -d yolov8n-pose.engine ../images c //cpu postprocess +sudo ./yolov8_pose -d yolov8n-pose.engine ../images g //gpu postprocess ``` From 4b447e8b6c74b356c45a435c0bdd48dc619e6f46 Mon Sep 17 00:00:00 2001 From: linds Date: Sun, 28 Apr 2024 08:52:33 +0800 Subject: [PATCH 22/24] yolov8 pose --- yolov8/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yolov8/README.md b/yolov8/README.md index c014514a..e4762e2f 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -118,7 +118,7 @@ make sudo ./yolov8_pose -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to plan file sudo ./yolov8_pose -d [.engine] [image folder] [c/g] // deserialize and run inference, the images in [image folder] will be processed. // For example yolov8-pose -sudo ./yolov8_pose -s yolov8n-pose.wts yolov8.engine n +sudo ./yolov8_pose -s yolov8n-pose.wts yolov8n-pose.engine n sudo ./yolov8_pose -d yolov8n-pose.engine ../images c //cpu postprocess sudo ./yolov8_pose -d yolov8n-pose.engine ../images g //gpu postprocess ``` From c6110dde5e728e0d5bf4b200dc7ced0f6280234e Mon Sep 17 00:00:00 2001 From: linds Date: Mon, 29 Apr 2024 10:12:32 +0800 Subject: [PATCH 23/24] yolov8_pose_trt.py --- yolov8/README.md | 7 +- yolov8/yolov8_pose_trt.py | 500 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 504 insertions(+), 3 deletions(-) create mode 100644 yolov8/yolov8_pose_trt.py diff --git a/yolov8/README.md b/yolov8/README.md index e4762e2f..0ceb1b5e 100644 --- a/yolov8/README.md +++ b/yolov8/README.md @@ -129,9 +129,10 @@ sudo ./yolov8_pose -d yolov8n-pose.engine ../images g //gpu postprocess ``` // install python-tensorrt, pycuda, etc. 
// ensure the yolov8n.engine and libmyplugins.so have been built -python yolov8_det.py # Detection -python yolov8_seg.py # Segmentation -python yolov8_cls.py # Classification +python yolov8_det_trt.py # Detection +python yolov8_seg_trt.py # Segmentation +python yolov8_cls_trt.py # Classification +python yolov8_pose_trt.py # Pose Estimation ``` # INT8 Quantization diff --git a/yolov8/yolov8_pose_trt.py b/yolov8/yolov8_pose_trt.py new file mode 100644 index 00000000..f56a61f2 --- /dev/null +++ b/yolov8/yolov8_pose_trt.py @@ -0,0 +1,500 @@ +""" +An example that uses TensorRT's Python api to make inferences. +""" +import ctypes +import os +import shutil +import random +import sys +import threading +import time +import cv2 +import numpy as np +import pycuda.autoinit # noqa: F401 +import pycuda.driver as cuda +import tensorrt as trt + + +CONF_THRESH = 0.5 +IOU_THRESHOLD = 0.4 + +keypoint_pairs = [ + (0, 1), (0, 2), (0, 5), (0, 6), (1, 2), + (1, 3), (2, 4), (5, 6), (5, 7), (5, 11), + (6, 8), (6, 12), (7, 9), (8, 10), (11, 12), + (11, 13), (12, 14), (13, 15), (14, 16) +] + + +def get_img_path_batches(batch_size, img_dir): + ret = [] + batch = [] + for root, dirs, files in os.walk(img_dir): + for name in files: + if len(batch) == batch_size: + ret.append(batch) + batch = [] + batch.append(os.path.join(root, name)) + if len(batch) > 0: + ret.append(batch) + return ret + + +def plot_one_box(x, img, color=None, label=None, line_thickness=None): + """ + description: Plots one bounding box on image img, + this function comes from YoLov8 project. 
+ param: + x: a box likes [x1,y1,x2,y2] + img: a opencv image object + color: color to draw rectangle, such as (0,255,0) + label: str + line_thickness: int + return: + no return + + """ + tl = ( + line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 + ) # line/font thickness + color = color or [random.randint(0, 255) for _ in range(3)] + c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) + cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) + if label: + tf = max(tl - 1, 1) # font thickness + t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] + c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 + cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled + cv2.putText( + img, + label, + (c1[0], c1[1] - 2), + 0, + tl / 3, + [225, 255, 255], + thickness=tf, + lineType=cv2.LINE_AA, + ) + + +class YoLov8TRT(object): + """ + description: A YOLOv8 class that warps TensorRT ops, preprocess and postprocess ops. + """ + + def __init__(self, engine_file_path): + # Create a Context on this device, + self.ctx = cuda.Device(0).make_context() + stream = cuda.Stream() + TRT_LOGGER = trt.Logger(trt.Logger.INFO) + runtime = trt.Runtime(TRT_LOGGER) + + # Deserialize the engine from file + with open(engine_file_path, "rb") as f: + engine = runtime.deserialize_cuda_engine(f.read()) + context = engine.create_execution_context() + + host_inputs = [] + cuda_inputs = [] + host_outputs = [] + cuda_outputs = [] + bindings = [] + + for binding in engine: + print('bingding:', binding, engine.get_binding_shape(binding)) + size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size + dtype = trt.nptype(engine.get_binding_dtype(binding)) + # Allocate host and device buffers + host_mem = cuda.pagelocked_empty(size, dtype) + cuda_mem = cuda.mem_alloc(host_mem.nbytes) + # Append the device buffer to device bindings. + bindings.append(int(cuda_mem)) + # Append to the appropriate list. 
+ if engine.binding_is_input(binding): + self.input_w = engine.get_binding_shape(binding)[-1] + self.input_h = engine.get_binding_shape(binding)[-2] + host_inputs.append(host_mem) + cuda_inputs.append(cuda_mem) + else: + host_outputs.append(host_mem) + cuda_outputs.append(cuda_mem) + + # Store + self.stream = stream + self.context = context + self.host_inputs = host_inputs + self.cuda_inputs = cuda_inputs + self.host_outputs = host_outputs + self.cuda_outputs = cuda_outputs + self.bindings = bindings + self.batch_size = engine.max_batch_size + self.det_output_size = 89001 + + def infer(self, raw_image_generator): + threading.Thread.__init__(self) + # Make self the active context, pushing it on top of the context stack. + self.ctx.push() + # Restore + stream = self.stream + context = self.context + host_inputs = self.host_inputs + cuda_inputs = self.cuda_inputs + host_outputs = self.host_outputs + cuda_outputs = self.cuda_outputs + bindings = self.bindings + # Do image preprocess + batch_image_raw = [] + batch_origin_h = [] + batch_origin_w = [] + batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w]) + for i, image_raw in enumerate(raw_image_generator): + input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw) + batch_image_raw.append(image_raw) + batch_origin_h.append(origin_h) + batch_origin_w.append(origin_w) + np.copyto(batch_input_image[i], + input_image) + batch_input_image = np.ascontiguousarray(batch_input_image) + + # Copy input image to host buffer + np.copyto(host_inputs[0], batch_input_image.ravel()) + start = time.time() + # Transfer input data to the GPU. + cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) + # Run inference. + context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle) + # Transfer predictions back from the GPU. 
+ cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) + # Synchronize the stream + stream.synchronize() + end = time.time() + # Remove any context from the top of the context stack, deactivating it. + self.ctx.pop() + # Here we use the first row of output in that batch_size = 1 + output = host_outputs[0] + # Do postprocess + for i in range(self.batch_size): + + result_boxes, result_scores, result_classid, keypoints = self.post_process( + output[i * (self.det_output_size): (i + 1) * (self.det_output_size)], + batch_origin_h[i], batch_origin_w[i] + ) + + # Draw rectangles and labels on the original image + for j in range(len(result_boxes)): + box = result_boxes[j] + plot_one_box( + box, + batch_image_raw[i], + label="{}:{:.2f}".format( + categories[int(result_classid[j])], result_scores[j] + ), + ) + + num_keypoints = len(keypoints[j]) // 3 + points = [] + for k in range(num_keypoints): + x = keypoints[j][k * 3] + y = keypoints[j][k * 3 + 1] + confidence = keypoints[j][k * 3 + 2] + if confidence > 0: + points.append((int(x), int(y))) + else: + points.append(None) + + # Draw skeleton lines between keypoints according to the keypoint pair indices + for pair in keypoint_pairs: + partA, partB = pair + if points[partA] and points[partB]: + cv2.line(batch_image_raw[i], points[partA], points[partB], (0, 255, 0), 2) + + return batch_image_raw, end - start + + def destroy(self): + # Remove any context from the top of the context stack, deactivating it. + self.ctx.pop() + + def get_raw_image(self, image_path_batch): + """ + description: Read an image from image path + """ + for img_path in image_path_batch: + yield cv2.imread(img_path) + + def get_raw_image_zeros(self, image_path_batch=None): + """ + description: Ready data for warmup + """ + for _ in range(self.batch_size): + yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8) + + def preprocess_image(self, raw_bgr_image): + """ + description: Convert BGR image to RGB, + resize and pad it to target size, normalize to [0,1], + transform to NCHW format.
+ param: + input_image_path: str, image path + return: + image: the processed image + image_raw: the original image + h: original height + w: original width + """ + image_raw = raw_bgr_image + h, w, c = image_raw.shape + image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB) + # Calculate width and height and paddings + r_w = self.input_w / w + r_h = self.input_h / h + if r_h > r_w: + tw = self.input_w + th = int(r_w * h) + tx1 = tx2 = 0 + ty1 = int((self.input_h - th) / 2) + ty2 = self.input_h - th - ty1 + else: + tw = int(r_h * w) + th = self.input_h + tx1 = int((self.input_w - tw) / 2) + tx2 = self.input_w - tw - tx1 + ty1 = ty2 = 0 + # Resize the image with long side while maintaining ratio + image = cv2.resize(image, (tw, th)) + # Pad the short side with (128,128,128) + image = cv2.copyMakeBorder( + image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128) + ) + image = image.astype(np.float32) + # Normalize to [0,1] + image /= 255.0 + # HWC to CHW format: + image = np.transpose(image, [2, 0, 1]) + # CHW to NCHW format + image = np.expand_dims(image, axis=0) + # Convert the image to row-major order, also known as "C order": + image = np.ascontiguousarray(image) + return image, image_raw, h, w + + def xywh2xyxy_with_keypoints(self, origin_h, origin_w, boxes, keypoints): + + n = len(boxes) + box_array = np.zeros_like(boxes) + keypoint_array = np.zeros_like(keypoints) + r_w = self.input_w / origin_w + r_h = self.input_h / origin_h + for i in range(n): + if r_h > r_w: + box = boxes[i] + lmk = keypoints[i] + box_array[i, 0] = box[0] / r_w + box_array[i, 2] = box[2] / r_w + box_array[i, 1] = (box[1] - (self.input_h - r_w * origin_h) / 2) / r_w + box_array[i, 3] = (box[3] - (self.input_h - r_w * origin_h) / 2) / r_w + + for j in range(0, len(lmk), 3): + keypoint_array[i, j] = lmk[j] / r_w + keypoint_array[i, j + 1] = (lmk[j + 1] - (self.input_h - r_w * origin_h) / 2) / r_w + keypoint_array[i, j + 2] = lmk[j + 2] + else: + + box = boxes[i] + lmk = keypoints[i]
+ + box_array[i, 0] = (box[0] - (self.input_w - r_h * origin_w) / 2) / r_h + box_array[i, 2] = (box[2] - (self.input_w - r_h * origin_w) / 2) / r_h + box_array[i, 1] = box[1] / r_h + box_array[i, 3] = box[3] / r_h + + for j in range(0, len(lmk), 3): + keypoint_array[i, j] = (lmk[j] - (self.input_w - r_h * origin_w) / 2) / r_h + keypoint_array[i, j + 1] = lmk[j + 1] / r_h + keypoint_array[i, j + 2] = lmk[j + 2] + + return box_array, keypoint_array + + def post_process(self, output, origin_h, origin_w): + """ + description: Post-process the prediction to include pose keypoints + param: + output: A numpy array like [num_boxes, cx, cy, w, h, conf, + cls_id, px1, py1, pconf1,...px17, py17, pconf17] where p denotes pose keypoint + origin_h: Height of original image + origin_w: Width of original image + return: + result_boxes: Final boxes, a numpy array, each row is a box [x1, y1, x2, y2] + result_scores: Final scores, a numpy array, each element is the score corresponding to box + result_classid: Final classID, a numpy array, each element is the classid corresponding to box + result_keypoints: Final keypoints, a list of numpy arrays, + each element represents keypoints for a box, shaped as (#keypoints, 3) + """ + # Number of values per detection: 38 base values + 17 keypoints * 3 values each + num_values_per_detection = 38 + 17 * 3 + # Get the number of boxes detected + num = int(output[0]) + # Reshape to a two-dimensional ndarray with the full detection shape + pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :] + + # Perform non-maximum suppression to filter the detections + boxes = self.non_max_suppression( + pred[:, :num_values_per_detection], origin_h, origin_w, + conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD) + + # Extract the bounding boxes, confidence scores, and class IDs + result_boxes = boxes[:, :4] if len(boxes) else np.array([]) + result_scores = boxes[:, 4] if len(boxes) else np.array([]) + result_classid = boxes[:, 5] if len(boxes) 
else np.array([]) + result_keypoints = boxes[:, -51:] if len(boxes) else np.array([]) + + # Return the post-processed results including keypoints + return result_boxes, result_scores, result_classid, result_keypoints + + def bbox_iou(self, box1, box2, x1y1x2y2=True): + """ + description: compute the IoU of two bounding boxes + param: + box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) + box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) + x1y1x2y2: select the coordinate format + return: + iou: computed iou + """ + if not x1y1x2y2: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + else: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + + # Get the coordinates of the intersection rectangle + inter_rect_x1 = np.maximum(b1_x1, b2_x1) + inter_rect_y1 = np.maximum(b1_y1, b2_y1) + inter_rect_x2 = np.minimum(b1_x2, b2_x2) + inter_rect_y2 = np.minimum(b1_y2, b2_y2) + # Intersection area + inter_area = np.clip( + inter_rect_x2 - inter_rect_x1 + 1, 0, None) * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None) + # Union Area + b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) + b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) + + iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) + + return iou + + def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4): + """ + description: Removes detections with lower object confidence score than 'conf_thres' and performs + Non-Maximum Suppression to further filter detections. 
+ param: + prediction: detections, (x1, y1, x2, y2, conf, cls_id) + origin_h: original image height + origin_w: original image width + conf_thres: a confidence threshold to filter detections + nms_thres: a iou threshold to filter detections + return: + boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id) + """ + # Get the boxes that score > CONF_THRESH + boxes = prediction[prediction[:, 4] >= conf_thres] + # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2] + res_array = np.copy(boxes) + box_pred_deep_copy = np.copy(boxes[:, :4]) + keypoints_pred_deep_copy = np.copy(boxes[:, -51:]) + res_box, res_keypoints = self.xywh2xyxy_with_keypoints( + origin_h, origin_w, box_pred_deep_copy, keypoints_pred_deep_copy) + res_array[:, :4] = res_box + res_array[:, -51:] = res_keypoints + # clip the coordinates + res_array[:, 0] = np.clip(res_array[:, 0], 0, origin_w - 1) + res_array[:, 2] = np.clip(res_array[:, 2], 0, origin_w - 1) + res_array[:, 1] = np.clip(res_array[:, 1], 0, origin_h - 1) + res_array[:, 3] = np.clip(res_array[:, 3], 0, origin_h - 1) + # Object confidence + confs = res_array[:, 4] + # Sort by the confs + res_array = res_array[np.argsort(-confs)] + # Perform non-maximum suppression + keep_res_array = [] + while res_array.shape[0]: + large_overlap = self.bbox_iou(np.expand_dims(res_array[0, :4], 0), res_array[:, :4]) > nms_thres + label_match = res_array[0, 5] == res_array[:, 5] + invalid = large_overlap & label_match + keep_res_array.append(res_array[0]) + res_array = res_array[~invalid] + + res_array = np.stack(keep_res_array, 0) if len(keep_res_array) else np.array([]) + return res_array + + +class inferThread(threading.Thread): + def __init__(self, yolov8_wrapper, image_path_batch): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + self.image_path_batch = image_path_batch + + def run(self): + batch_image_raw, use_time = 
self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch)) + for i, img_path in enumerate(self.image_path_batch): + parent, filename = os.path.split(img_path) + save_name = os.path.join('output', filename) + # Save image + + cv2.imwrite(save_name, batch_image_raw[i]) + print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000)) + + +class warmUpThread(threading.Thread): + def __init__(self, yolov8_wrapper): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros()) + print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000)) + + +if __name__ == "__main__": + # load custom plugin and engine + PLUGIN_LIBRARY = "./build/libmyplugins.so" + engine_file_path = "yolov8n-pose.engine" + + if len(sys.argv) > 1: + engine_file_path = sys.argv[1] + if len(sys.argv) > 2: + PLUGIN_LIBRARY = sys.argv[2] + + ctypes.CDLL(PLUGIN_LIBRARY) + + # load coco labels + + categories = ["person"] + + if os.path.exists('output/'): + shutil.rmtree('output/') + os.makedirs('output/') + # a YoLov8TRT instance + yolov8_wrapper = YoLov8TRT(engine_file_path) + try: + print('batch size is', yolov8_wrapper.batch_size) + + image_dir = "samples/" + image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir) + + for i in range(10): + # create a new thread to do warm_up + thread1 = warmUpThread(yolov8_wrapper) + thread1.start() + thread1.join() + for batch in image_path_batches: + # create a new thread to do inference + thread1 = inferThread(yolov8_wrapper, batch) + thread1.start() + thread1.join() + finally: + # destroy the instance + yolov8_wrapper.destroy() From 5172d155ffcd029d02046fb99e914f12a36700ba Mon Sep 17 00:00:00 2001 From: linds Date: Mon, 29 Apr 2024 11:15:51 +0800 Subject: [PATCH 24/24] yolov8 pose p6 --- yolov8/include/model.h | 4 + 
yolov8/src/model.cpp | 333 ++++++++++++++++++++++++++++++++++++++++- yolov8/yolov8_pose.cpp | 2 +- 3 files changed, 335 insertions(+), 4 deletions(-) diff --git a/yolov8/include/model.h b/yolov8/include/model.h index 6546aa54..82586da1 100644 --- a/yolov8/include/model.h +++ b/yolov8/include/model.h @@ -25,3 +25,7 @@ nvinfer1::IHostMemory* buildEngineYolov8Seg(nvinfer1::IBuilder* builder, nvinfer nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, int& max_channels); + +nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, + int& max_channels); diff --git a/yolov8/src/model.cpp b/yolov8/src/model.cpp index 4cc4088e..a5f7e8e5 100644 --- a/yolov8/src/model.cpp +++ b/yolov8/src/model.cpp @@ -1448,9 +1448,6 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe 1, 0, "model.22.dfl.conv.weight"); // det0 - std::cout << "conv15->getOutput(0)->getDimensions().d[0] : " << conv15->getOutput(0)->getDimensions().d[0] - << " (kInputH / strides[0]) * (kInputW / strides[0]) : " - << (kInputH / strides[0]) * (kInputW / strides[0]) << std::endl; auto shuffle_conv15 = cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose"); @@ -1530,3 +1527,333 @@ nvinfer1::IHostMemory* buildEngineYolov8Pose(nvinfer1::IBuilder* builder, nvinfe } return serialized_model; } + +nvinfer1::IHostMemory* buildEngineYolov8PoseP6(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, + int& max_channels) { + std::map weightMap = loadWeights(wts_path); + nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); + 
/******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor* data = network->addInput(kInputTensorName, dt, nvinfer1::Dims3{3, kInputH, kInputW}); + assert(data); + /******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer* conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer* conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + // 11233 + nvinfer1::IElementWiseLayer* conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer* conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + // 22466 + nvinfer1::IElementWiseLayer* conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer* conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + // 22466 + nvinfer1::IElementWiseLayer* conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + + nvinfer1::IElementWiseLayer* conv7 
= + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(768, gw, max_channels), 3, 2, 1, "model.7"); + nvinfer1::IElementWiseLayer* conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(768, gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + + nvinfer1::IElementWiseLayer* conv9 = + convBnSiLU(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.9"); + nvinfer1::IElementWiseLayer* conv10 = + C2F(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.10"); + + nvinfer1::IElementWiseLayer* conv11 = + SPPF(network, weightMap, *conv10->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.11"); + + /******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + // Head + float scale[] = {1.0, 2.0, 2.0}; // scale used for upsampling + + // P5 + nvinfer1::IResizeLayer* upsample12 = network->addResize(*conv11->getOutput(0)); + upsample12->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + upsample12->setScales(scale, 3); + nvinfer1::ITensor* concat13_inputs[] = {upsample12->getOutput(0), conv8->getOutput(0)}; + nvinfer1::IConcatenationLayer* concat13 = network->addConcatenation(concat13_inputs, 2); + nvinfer1::IElementWiseLayer* conv14 = + C2(network, weightMap, *concat13->getOutput(0), get_width(768, gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.14"); + + // P4 + nvinfer1::IResizeLayer* upsample15 = network->addResize(*conv14->getOutput(0)); + upsample15->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + upsample15->setScales(scale, 3); + 
nvinfer1::ITensor* concat16_inputs[] = {upsample15->getOutput(0), conv6->getOutput(0)}; + nvinfer1::IConcatenationLayer* concat16 = network->addConcatenation(concat16_inputs, 2); + nvinfer1::IElementWiseLayer* conv17 = + C2(network, weightMap, *concat16->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.17"); + + // P3 + nvinfer1::IResizeLayer* upsample18 = network->addResize(*conv17->getOutput(0)); + upsample18->setResizeMode(nvinfer1::ResizeMode::kNEAREST); + upsample18->setScales(scale, 3); + nvinfer1::ITensor* concat19_inputs[] = {upsample18->getOutput(0), conv4->getOutput(0)}; + nvinfer1::IConcatenationLayer* concat19 = network->addConcatenation(concat19_inputs, 2); + nvinfer1::IElementWiseLayer* conv20 = + C2(network, weightMap, *concat19->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.20"); + + // Additional layers for P4, P5, P6 + // P4/16-medium + nvinfer1::IElementWiseLayer* conv21 = convBnSiLU(network, weightMap, *conv20->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.21"); + nvinfer1::ITensor* concat22_inputs[] = {conv21->getOutput(0), conv17->getOutput(0)}; + nvinfer1::IConcatenationLayer* concat22 = network->addConcatenation(concat22_inputs, 2); + nvinfer1::IElementWiseLayer* conv23 = + C2(network, weightMap, *concat22->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.23"); + + // P5/32-large + nvinfer1::IElementWiseLayer* conv24 = convBnSiLU(network, weightMap, *conv23->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.24"); + nvinfer1::ITensor* concat25_inputs[] = {conv24->getOutput(0), conv14->getOutput(0)}; + nvinfer1::IConcatenationLayer* concat25 = network->addConcatenation(concat25_inputs, 2); + nvinfer1::IElementWiseLayer* conv26 = + C2(network, weightMap, *concat25->getOutput(0), get_width(768, 
gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.26"); + + // P6/64-xlarge + nvinfer1::IElementWiseLayer* conv27 = convBnSiLU(network, weightMap, *conv26->getOutput(0), + get_width(768, gw, max_channels), 3, 2, 1, "model.27"); + nvinfer1::ITensor* concat28_inputs[] = {conv27->getOutput(0), conv11->getOutput(0)}; + nvinfer1::IConcatenationLayer* concat28 = network->addConcatenation(concat28_inputs, 2); + nvinfer1::IElementWiseLayer* conv29 = + C2(network, weightMap, *concat28->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.29"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ + int base_in_channel = (gw == 1.25) ? 80 : 64; + int base_out_channel = (gw == 0.25) ? 
std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); + + // output0 + nvinfer1::IElementWiseLayer* conv30_cv2_0_0 = + convBnSiLU(network, weightMap, *conv20->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.0"); + nvinfer1::IElementWiseLayer* conv30_cv2_0_1 = + convBnSiLU(network, weightMap, *conv30_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.1"); + nvinfer1::IConvolutionLayer* conv30_cv2_0_2 = + network->addConvolutionNd(*conv30_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.0.2.weight"], weightMap["model.30.cv2.0.2.bias"]); + conv30_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + + conv30_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer* conv30_cv3_0_0 = + convBnSiLU(network, weightMap, *conv20->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.0"); + + nvinfer1::IElementWiseLayer* conv30_cv3_0_1 = convBnSiLU(network, weightMap, *conv30_cv3_0_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.0.1"); + nvinfer1::IConvolutionLayer* conv30_cv3_0_2 = + network->addConvolutionNd(*conv30_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.0.2.weight"], weightMap["model.30.cv3.0.2.bias"]); + conv30_cv3_0_2->setStride(nvinfer1::DimsHW{1, 1}); + conv30_cv3_0_2->setPadding(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor30_0[] = {conv30_cv2_0_2->getOutput(0), conv30_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat30_0 = network->addConcatenation(inputTensor30_0, 2); + + // output1 + nvinfer1::IElementWiseLayer* conv30_cv2_1_0 = + convBnSiLU(network, weightMap, *conv23->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.0"); + nvinfer1::IElementWiseLayer* conv30_cv2_1_1 = + convBnSiLU(network, weightMap, *conv30_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.1"); + nvinfer1::IConvolutionLayer* conv30_cv2_1_2 = + network->addConvolutionNd(*conv30_cv2_1_1->getOutput(0), 64, 
nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.1.2.weight"], weightMap["model.30.cv2.1.2.bias"]); + conv30_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv30_cv3_1_0 = + convBnSiLU(network, weightMap, *conv23->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.1.0"); + nvinfer1::IElementWiseLayer* conv30_cv3_1_1 = convBnSiLU(network, weightMap, *conv30_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.1.1"); + nvinfer1::IConvolutionLayer* conv30_cv3_1_2 = + network->addConvolutionNd(*conv30_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.1.2.weight"], weightMap["model.30.cv3.1.2.bias"]); + conv30_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor30_1[] = {conv30_cv2_1_2->getOutput(0), conv30_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat30_1 = network->addConcatenation(inputTensor30_1, 2); + + // output2 + nvinfer1::IElementWiseLayer* conv30_cv2_2_0 = + convBnSiLU(network, weightMap, *conv26->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.0"); + nvinfer1::IElementWiseLayer* conv30_cv2_2_1 = + convBnSiLU(network, weightMap, *conv30_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.1"); + nvinfer1::IConvolutionLayer* conv30_cv2_2_2 = + network->addConvolution(*conv30_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.2.2.weight"], weightMap["model.30.cv2.2.2.bias"]); + conv30_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv30_cv3_2_0 = + convBnSiLU(network, weightMap, *conv26->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.2.0"); + nvinfer1::IElementWiseLayer* conv30_cv3_2_1 = convBnSiLU(network, weightMap, *conv30_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, 
"model.30.cv3.2.1"); + nvinfer1::IConvolutionLayer* conv30_cv3_2_2 = + network->addConvolution(*conv30_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.2.2.weight"], weightMap["model.30.cv3.2.2.bias"]); + conv30_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor30_2[] = {conv30_cv2_2_2->getOutput(0), conv30_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat30_2 = network->addConcatenation(inputTensor30_2, 2); + + // output3 + nvinfer1::IElementWiseLayer* conv30_cv2_3_0 = + convBnSiLU(network, weightMap, *conv29->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.0"); + nvinfer1::IElementWiseLayer* conv30_cv2_3_1 = + convBnSiLU(network, weightMap, *conv30_cv2_3_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.1"); + nvinfer1::IConvolutionLayer* conv30_cv2_3_2 = + network->addConvolution(*conv30_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.3.2.weight"], weightMap["model.30.cv2.3.2.bias"]); + conv30_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer* conv30_cv3_3_0 = + convBnSiLU(network, weightMap, *conv29->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.3.0"); + nvinfer1::IElementWiseLayer* conv30_cv3_3_1 = convBnSiLU(network, weightMap, *conv30_cv3_3_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.3.1"); + nvinfer1::IConvolutionLayer* conv30_cv3_3_2 = + network->addConvolution(*conv30_cv3_3_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.3.2.weight"], weightMap["model.30.cv3.3.2.bias"]); + conv30_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor* inputTensor30_3[] = {conv30_cv2_3_2->getOutput(0), conv30_cv3_3_2->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat30_3 = 
network->addConcatenation(inputTensor30_3, 2); + + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer* conv_layers[] = {conv3, conv5, conv7, conv9}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); + int stridesLength = sizeof(strides) / sizeof(int); + + // P3 processing steps (remains unchanged) + nvinfer1::IShuffleLayer* shuffle30_0 = + network->addShuffle(*cat30_0->getOutput(0)); // Reusing the previous cat30_0 as P3 concatenation layer + shuffle30_0->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer* split30_0_0 = network->addSlice( + *shuffle30_0->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_0_1 = network->addSlice( + *shuffle30_0->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_0 = + DFL(network, weightMap, *split30_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.30.dfl.conv.weight"); + + // det0 + auto shuffle_conv20 = cv4_conv_combined(network, weightMap, *conv20->getOutput(0), "model.30.cv4.0", + (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose"); + nvinfer1::ITensor* inputTensor30_dfl_0[] = {dfl30_0->getOutput(0), split30_0_1->getOutput(0), + shuffle_conv20->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat30_dfl_0 = network->addConcatenation(inputTensor30_dfl_0, 2); + + // P4 processing 
steps (remains unchanged) + nvinfer1::IShuffleLayer* shuffle30_1 = + network->addShuffle(*cat30_1->getOutput(0)); // Reusing the previous cat30_1 as P4 concatenation layer + shuffle30_1->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer* split30_1_0 = network->addSlice( + *shuffle30_1->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_1_1 = network->addSlice( + *shuffle30_1->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_1 = + DFL(network, weightMap, *split30_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.30.dfl.conv.weight"); + + // det1 + auto shuffle_conv23 = cv4_conv_combined(network, weightMap, *conv23->getOutput(0), "model.30.cv4.1", + (kInputH / strides[1]) * (kInputW / strides[1]), gw, "pose"); + nvinfer1::ITensor* inputTensor30_dfl_1[] = {dfl30_1->getOutput(0), split30_1_1->getOutput(0), + shuffle_conv23->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat30_dfl_1 = network->addConcatenation(inputTensor30_dfl_1, 2); + + // P5 processing steps (remains unchanged) + nvinfer1::IShuffleLayer* shuffle30_2 = + network->addShuffle(*cat30_2->getOutput(0)); // Reusing the previous cat30_2 as P5 concatenation layer + shuffle30_2->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer* split30_2_0 = network->addSlice( + *shuffle30_2->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[2]) * (kInputW / strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_2_1 = network->addSlice( + *shuffle30_2->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[2]) * (kInputW / 
strides[2])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_2 = + DFL(network, weightMap, *split30_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.30.dfl.conv.weight"); + + // det2 + auto shuffle_conv26 = cv4_conv_combined(network, weightMap, *conv26->getOutput(0), "model.30.cv4.2", + (kInputH / strides[2]) * (kInputW / strides[2]), gw, "pose"); + nvinfer1::ITensor* inputTensor30_dfl_2[] = {dfl30_2->getOutput(0), split30_2_1->getOutput(0), + shuffle_conv26->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat30_dfl_2 = network->addConcatenation(inputTensor30_dfl_2, 2); + + // P6 processing steps + nvinfer1::IShuffleLayer* shuffle30_3 = network->addShuffle(*cat30_3->getOutput(0)); + shuffle30_3->setReshapeDimensions(nvinfer1::Dims2{64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); + nvinfer1::ISliceLayer* split30_3_0 = network->addSlice( + *shuffle30_3->getOutput(0), nvinfer1::Dims2{0, 0}, + nvinfer1::Dims2{64, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); + nvinfer1::ISliceLayer* split30_3_1 = network->addSlice( + *shuffle30_3->getOutput(0), nvinfer1::Dims2{64, 0}, + nvinfer1::Dims2{kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, nvinfer1::Dims2{1, 1}); + nvinfer1::IShuffleLayer* dfl30_3 = + DFL(network, weightMap, *split30_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, + 1, 0, "model.30.dfl.conv.weight"); + + // det2 + auto shuffle_conv29 = cv4_conv_combined(network, weightMap, *conv29->getOutput(0), "model.30.cv4.3", + (kInputH / strides[3]) * (kInputW / strides[3]), gw, "pose"); + nvinfer1::ITensor* inputTensor30_dfl_3[] = {dfl30_3->getOutput(0), split30_3_1->getOutput(0), + shuffle_conv29->getOutput(0)}; + nvinfer1::IConcatenationLayer* cat30_dfl_3 = network->addConcatenation(inputTensor30_dfl_3, 2); + + nvinfer1::IPluginV2Layer* yolo = addYoLoLayer( + network, std::vector{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3}, + 
strides, stridesLength, false, false); + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + builder->setMaxBatchSize(kBatchSize); + config->setMaxWorkspaceSize(16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto* calibrator = + new Int8EntropyCalibrator2(1, kInputW, kInputH, "../coco_calib/", "int8calib.table", kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" << std::endl; + + delete network; + + for (auto& mem : weightMap) { + free((void*)(mem.second.values)); + } + return serialized_model; +} diff --git a/yolov8/yolov8_pose.cpp b/yolov8/yolov8_pose.cpp index 2b354e46..84d35aea 100644 --- a/yolov8/yolov8_pose.cpp +++ b/yolov8/yolov8_pose.cpp @@ -20,7 +20,7 @@ void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p IHostMemory* serialized_engine = nullptr; if (is_p == 6) { - std::cout << "p6 is not supported right nowe" << std::endl; + serialized_engine = buildEngineYolov8PoseP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); } else if (is_p == 2) { std::cout << "p2 is not supported right now" << std::endl; } else {