diff --git a/example/auto_compression/pytorch_yolo_series/README.md b/example/auto_compression/pytorch_yolo_series/README.md index 87841ddb6..d64b15142 100644 --- a/example/auto_compression/pytorch_yolo_series/README.md +++ b/example/auto_compression/pytorch_yolo_series/README.md @@ -19,47 +19,45 @@ | 模型 | 策略 | 输入尺寸 | mAPval
0.5:0.95 | 模型体积 | 预测时延FP32
|预测时延FP16
| 预测时延INT8
| 内存占用 | 显存占用 | 配置文件 | Inference模型 | |:--------------|:-------- |:--------: |:-----------------------:|:------:| :----------------: | :----------------: |:----------------: | :----------------: | :---------------: |:------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| -| YOLOv5s | Base模型 | 640*640 | 37.4 | 28.1MB | 6.87ms | 3.51ms | - | 1718MB | 705MB | - | [Model](https://paddle-slim-models.bj.bcebos.com/act/yolov5s.onnx) | -| YOLOv5s | 离线量化 | 640*640 | 36.0 | 7.4MB | - | - | 3.17ms | 736MB | 315MB | [config](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/post_training_quantization/pytorch_yolo_series) | - | -| YOLOv5s | ACT量化训练 | 640*640 | **36.9** | 7.4MB | - | - | **3.17ms** | 736MB | 315MB | [config](./configs/yolov5s_qat_dis.yaml) | [Infer Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov5s_quant.tar) | [ONNX Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov5s_quant_onnx.tar) | +| YOLOv5s | Base模型 | 640*640 | 37.5 | 28.1MB | 14.4ms | 6.9ms | - | 2637MB | 1143MB | - | [Model](https://paddle-slim-models.bj.bcebos.com/act/yolov5s.onnx) | +| YOLOv5s | 离线量化 | 640*640 | 36.7 | 7.5MB | - | - | 6.4ms | 2669MB | 1089MB | [config](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/post_training_quantization/pytorch_yolo_series) | - | +| YOLOv5s | ACT量化训练 | 640*640 | **36.8** | 7.5MB | - | - | **6.8ms** | 2593MB | 1083MB | [config](./configs/yolov5s_qat_dis.yaml) | [Infer Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov5s_quant.tar) | [ONNX Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov5s_quant_onnx.tar) | | | | | | | | | | | -| YOLOv6s | Base模型 | 640*640 | 42.4 | 65.9MB | 9.18ms | 3.58ms | - | 1208MB | 555MB | - | 
[Model](https://paddle-slim-models.bj.bcebos.com/act/yolov6s.onnx) | -| YOLOv6s | KL离线量化 | 640*640 | 30.3 | 16.8MB | - | - | 2.81ms | 736MB | 315MB | [config](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/post_training_quantization/pytorch_yolo_series) | - | -| YOLOv6s | 量化蒸馏训练 | 640*640 | **41.3** | 16.8MB | - | - | **2.81ms** | 736MB | 315MB | [config](./configs/yolov6s_qat_dis.yaml) | [Infer Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov6s_quant.tar) | [ONNX Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov6s_quant_onnx.tar) | +| YOLOv6s | Base模型 | 640*640 | 42.5 | 65.9MB | 18.3ms | 7.1ms | - | 2660MB | 1183MB | - | [Model](https://paddle-slim-models.bj.bcebos.com/act/yolov6s.onnx) | +| YOLOv6s | KL离线量化 | 640*640 | 34.0 | 17MB | - | - | 4.9ms | 2570MB | 1085MB | [config](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/post_training_quantization/pytorch_yolo_series) | - | +| YOLOv6s | 量化蒸馏训练 | 640*640 | **41.3** | 17MB | - | - | **4.9ms** | 2532MB | 1085MB | [config](./configs/yolov6s_qat_dis.yaml) | [Infer Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov6s_quant.tar) | [ONNX Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov6s_quant_onnx.tar) | | | | | | | | | | | -| YOLOv6s_v2 | Base模型 | 640*640 | 43.4 | 67.4MB | 9.18ms | 3.58ms | - | 1208MB | 555MB | - | [Model](https://github.com/meituan/YOLOv6/releases/download/0.2.0/yolov6s.onnx) | -| YOLOv6s_v2 | 量化蒸馏训练 | 640*640 | **43.0** | 16.8MB | - | - | **2.81ms** | 736MB | 315MB | [config](./configs/yolov6s_v2_qat_dis.yaml) | [Infer Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov6s_v2_0_quant.tar) | [ONNX Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov6s_v2_0_quant_onnx.tar) | -| | | | | | | | | | -| YOLOv7 | Base模型 | 640*640 | 51.1 | 141MB | 26.76ms | 8.16ms | - | 1722MB | 917MB | - | [Model](https://paddle-slim-models.bj.bcebos.com/act/yolov7.onnx) | -| YOLOv7 | 离线量化 | 640*640 | 50.2 | 36MB | - | - | 5.19ms | 
827MB | 363MB | [config](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/example/post_training_quantization/pytorch_yolo_series) | - | -| YOLOv7 | ACT量化训练 | 640*640 | **50.9** | 36MB | - | - | **5.19ms** | 827MB | 363MB | [config](./configs/yolov7_qat_dis.yaml) | [Infer Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov7_quant.tar) | [ONNX Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov7_quant_onnx.tar) | -| | | | | | | | | | -| YOLOv7-Tiny | Base模型 | 640*640 | 37.3 | 24MB | 5.06ms | 2.32ms | - | 738MB | 349MB | - | [Model](https://paddle-slim-models.bj.bcebos.com/act/yolov7-tiny.onnx) | -| YOLOv7-Tiny | 离线量化 | 640*640 | 35.8 | 6.1MB | - | - | 1.68ms | 729MB | 315MB | - | - | -| YOLOv7-Tiny | ACT量化训练 | 640*640 | **37.0** | 6.1MB | - | - | **1.68ms** | 729MB | 315MB | [config](./configs/yolov7_tiny_qat_dis.yaml) | [Infer Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov7_tiny_quant.tar) | [ONNX Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov7_tiny_quant_onnx.tar) | +| YOLOv7-Tiny | Base模型 | 640*640 | 37.2 | 24MB | 13.2ms | 8.1ms | - | 2466MB | 1133MB | - | [Model](https://paddle-slim-models.bj.bcebos.com/act/yolov7-tiny.onnx) | +| YOLOv7-Tiny | ACT量化训练 | 640*640 | **36.8** | 6.2MB | - | - | **6.6ms** | 2547MB | 1085MB | [config](./configs/yolov7_tiny_qat_dis.yaml) | [Infer Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov7_tiny_quant.tar) | [ONNX Model](https://bj.bcebos.com/v1/paddle-slim-models/act/yolov7_tiny_quant_onnx.tar) | 说明: -- mAP的指标均在COCO val2017数据集中评测得到。 -- YOLOv7模型在Tesla T4的GPU环境下开启TensorRT 8.4.1,batch_size=1, 测试脚本是[cpp_infer](./cpp_infer)。 +- mAP的指标均在COCO val2017数据集中评测得到,IoU=0.5:0.95。 +- 测速环境:Tesla T4,TensorRT 8.6.1,CUDA 11.2,batch_size=1,cudnn 8.2.0 Intel(R)Xeon(R)Gold 6271C CPU , 测试脚本是[paddle_inference_eval.py](./paddle_inference_eval.py)。 ## 3. 
自动压缩流程 ### 3.1 准备环境 -- PaddlePaddle >= 2.4版本 (可从[Paddle官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html)根据相应环境的安装指令进行安装) -- PaddleSlim >= 2.4版本 +- PaddlePaddle 2.6 (可从[Paddle官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html)根据相应环境的安装指令进行安装) +- PaddleSlim 2.6版本 (1)安装paddlepaddle ```shell # CPU -pip install paddlepaddle==2.4.1 +python -m pip install paddlepaddle==2.6.0 -i https://pypi.tuna.tsinghua.edu.cn/simple # GPU 以Ubuntu、CUDA 11.2为例 -python -m pip install paddlepaddle-gpu==2.4.1.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html +python -m pip install paddlepaddle-gpu==2.6.0.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html ``` -(2)安装paddleslim>=2.4 +(2)安装paddleslim 2.6 ```shell pip install paddleslim ``` + (3) 安装paddledet +```shell +pip install paddledet +``` +注:安装PaddleDet的目的是为了直接使用PaddleDetection中的Dataloader组件。 + #### 版本对齐 @@ -135,6 +133,11 @@ pip install paddleslim ``` **注意**:目前ACT支持**不带NMS**模型,使用如上命令导出即可。也可以直接下载我们已经准备好的[yolov7.onnx](https://paddle-slim-models.bj.bcebos.com/act/yolov7-tiny.onnx)。 + 将ONNX模型转换为Paddle模型,举例: + 使用命令行将YOLOv6s.onnx转换为paddle模型 + ```shell + x2paddle --framework=onnx --model=yolov6s.onnx --save_dir=yolov6_model + ``` ### 3.4 自动压缩并产出模型 @@ -145,13 +148,14 @@ pip install paddleslim - 单卡训练: ``` export CUDA_VISIBLE_DEVICES=0 -python run.py --config_path=./configs/yolov7_tiny_qat_dis.yaml --save_dir='./output/' +python run.py --config_path=./configs/yolov7_tiny_qat_dis.yaml --save_dir='./yolov7-quantAware/' ``` - 多卡训练: ``` -CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --log_dir=log --gpus 0,1,2,3 run.py \ - --config_path=./configs/yolov7_tiny_qat_dis.yaml --save_dir='./output/' +export CUDA_VISIBLE_DEVICES=0,1,2,3 +python -m paddle.distributed.launch --log_dir=log --gpus 0,1,2,3 run.py \ + --config_path=./configs/yolov6s_qat_dis.yaml 
--save_dir='./yolov6s_quantaware/' ``` @@ -177,18 +181,31 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --log_dir=log - | model_path | inference 模型文件所在目录,该目录下需要有文件 model.pdmodel 和 model.pdiparams 两个文件 | | dataset_dir | eval时数据验证集路径, 默认`dataset/coco` | | image_file | 如果只测试单张图片效果,直接根据image_file指定图片路径 | +| val_image_dir | COCO数据集中验证图像的目录名,默认为val2017 | +| val_anno_path | 指定COCO数据集的注释(annotation)文件路径,这是包含验证集标注信息的JSON文件,默认为annotations/instances_val2017.json | +| benchmark | 指定是否运行性能基准测试。如果设置为True,程序将会进行性能测试 | | device | 使用GPU或者CPU预测,可选CPU/GPU | -| use_trt | 是否使用 TesorRT 预测引擎 | +| use_trt | 是否使用 TensorRT 预测引擎 | | use_mkldnn | 是否启用```MKL-DNN```加速库,注意```use_mkldnn```与```use_gpu```同时为```True```时,将忽略```enable_mkldnn```,而使用```GPU```预测 | +| use_dynamic_shape | 是否使用动态形状(dynamic_shape)功能 | | cpu_threads | CPU预测时,使用CPU线程数量,默认10 | | precision | 预测精度,包括`fp32/fp16/int8` | +| arch | 指定所使用的模型架构的名称,例如YOLOv5 | +| img_shape | 指定模型输入的图像尺寸 | +| batch_size | 指定模型输入的批处理大小 | - TensorRT Python部署: - -首先安装带有TensorRT的[Paddle安装包](https://www.paddlepaddle.org.cn/inference/v2.3/user_guides/download_lib.html#python)。 +Paddle-TensorRT Python部署: 然后使用[paddle_inference_eval.py](./paddle_inference_eval.py)进行部署: +- YOLOv5 +```shell +python paddle_inference_eval.py \ + --model_path=yolov5_model/inference_model \ + --reader_config=configs/yoloe_reader.yml \ + --use_trt=True \ + --precision=int8 +``` ```shell python paddle_inference_eval.py \ --model_path=output \ diff --git a/example/auto_compression/pytorch_yolo_series/cpp_infer/CMakeLists.txt b/example/auto_compression/pytorch_yolo_series/cpp_infer/CMakeLists.txt index d5307c657..d7234f6f4 100644 --- a/example/auto_compression/pytorch_yolo_series/cpp_infer/CMakeLists.txt +++ b/example/auto_compression/pytorch_yolo_series/cpp_infer/CMakeLists.txt @@ -1,43 +1,47 @@ cmake_minimum_required(VERSION 3.0) project(cpp_inference_demo CXX C) -option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." 
ON) -option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) -option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) -option(USE_TENSORRT "Compile demo with TensorRT." OFF) +option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) +option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) +option(WITH_STATIC_LIB + "Compile demo with static/shared library, default don't use static." OFF) +option(USE_TENSORRT "Compile demo with TensorRT." OFF) +option(WITH_SHARED_PHI "Compile demo with phi shared lib" ON) option(WITH_ROCM "Compile demo with rocm." OFF) -option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) +option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) option(WITH_ARM "Compile demo with ARM" OFF) option(WITH_MIPS "Compile demo with MIPS" OFF) -option(WITH_SW "Compile demo with SW" OFF) -option(WITH_XPU "Compile demow ith xpu" OFF) -option(WITH_NPU "Compile demow ith npu" OFF) +option(WITH_LOONGARCH "Compile demo with LOONGARCH" OFF) +option(WITH_SW "Compile demo with SW" OFF) +option(WITH_XPU "Compile demo with xpu" OFF) +option(WITH_NPU "Compile demo with npu" OFF) if(NOT WITH_STATIC_LIB) add_definitions("-DPADDLE_WITH_SHARED_LIB") else() - # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode. + # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode. # Set it to empty in static library mode to avoid compilation issues. 
add_definitions("/DPD_INFER_DECL=") endif() macro(safe_set_static_flag) - foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "/MD") - string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/MD") - endforeach(flag_var) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif() + endforeach() endmacro() if(NOT DEFINED PADDLE_LIB) - message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") + message( + FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") endif() if(NOT DEFINED DEMO_NAME) message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") endif() -include_directories("${PADDLE_LIB}/") +include_directories("${PADDLE_LIB}/paddle/include") set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include") @@ -56,19 +60,23 @@ link_directories("${PADDLE_LIB}/paddle/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") -if (WIN32) +if(WIN32) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) - if (MSVC_STATIC_CRT) - if (WITH_MKL) + if(MSVC_STATIC_CRT) + if(WITH_MKL) set(FLAG_OPENMP "/openmp") endif() - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") - set(CMAKE_CXX_FLAGS_RELEASE 
"${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") + set(CMAKE_C_FLAGS_RELEASE + "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS_DEBUG + "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS_RELEASE + "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") safe_set_static_flag() - if (WITH_STATIC_LIB) + if(WITH_STATIC_LIB) add_definitions(-DSTATIC_LIB) endif() endif() @@ -83,38 +91,50 @@ if(WITH_GPU) if(NOT WIN32) include_directories("/usr/local/cuda/include") if(CUDA_LIB STREQUAL "") - set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") + set(CUDA_LIB + "/usr/local/cuda/lib64/" + CACHE STRING "CUDA Library") endif() else() - include_directories("C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\include") + include_directories( + "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\include") if(CUDA_LIB STREQUAL "") - set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64") + set(CUDA_LIB + "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64" + ) endif() - endif(NOT WIN32) + endif() endif() -if (USE_TENSORRT AND WITH_GPU) - set(TENSORRT_ROOT "" CACHE STRING "The root directory of TensorRT library") +if(USE_TENSORRT AND WITH_GPU) + set(TENSORRT_ROOT + "" + CACHE STRING "The root directory of TensorRT library") if("${TENSORRT_ROOT}" STREQUAL "") - message(FATAL_ERROR "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH ") + message( + FATAL_ERROR + "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. 
Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH " + ) endif() set(TENSORRT_INCLUDE_DIR ${TENSORRT_ROOT}/include) set(TENSORRT_LIB_DIR ${TENSORRT_ROOT}/lib) file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS) - string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" + TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") if("${TENSORRT_MAJOR_VERSION}" STREQUAL "") - file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h TENSORRT_VERSION_FILE_CONTENTS) - string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") + file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h + TENSORRT_VERSION_FILE_CONTENTS) + string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" + TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") endif() if("${TENSORRT_MAJOR_VERSION}" STREQUAL "") message(SEND_ERROR "Failed to detect TensorRT version.") endif() string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1" - TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}") - message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " - "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ") + TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}") + message( + STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " + "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. 
") include_directories("${TENSORRT_INCLUDE_DIR}") link_directories("${TENSORRT_LIB_DIR}") endif() @@ -126,79 +146,117 @@ if(WITH_MKL) set(MATH_LIB ${MATH_LIB_PATH}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX} ${MATH_LIB_PATH}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX}) else() - set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} - ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(MATH_LIB + ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") if(EXISTS ${MKLDNN_PATH}) include_directories("${MKLDNN_PATH}/include") if(WIN32) - set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) - else(WIN32) - set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) - endif(WIN32) + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/dnnl.dll) + else() + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libdnnl.so.3) + endif() endif() -elseif((NOT WITH_MIPS) AND (NOT WITH_SW)) +elseif((NOT (WITH_MIPS OR WITH_LOONGARCH)) AND (NOT WITH_SW)) set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas") include_directories("${OPENBLAS_LIB_PATH}/include/openblas") if(WIN32) - set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(MATH_LIB + ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}) else() - set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(MATH_LIB + ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) endif() endif() if(WITH_STATIC_LIB) - set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS + ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX} + ) else() if(WIN32) - set(DEPS ${PADDLE_LIB}/paddle/lib/paddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS + ${PADDLE_LIB}/paddle/lib/paddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) else() - set(DEPS 
${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS + ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX} + ) endif() endif() -if (WITH_ONNXRUNTIME) +if(WITH_ONNXRUNTIME) if(WIN32) - set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx) + set(DEPS + ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib + paddle2onnx) elseif(APPLE) - set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx) + set(DEPS + ${DEPS} + ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib + paddle2onnx) else() - set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx) + set(DEPS + ${DEPS} + ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 + paddle2onnx) endif() endif() -if (NOT WIN32) +if(NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") - set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - glog gflags protobuf xxhash cryptopp + set(DEPS + ${DEPS} + ${MATH_LIB} + ${MKLDNN_LIB} + glog + gflags + protobuf + xxhash + cryptopp ${EXTERNAL_LIB}) + if(WITH_SHARED_PHI) + set(DEPS ${DEPS} ${PADDLE_LIB}/paddle/lib/libphi${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() else() - set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - glog gflags_static libprotobuf xxhash cryptopp-static ${EXTERNAL_LIB}) + set(DEPS + ${DEPS} + ${MATH_LIB} + ${MKLDNN_LIB} + glog + gflags_static + libprotobuf + xxhash + cryptopp-static + ${EXTERNAL_LIB}) set(DEPS ${DEPS} shlwapi.lib) -endif(NOT WIN32) +endif() if(WITH_GPU) if(NOT WIN32) - if (USE_TENSORRT) - set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}) - set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}) + if(USE_TENSORRT) + set(DEPS ${DEPS} + ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS + ${DEPS} + 
${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) else() if(USE_TENSORRT) - set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) - set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} + ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} + ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7) - set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} + ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_STATIC_LIBRARY_SUFFIX}) endif() endif() - set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) - set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) - set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX}) endif() endif() @@ -208,56 +266,97 @@ endif() if(WITH_XPU AND NOT WIN32) set(XPU_INSTALL_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}xpu") - set(DEPS ${DEPS} ${XPU_INSTALL_PATH}/lib/libxpuapi${CMAKE_SHARED_LIBRARY_SUFFIX}) - set(DEPS ${DEPS} ${XPU_INSTALL_PATH}/lib/libxpurt${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} + ${XPU_INSTALL_PATH}/lib/libxpuapi${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} + ${XPU_INSTALL_PATH}/lib/libxpurt${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() if(WITH_NPU AND NOT WIN32) - set(DEPS ${DEPS} ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libgraph${CMAKE_SHARED_LIBRARY_SUFFIX}) - set(DEPS ${DEPS} ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libge_runner${CMAKE_SHARED_LIBRARY_SUFFIX}) - set(DEPS ${DEPS} ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libascendcl${CMAKE_SHARED_LIBRARY_SUFFIX}) - set(DEPS 
${DEPS} ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libascendcl${CMAKE_SHARED_LIBRARY_SUFFIX}) - set(DEPS ${DEPS} ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libacl_op_compiler${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS + ${DEPS} + ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libgraph${CMAKE_SHARED_LIBRARY_SUFFIX} + ) + set(DEPS + ${DEPS} + ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libge_runner${CMAKE_SHARED_LIBRARY_SUFFIX} + ) + set(DEPS + ${DEPS} + ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libascendcl${CMAKE_SHARED_LIBRARY_SUFFIX} + ) + set(DEPS + ${DEPS} + ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libascendcl${CMAKE_SHARED_LIBRARY_SUFFIX} + ) + set(DEPS + ${DEPS} + ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64/libacl_op_compiler${CMAKE_SHARED_LIBRARY_SUFFIX} + ) endif() add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) target_link_libraries(${DEMO_NAME} ${DEPS}) if(WIN32) if(USE_TENSORRT) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX} - ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} - COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX} - ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} - ) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND + ${CMAKE_COMMAND} -E copy + ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX} + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + COMMAND + ${CMAKE_COMMAND} -E copy + ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX} + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}) if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_SHARED_LIBRARY_SUFFIX} - ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND + ${CMAKE_COMMAND} -E copy + 
${TENSORRT_LIB_DIR}/myelin64_1${CMAKE_SHARED_LIBRARY_SUFFIX} + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}) endif() endif() if(WITH_MKL) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release - COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll ${CMAKE_BINARY_DIR}/Release - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${CMAKE_BINARY_DIR}/Release - ) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll + ${CMAKE_BINARY_DIR}/Release + COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll + ${CMAKE_BINARY_DIR}/Release + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/dnnl.dll + ${CMAKE_BINARY_DIR}/Release) else() - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release - ) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll + ${CMAKE_BINARY_DIR}/Release) endif() if(WITH_ONNXRUNTIME) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll - ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll - ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} - ) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND + ${CMAKE_COMMAND} -E copy + ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + COMMAND + ${CMAKE_COMMAND} -E copy + ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}) endif() if(NOT WITH_STATIC_LIB) - add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy 
"${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} - ) + add_custom_command( + TARGET ${DEMO_NAME} + POST_BUILD + COMMAND + ${CMAKE_COMMAND} -E copy + "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}) endif() endif() diff --git a/example/auto_compression/pytorch_yolo_series/cpp_infer/compile.sh b/example/auto_compression/pytorch_yolo_series/cpp_infer/compile.sh index afff924b4..b03bd9c99 100644 --- a/example/auto_compression/pytorch_yolo_series/cpp_infer/compile.sh +++ b/example/auto_compression/pytorch_yolo_series/cpp_infer/compile.sh @@ -14,10 +14,14 @@ WITH_MKL=ON WITH_GPU=ON USE_TENSORRT=ON -LIB_DIR=/root/auto_compress/Paddle/build/paddle_inference_install_dir/ +# Paddle Inference预测库路径 +LIB_DIR=/work/Paddle/build/paddle_inference_install_dir +# CUDNN路径 CUDNN_LIB=/usr/lib/x86_64-linux-gnu/ +# CUDA路径 CUDA_LIB=/usr/local/cuda/lib64 -TENSORRT_ROOT=/root/auto_compress/trt/trt8.4/ +# TensorRT安装包路径,为TRT资源包解压后的绝对路径,其中包含lib和include文件夹 +TENSORRT_ROOT=/work/TensorRT-8.6.1.6 WITH_ROCM=OFF ROCM_LIB=/opt/rocm/lib diff --git a/example/auto_compression/pytorch_yolo_series/cpp_infer/trt_run.cc b/example/auto_compression/pytorch_yolo_series/cpp_infer/trt_run.cc index 101cf33c8..48a84e561 100644 --- a/example/auto_compression/pytorch_yolo_series/cpp_infer/trt_run.cc +++ b/example/auto_compression/pytorch_yolo_series/cpp_infer/trt_run.cc @@ -1,3 +1,17 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + #include #include #include @@ -5,27 +19,28 @@ #include #include -#include -#include "paddle/include/paddle_inference_api.h" -#include "paddle/include/experimental/phi/common/float16.h" +#include "paddle_inference_api.h" using paddle_infer::Config; -using paddle_infer::Predictor; using paddle_infer::CreatePredictor; using paddle_infer::PrecisionType; -using phi::dtype::float16; +using paddle_infer::Predictor; DEFINE_string(model_dir, "", "Directory of the inference model."); DEFINE_string(model_file, "", "Path of the inference model file."); DEFINE_string(params_file, "", "Path of the inference params file."); -DEFINE_string(arch, "YOLOv5", "Architectures name, can be: YOLOv5, YOLOv6, YOLOv7."); -DEFINE_string(run_mode, "trt_fp32", "run_mode which can be: trt_fp32, trt_fp16 and trt_int8"); +DEFINE_string( + run_mode, + "paddle_gpu", + "run_mode which can be: trt_fp32, trt_fp16 and trt_int8 and paddle_gpu"); DEFINE_int32(batch_size, 1, "Batch size."); DEFINE_int32(gpu_id, 0, "GPU card ID num."); DEFINE_int32(trt_min_subgraph_size, 3, "tensorrt min_subgraph_size"); DEFINE_int32(warmup, 50, "warmup"); DEFINE_int32(repeats, 1000, "repeats"); +DEFINE_bool(use_dynamic_shape, false, "use trt dynaminc shape."); +DEFINE_bool(use_calib, true, "use trt int8 calibration."); using Time = decltype(std::chrono::high_resolution_clock::now()); Time time() { return std::chrono::high_resolution_clock::now(); }; @@ -38,89 +53,89 @@ double time_diff(Time t1, Time t2) { std::shared_ptr InitPredictor() { Config config; - std::string model_path; if (FLAGS_model_dir != "") { config.SetModel(FLAGS_model_dir); - model_path = FLAGS_model_dir.substr(0, FLAGS_model_dir.find_last_of("/")); - } else { - config.SetModel(FLAGS_model_file, FLAGS_params_file); - model_path = FLAGS_model_file.substr(0, FLAGS_model_file.find_last_of("/")); } - // enable tune - std::cout << "model_path: " << 
model_path << std::endl; - config.EnableUseGpu(256, FLAGS_gpu_id); + if (FLAGS_model_dir == "") config.SetModel(FLAGS_model_file, FLAGS_params_file); + + config.EnableUseGpu(500, FLAGS_gpu_id); + if (FLAGS_run_mode == "trt_fp32") { - config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size, FLAGS_trt_min_subgraph_size, - PrecisionType::kFloat32, false, false); + config.EnableTensorRtEngine((1LL << 30) * FLAGS_batch_size, + FLAGS_batch_size, + FLAGS_trt_min_subgraph_size, + PrecisionType::kFloat32, + false, + false); } else if (FLAGS_run_mode == "trt_fp16") { - config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size, FLAGS_trt_min_subgraph_size, - PrecisionType::kHalf, false, false); + config.EnableTensorRtEngine((1LL << 30) * FLAGS_batch_size, + FLAGS_batch_size, + FLAGS_trt_min_subgraph_size, + PrecisionType::kHalf, + false, + false); } else if (FLAGS_run_mode == "trt_int8") { - config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size, FLAGS_trt_min_subgraph_size, - PrecisionType::kInt8, false, false); + config.EnableTensorRtEngine((1LL << 30) * FLAGS_batch_size, + FLAGS_batch_size, + FLAGS_trt_min_subgraph_size, + PrecisionType::kInt8, + false, + FLAGS_use_calib); + } + if (FLAGS_use_dynamic_shape) { + std::map> min_input_shape = { + {"image", {1, 3, 640, 640}}}; + std::map> max_input_shape = { + {"image", {4, 3, 640, 640}}}; + std::map> opt_input_shape = { + {"image", {2, 3, 640, 640}}}; + config.SetTRTDynamicShapeInfo( + min_input_shape, max_input_shape, opt_input_shape); } + // Open the memory optim. 
config.EnableMemoryOptim(); + config.SwitchIrDebug(true); config.SwitchIrOptim(true); return CreatePredictor(config); } -template -void run(Predictor *predictor, const std::vector &input, - const std::vector &input_shape, type* out_data, std::vector out_shape) { - - // prepare input - int input_num = std::accumulate(input_shape.begin(), input_shape.end(), 1, - std::multiplies()); - - auto input_names = predictor->GetInputNames(); - auto input_t = predictor->GetInputHandle(input_names[0]); - input_t->Reshape(input_shape); - input_t->CopyFromCpu(input.data()); - - for (int i = 0; i < FLAGS_warmup; ++i) - CHECK(predictor->Run()); +void run(Predictor *predictor, + const std::vector &input, + const std::vector &input_shape, + std::vector *out_data) { + int input_num = std::accumulate( + input_shape.begin(), input_shape.end(), 1, std::multiplies()); - auto st = time(); - for (int i = 0; i < FLAGS_repeats; ++i) { - auto input_names = predictor->GetInputNames(); - auto input_t = predictor->GetInputHandle(input_names[0]); + auto input_names = predictor->GetInputNames(); + auto output_names = predictor->GetOutputNames(); + auto input_t = predictor->GetInputHandle(input_names[0]); + input_t->Reshape(input_shape); + input_t->CopyFromCpu(input.data()); - input_t->Reshape(input_shape); - input_t->CopyFromCpu(input.data()); + for (int i = 0; i < FLAGS_warmup; ++i) CHECK(predictor->Run()); + auto st = time(); + for (int i = 0; i < FLAGS_repeats; ++i) { CHECK(predictor->Run()); - - auto output_names = predictor->GetOutputNames(); auto output_t = predictor->GetOutputHandle(output_names[0]); std::vector output_shape = output_t->shape(); - output_t->CopyToCpu(out_data); - + int out_num = std::accumulate( + output_shape.begin(), output_shape.end(), 1, std::multiplies()); + out_data->resize(out_num); + output_t->CopyToCpu(out_data->data()); } - - LOG(INFO) << "[" << FLAGS_run_mode << " bs-" << FLAGS_batch_size << " ] run avg time is " << time_diff(st, time()) / FLAGS_repeats + 
LOG(INFO) << "run avg time is " << time_diff(st, time()) / FLAGS_repeats << " ms"; } -int main(int argc, char *argv[]) -{ +int main(int argc, char *argv[]) { google::ParseCommandLineFlags(&argc, &argv, true); auto predictor = InitPredictor(); - - std::cout << "====== Use float instead of FP16 data ======" << std::endl; - std::vector input_data(FLAGS_batch_size * 3 * 640 * 640, float(1.0)); std::vector input_shape = {FLAGS_batch_size, 3, 640, 640}; + std::vector input_data(FLAGS_batch_size * 3 * 640 * 640); + for (size_t i = 0; i < input_data.size(); ++i) input_data[i] = i % 255 * 0.1; + std::vector out_data; + run(predictor.get(), input_data, input_shape, &out_data); - int out_box_shape = 25200; - if (FLAGS_arch == "YOLOv6"){ - out_box_shape = 8400; - } - float* out_data; - std::vector out_shape{ FLAGS_batch_size, 1, out_box_shape, 85}; - int out_data_size = FLAGS_batch_size * out_box_shape * 85; - - // Only use Pinned mem for D2H. - cudaHostAlloc((void**)&out_data, sizeof(float) * out_data_size, cudaHostAllocMapped); - - run(predictor.get(), input_data, input_shape, out_data, out_shape); return 0; -} \ No newline at end of file +}