Commit
* The v5-cls model supports TensorRT10
* The v5-cls model supports TensorRT10 Python API
* Add YOLOv5-cls readme
* Pre-commit and modify trtx download branch
* Pre-commit
* The v5 det model supports TensorRT10
* import pycuda.autoinit  # noqa: F401
* The v5 det model supports TensorRT10 Python API
* Modify readme
* Modify readme
* Modify readme
* Delete the link of nvinfer_plugin
Showing 9 changed files with 1,521 additions and 2 deletions.
@@ -0,0 +1,259 @@
#include "cuda_utils.h" | ||
#include "logging.h" | ||
#include "model.h" | ||
#include "postprocess.h" | ||
#include "preprocess.h" | ||
#include "utils.h" | ||
|
||
#include <chrono> | ||
#include <cmath> | ||
#include <iostream> | ||

using namespace nvinfer1;

static Logger gLogger;
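// kOutputSize: room for kMaxNumOutputBbox Detection structs flattened to floats,
// plus one leading element (presumably the detection count written by the YOLO
// layer, which batch_nms below then decodes).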
const static int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

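// Two CLI modes: "-s <wts> <engine> <variant>" serializes a .wts checkpoint into a
// TensorRT plan; "-d <engine> <image_dir>" runs detection on a directory of images.
// gd/gw are the depth and width multipliers that select the YOLOv5 variant
// (n/s/m/l/x); "c" accepts custom gd/gw values, and a trailing '6' picks the P6 models.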
bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, bool& is_p6, float& gd, float& gw,
                std::string& img_dir) {
    if (argc < 4)
        return false;
    if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) {
        wts = std::string(argv[2]);
        engine = std::string(argv[3]);
        auto net = std::string(argv[4]);
        if (net[0] == 'n') {
            gd = 0.33;
            gw = 0.25;
        } else if (net[0] == 's') {
            gd = 0.33;
            gw = 0.50;
        } else if (net[0] == 'm') {
            gd = 0.67;
            gw = 0.75;
        } else if (net[0] == 'l') {
            gd = 1.0;
            gw = 1.0;
        } else if (net[0] == 'x') {
            gd = 1.33;
            gw = 1.25;
        } else if (net[0] == 'c' && argc == 7) {
            gd = atof(argv[5]);
            gw = atof(argv[6]);
        } else {
            return false;
        }
        if (net.size() == 2 && net[1] == '6') {
            is_p6 = true;
        }
    } else if (std::string(argv[1]) == "-d" && argc == 4) {
        engine = std::string(argv[2]);
        img_dir = std::string(argv[3]);
    } else {
        return false;
    }
    return true;
}

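// prepare_buffers: check the engine's two named I/O tensors, then allocate one
// device buffer for the batched input image and one for the raw network output,
// plus the host buffer that results are copied back into.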
void prepare_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** gpu_output_buffer,
                     float** cpu_output_buffer) {
    assert(engine->getNbIOTensors() == 2);
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // TensorRT 10 looks tensors up by name rather than by binding index.
    TensorIOMode input_mode = engine->getTensorIOMode(kInputTensorName);
    if (input_mode != TensorIOMode::kINPUT) {
        std::cerr << kInputTensorName << " should be input tensor" << std::endl;
        assert(false);
    }
    TensorIOMode output_mode = engine->getTensorIOMode(kOutputTensorName);
    if (output_mode != TensorIOMode::kOUTPUT) {
        std::cerr << kOutputTensorName << " should be output tensor" << std::endl;
        assert(false);
    }
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer, kBatchSize * kOutputSize * sizeof(float)));

    *cpu_output_buffer = new float[kBatchSize * kOutputSize];
}

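// infer: TensorRT 10 drops the bindings array of enqueueV2 in favor of name-based
// addressing; buffers are attached with setInputTensorAddress / setOutputTensorAddress
// and the batch is launched with enqueueV3 on the given stream.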
void infer(IExecutionContext& context, cudaStream_t& stream, void** gpu_buffers, float* output, int batchsize) {
    context.setInputTensorAddress(kInputTensorName, gpu_buffers[0]);
    context.setOutputTensorAddress(kOutputTensorName, gpu_buffers[1]);
    context.enqueueV3(stream);
    CUDA_CHECK(cudaMemcpyAsync(output, gpu_buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
                               stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
}

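// serialize_engine: build_det_engine / build_det_p6_engine (from model.h) define the
// network with the builder API and return it already serialized; the plan is then
// written straight to disk in FP32 (DataType::kFLOAT).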
void serialize_engine(unsigned int max_batchsize, bool& is_p6, float& gd, float& gw, std::string& wts_name,
                      std::string& engine_name) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    IHostMemory* serialized_engine = nullptr;
    if (is_p6) {
        serialized_engine = build_det_p6_engine(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
    } else {
        serialized_engine = build_det_engine(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
    }
    assert(serialized_engine != nullptr);

    // Save engine to file
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cerr << "Could not open plan output file" << std::endl;
        assert(false);
    }
    p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

    // Close everything down
    delete serialized_engine;
    delete config;
    delete builder;
}

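// deserialize_engine: read the plan file into host memory and rebuild the runtime,
// engine, and execution context from it.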
void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
                        IExecutionContext** context) {
    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << "read " << engine_name << " error!" << std::endl;
        assert(false);
    }
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* serialized_engine = new char[size];
    assert(serialized_engine);
    file.read(serialized_engine, size);
    file.close();

    *runtime = createInferRuntime(gLogger);
    assert(*runtime);
    *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
    assert(*engine);
    *context = (*engine)->createExecutionContext();
    assert(*context);
    delete[] serialized_engine;
}

int main(int argc, char** argv) {
    // -s ../models/yolov5n.wts ../models/yolov5n.fp32.trt n
    // -d ../models/yolov5n.fp32.trt ../images
    cudaSetDevice(kGpuId);

    std::string wts_name = "";
    std::string engine_name = "";
    bool is_p6 = false;
    float gd = 0.0f, gw = 0.0f;
    std::string img_dir;

    if (!parse_args(argc, argv, wts_name, engine_name, is_p6, gd, gw, img_dir)) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./yolov5_det -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6 or c/c6 gd gw] // serialize model to "
                     "plan file"
                  << std::endl;
        std::cerr << "./yolov5_det -d [.engine] ../images // deserialize plan file and run inference" << std::endl;
        return -1;
    }

    // Create a model using the API directly and serialize it to a file
    if (!wts_name.empty()) {
        serialize_engine(kBatchSize, is_p6, gd, gw, wts_name, engine_name);
        return 0;
    }

    // Deserialize the engine from file
    IRuntime* runtime = nullptr;
    ICudaEngine* engine = nullptr;
    IExecutionContext* context = nullptr;
    deserialize_engine(engine_name, &runtime, &engine, &context);
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Init CUDA preprocessing
    cuda_preprocess_init(kMaxInputImageSize);
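    // cuda_preprocess_init presumably reserves the pinned host and device staging
    // buffers (bounded by kMaxInputImageSize) that cuda_batch_preprocess reuses for
    // every batch; its definition lives in the preprocess sources, not in this file.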

    // Prepare cpu and gpu buffers
    float* gpu_buffers[2];
    float* cpu_output_buffer = nullptr;
    prepare_buffers(engine, &gpu_buffers[0], &gpu_buffers[1], &cpu_output_buffer);

    // Read images from directory
    std::vector<std::string> file_names;
    if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
        std::cerr << "read_files_in_dir failed." << std::endl;
        return -1;
    }

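    // Note: infer always copies a full kBatchSize worth of output even when the
    // final batch is smaller; the buffers are sized for kBatchSize, and batch_nms
    // only decodes img_batch.size() entries.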
    // batch predict
    for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
        // Get a batch of images
        std::vector<cv::Mat> img_batch;
        std::vector<std::string> img_name_batch;
        for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
            cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
            if (img.empty()) {
                std::cerr << "Failed to read image: " << file_names[j] << std::endl;
                continue;
            }
            img_batch.push_back(img);
            img_name_batch.push_back(file_names[j]);
        }

        // Preprocess
        cuda_batch_preprocess(img_batch, gpu_buffers[0], kInputW, kInputH, stream);

        // Run inference (steady_clock: monotonic, suitable for interval timing)
        auto start = std::chrono::steady_clock::now();
        infer(*context, stream, (void**)gpu_buffers, cpu_output_buffer, kBatchSize);
        auto end = std::chrono::steady_clock::now();
        std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;

        // NMS
        std::vector<std::vector<Detection>> res_batch;
        batch_nms(res_batch, cpu_output_buffer, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);

        // print results
        for (size_t j = 0; j < res_batch.size(); j++) {
            for (size_t k = 0; k < res_batch[j].size(); k++) {
                std::cout << "image: " << img_name_batch[j] << ", bbox: " << res_batch[j][k].bbox[0] << ", "
                          << res_batch[j][k].bbox[1] << ", " << res_batch[j][k].bbox[2] << ", "
                          << res_batch[j][k].bbox[3] << ", conf: " << res_batch[j][k].conf
                          << ", class_id: " << res_batch[j][k].class_id << std::endl;
            }
        }

        // Draw bounding boxes
        draw_bbox(img_batch, res_batch);

        // Save images
        for (size_t j = 0; j < img_batch.size(); j++) {
            cv::imwrite("_" + img_name_batch[j], img_batch[j]);
        }
    }

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(gpu_buffers[0]));
    CUDA_CHECK(cudaFree(gpu_buffers[1]));
    delete[] cpu_output_buffer;
    cuda_preprocess_destroy();
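    // TensorRT 10 releases its objects with plain delete; the destroy() method of
    // earlier releases is gone.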
    // Destroy the engine
    delete context;
    delete engine;
    delete runtime;

    return 0;
}
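
The CUDA_CHECK macro used throughout comes from cuda_utils.h, which is not part of this diff. A minimal sketch of such a macro, assuming the usual abort-on-error convention (the repository's actual definition may differ):

#include <cuda_runtime_api.h>

#include <cassert>
#include <iostream>

// Evaluate a CUDA runtime call and fail loudly, with file/line context, on error.
#define CUDA_CHECK(call)                                                           \
    do {                                                                           \
        cudaError_t error_code = (call);                                           \
        if (error_code != cudaSuccess) {                                           \
            std::cerr << "CUDA error " << cudaGetErrorName(error_code) << ": "     \
                      << cudaGetErrorString(error_code) << " at " << __FILE__      \
                      << ":" << __LINE__ << std::endl;                             \
            assert(false);                                                         \
        }                                                                          \
    } while (0)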