Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

求救:CPU模式下推理正确,GPU(CUDA驱动)推理异常 #3159

Open
HEMOURENNN opened this issue Jan 9, 2025 · 2 comments
Open
Labels
bug Something isn't working CUDA

Comments

@HEMOURENNN
Copy link

平台(如果交叉编译请再附上交叉编译目标平台): Win10

Platform(Include target platform as well if cross-compiling):

Github版本:3.0.3

Github Version: 8180b48

直接下载ZIP包请提供下载日期以及压缩包注释里的git版本(可通过7z l zip包路径命令并在输出信息中搜索Comment 获得,形如Comment = bc80b11110cd440aacdabbf59658d630527a7f2b)。 git clone请提供 git commit 第一行的commit id

Provide date (or better yet, git revision from the comment section of the zip. Obtainable using 7z l PATH/TO/ZIP and search for Comment in the output) if downloading source as zip; otherwise provide the first commit id from the output of git log.

编译方式:

Compiling Method

请在这里粘贴cmake参数或使用的cmake脚本路径以及完整输出
cmake  -G "Ninja"  -DCMAKE_BUILD_TYPE:STRING="Release"  -DMNN_BUILD_DEMO:BOOL="True" -DMNN_CPU_WEIGHT_DEQUANT_GEMM:BOOL="True" -DMNN_LOW_MEMORY:BOOL="True" -DMNN_BUILD_OPENCV:BOOL="True" -DMNN_CUDA:BOOL="True" -DMNN_WIN_RUNTIME_MT:BOOL="True"

具体代码:

cv::Mat TransImageToTensorByCV(const char* path, Tensor* tensor) {
	// Load the image at `path`, letterbox it to the tensor's HxW, normalize
	// with ImageNet mean/std, and upload the result into `tensor`
	// (NHWC / TENSORFLOW layout, float32).
	// Returns the decoded RGB image, or an empty cv::Mat on read failure.
	// FIX: the original `new Tensor` was never deleted — leaked on both the
	// success and the error path. A unique_ptr releases it on every exit.
	std::unique_ptr<Tensor> inputTensor(new Tensor(tensor, Tensor::TENSORFLOW));
	auto targetChannel = inputTensor->channel();
	auto targetHeight  = inputTensor->height();
	auto targetWidth   = inputTensor->width();
	MNN_PRINT("input: w:%d , h:%d, bpp: %d\n", targetWidth, targetHeight, targetChannel);

	// Read the image.
	// FIX: the original ran cvtColor() BEFORE the empty() check, which throws
	// on an unreadable path; validate first, convert after.
	auto image = cv::imread(path, cv::IMREAD_COLOR);
	if (image.empty()) {
		MNN_ERROR("Can't open image %s\n", path);
		return cv::Mat();
	}
	// OpenCV decodes as BGR; the model expects RGB.
	cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
	MNN_PRINT("origin size: %d, %d\n", image.cols, image.rows);

	// Zero-filled canvas the resized image is centered into (letterboxing).
	cv::Mat dstUintImage(targetHeight, targetWidth, CV_8UC3, cv::Scalar(0, 0, 0));

	// Uniform scale that fits the whole image inside the target while
	// preserving aspect ratio.
	float scale = std::min(static_cast<float>(targetWidth) / image.cols,
	                       static_cast<float>(targetHeight) / image.rows);
	cv::Mat resizedImage;
	cv::resize(image, resizedImage, cv::Size(), scale, scale, cv::INTER_CUBIC);

	// Centering offsets for the letterbox padding.
	int startX = (targetWidth - resizedImage.cols) / 2;
	int startY = (targetHeight - resizedImage.rows) / 2;
	resizedImage.copyTo(dstUintImage(cv::Rect(startX, startY, resizedImage.cols, resizedImage.rows)));

	// uint8 [0,255] -> float32 [0,1]
	cv::Mat dstFloatImage;
	dstUintImage.convertTo(dstFloatImage, CV_32FC3, 1.0 / 255.0);

	// Standardize with the usual ImageNet per-channel mean / std.
	float means[3]   = { 0.485f, 0.456f, 0.406f };
	float normals[3] = { 0.229f, 0.224f, 0.225f };
	dstFloatImage -= cv::Scalar(means[0], means[1], means[2]);
	dstFloatImage /= cv::Scalar(normals[0], normals[1], normals[2]);

	// dstFloatImage was freshly allocated by convertTo, so it is continuous
	// and can be copied as one flat HWC float buffer into the host tensor,
	// then pushed to the backend (possibly GPU) tensor.
	::memcpy(inputTensor->host<float>(), dstFloatImage.data,
	         targetHeight * targetWidth * targetChannel * sizeof(float));
	tensor->copyFromHostTensor(inputTensor.get());

	return image;
}

void ProcBBoxResult(Tensor* bbox, int topkBoxNum, cv::Mat oriSearchImage) {
	// Copy the bbox output off the (possibly GPU) backend into a host-side
	// tensor and print it. Layout is presumably (cx, cy, w, h) per box —
	// stride(0) == 4 per the author's note; confirm against the model.
	// `topkBoxNum` and `oriSearchImage` are currently unused; they are kept
	// so the caller-visible signature stays unchanged.
	(void)topkBoxNum;
	(void)oriSearchImage;

	std::shared_ptr<Tensor> bboxTensor(new Tensor(bbox, Tensor::TENSORFLOW));
	bbox->copyToHostTensor(bboxTensor.get());

	// FIX: removed dead locals from the original (stride, topkBboxs,
	// topkTargets, bboxValues) — all computed but never read.
	bboxTensor->print();
}

int main(int argc, const char* argv[]) {
	if (argc < 3) {
		MNN_PRINT("Usage: ./pictureRecognition.out model.mnn input0.jpg input1.jpg input2.jpg ... \n");
		return 0;
	}
	// 初始化网络
	std::shared_ptr<Interpreter> net(Interpreter::createFromFile(argv[1]), Interpreter::destroy);
	net->setCacheFile(".cachefile");
	net->setSessionMode(Interpreter::Session_Backend_Auto);
	net->setSessionHint(Interpreter::MAX_TUNING_NUMBER, 5);
	ScheduleConfig config;
	config.type = MNN_FORWARD_CUDA;



	if (config.type == MNN_FORWARD_CUDA) {
		BackendConfig bnconfig;

		// GPU设备选择
		MNNDeviceContext gpuDeviceConfig;
		gpuDeviceConfig.deviceId = 0;
		bnconfig.sharedContext = &gpuDeviceConfig;

		//
		BackendConfig::PrecisionMode precision = MNN::BackendConfig::Precision_High;
		bnconfig.precision = precision;

		config.backendConfig = &bnconfig;
	}

	auto session = net->createSession(config);
	float memoryUsage = 0.0f;
	net->getSessionInfo(session, MNN::Interpreter::MEMORY, &memoryUsage);
	float flops = 0.0f;
	net->getSessionInfo(session, MNN::Interpreter::FLOPS, &flops);
	int backendType[2];
	net->getSessionInfo(session, MNN::Interpreter::BACKENDS, backendType);
	MNN_PRINT("Session Info: memory use %f MB, flops is %f M, backendType is %d, batch size = %d\n", memoryUsage, flops, backendType[0], argc - 2);

	// 初始化输入
	auto inputTemp = net->getSessionInput(session, "input.1");
	auto inputSearch = net->getSessionInput(session, "x.1");
	Tensor* input[] = { inputTemp ,inputSearch };
	cv::Mat inputImage[2];

	// 输入图像预处理
	for (int i = 0; i < 2; i++) {
		// TransImageToTensor(argv[i + 2], input[i], batch);
		inputImage[i] = TransImageToTensorByCV(argv[i + 2], input[i]);
	}

	// 初始化输出
	auto boxOutput = net->getSessionOutput(session, "1380");
	auto offsetMapOutput = net->getSessionOutput(session, "offset_map");
	auto scoreMapOutput = net->getSessionOutput(session, "score_map_ctr.3");
	auto sizeMapOutput = net->getSessionOutput(session, "size_map");

	Tensor* output[] = { boxOutput ,offsetMapOutput, scoreMapOutput ,sizeMapOutput };
	// 开始推理
	net->runSession(session);
	for (int i = 0; i < 4; i++) {
		auto dimType = output[i]->getDimensionType();
		if (output[i]->getType().code != halide_type_float) {
			dimType = Tensor::TENSORFLOW;
		}
	}

	// 处理bbox结果
	BBOX_RESULT bboxResult = ProcBBoxResult(boxOutput, 1, inputImage[1]);
	net->updateCacheFile(session);
	return 0;
}

在GPU(CUDA)和CPU下进行了多组不同数据的实验后发现:
GPU模式下ProcBBoxResult函数中bboxTensor->print();打印的第二个值永远都是错误的,而且值很大,其他的三个值都是正确的。下面列出2组数据:
数据①
CPU:
image
GPU:
image
数据②
CPU:
image
GPU:
image

@jxt1234 jxt1234 added CUDA bug Something isn't working labels Jan 9, 2025
@jxt1234
Copy link
Collaborator

jxt1234 commented Jan 9, 2025

原始模型麻烦发一下看看,另外用 backendTest.out 可以测试出问题么?

@HEMOURENNN
Copy link
Author

HEMOURENNN commented Jan 13, 2025

原始模型麻烦发一下看看,另外用 backendTest.out 可以测试出问题么?

大佬在吗?
我的模型是双输入的,大小有300+M。github好像上传不了,怎么发给你呢?
我目前也在尝试用backendTest.out进行测试

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working CUDA
Projects
None yet
Development

No branches or pull requests

2 participants