[CVCUDA] PP-OCR Cls & Rec preprocessor support CV-CUDA (PaddlePaddle#1470)

* ppocr cls preprocessor use manager

* hwc2chw cvcuda

* ppocr rec preproc use manager

* ocr rec preproc cvcuda

* fix rec preproc bug

* ppocr cls&rec preproc set normalize

* fix pybind

* address comment
wang-xinyu authored Mar 2, 2023
1 parent fe2882a commit 044ab99
Showing 19 changed files with 424 additions and 306 deletions.
28 changes: 20 additions & 8 deletions fastdeploy/vision/common/processors/cvcuda_utils.cc
@@ -18,34 +18,36 @@ namespace fastdeploy {
namespace vision {

#ifdef ENABLE_CVCUDA
nvcv::ImageFormat CreateCvCudaImageFormat(FDDataType type, int channel) {
nvcv::ImageFormat CreateCvCudaImageFormat(FDDataType type, int channel,
bool interleaved) {
FDASSERT(channel == 1 || channel == 3 || channel == 4,
"Only support channel be 1/3/4 in CV-CUDA.");
if (type == FDDataType::UINT8) {
if (channel == 1) {
return nvcv::FMT_U8;
} else if (channel == 3) {
return nvcv::FMT_BGR8;
return (interleaved ? nvcv::FMT_BGR8 : nvcv::FMT_BGR8p);
} else {
return nvcv::FMT_BGRA8;
return (interleaved ? nvcv::FMT_BGRA8 : nvcv::FMT_BGRA8p);
}
} else if (type == FDDataType::FP32) {
if (channel == 1) {
return nvcv::FMT_F32;
} else if (channel == 3) {
return nvcv::FMT_BGRf32;
return (interleaved ? nvcv::FMT_BGRf32 : nvcv::FMT_BGRf32p);
} else {
return nvcv::FMT_BGRAf32;
return (interleaved ? nvcv::FMT_BGRAf32 : nvcv::FMT_BGRAf32p);
}
}
FDASSERT(false, "Data type of %s is not supported.", Str(type).c_str());
return nvcv::FMT_BGRf32;
}

nvcv::TensorWrapData CreateCvCudaTensorWrapData(const FDTensor& tensor) {
nvcv::TensorWrapData CreateCvCudaTensorWrapData(const FDTensor& tensor,
Layout layout) {
FDASSERT(tensor.shape.size() == 3,
"When create CVCUDA tensor from FD tensor,"
"tensor shape should be 3-Dim, HWC layout");
"tensor shape should be 3-Dim,");
int batchsize = 1;
int h = tensor.Shape()[0];
int w = tensor.Shape()[1];
@@ -56,10 +58,20 @@ nvcv::TensorWrapData CreateCvCudaTensorWrapData(const FDTensor& tensor) {
buf.strides[2] = c * buf.strides[3];
buf.strides[1] = w * buf.strides[2];
buf.strides[0] = h * buf.strides[1];
if (layout == Layout::CHW) {
c = tensor.Shape()[0];
h = tensor.Shape()[1];
w = tensor.Shape()[2];
buf.strides[3] = FDDataTypeSize(tensor.Dtype());
buf.strides[2] = w * buf.strides[3];
buf.strides[1] = h * buf.strides[2];
buf.strides[0] = c * buf.strides[1];
}
buf.basePtr = reinterpret_cast<NVCVByte*>(const_cast<void*>(tensor.Data()));

nvcv::Tensor::Requirements req = nvcv::Tensor::CalcRequirements(
batchsize, {w, h}, CreateCvCudaImageFormat(tensor.Dtype(), c));
batchsize, {w, h},
CreateCvCudaImageFormat(tensor.Dtype(), c, layout == Layout::HWC));

nvcv::TensorDataStridedCuda tensor_data(
nvcv::TensorShape{req.shape, req.rank, req.layout},
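The new Layout::CHW branch above is the heart of this change: for an interleaved HWC tensor the innermost (fastest-varying) dimension is the channel, while for a planar CHW tensor it is the width. A minimal standalone sketch of the same stride arithmetic, written here only for illustration (the ComputeStrides helper is not part of FastDeploy):

#include <array>
#include <cstdint>

// Byte strides in the order used by nvcv::TensorDataStridedCuda::Buffer,
// i.e. [batch, dim1, dim2, dim3], for a single image (batch == 1).
std::array<int64_t, 4> ComputeStrides(int h, int w, int c, int element_size,
                                      bool interleaved /* true = HWC */) {
  std::array<int64_t, 4> s;
  s[3] = element_size;      // innermost element
  if (interleaved) {        // HWC: pixel -> row -> image
    s[2] = c * s[3];
    s[1] = w * s[2];
    s[0] = h * s[1];
  } else {                  // CHW: row -> plane -> image
    s[2] = w * s[3];
    s[1] = h * s[2];
    s[0] = c * s[1];
  }
  return s;
}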
7 changes: 5 additions & 2 deletions fastdeploy/vision/common/processors/cvcuda_utils.h
@@ -15,6 +15,7 @@
#pragma once

#include "fastdeploy/core/fd_tensor.h"
#include "fastdeploy/vision/common/processors/mat.h"

#ifdef ENABLE_CVCUDA
#include "nvcv/Tensor.hpp"
@@ -23,8 +24,10 @@
namespace fastdeploy {
namespace vision {

nvcv::ImageFormat CreateCvCudaImageFormat(FDDataType type, int channel);
nvcv::TensorWrapData CreateCvCudaTensorWrapData(const FDTensor& tensor);
nvcv::ImageFormat CreateCvCudaImageFormat(FDDataType type, int channel,
bool interleaved = true);
nvcv::TensorWrapData CreateCvCudaTensorWrapData(const FDTensor& tensor,
Layout layout = Layout::HWC);
void* GetCvCudaTensorDataPtr(const nvcv::TensorWrapData& tensor);
nvcv::ImageWrapData CreateImageWrapData(const FDTensor& tensor);
void CreateCvCudaImageBatchVarShape(std::vector<FDTensor*>& tensors,
20 changes: 20 additions & 0 deletions fastdeploy/vision/common/processors/hwc2chw.cc
@@ -63,6 +63,26 @@ bool HWC2CHW::ImplByFlyCV(Mat* mat) {
}
#endif

#ifdef ENABLE_CVCUDA
bool HWC2CHW::ImplByCvCuda(FDMat* mat) {
// Prepare input tensor
FDTensor* src = CreateCachedGpuInputTensor(mat);
auto src_tensor = CreateCvCudaTensorWrapData(*src);

// Prepare output tensor
mat->output_cache->Resize({mat->Channels(), mat->Height(), mat->Width()},
src->Dtype(), "output_cache", Device::GPU);
auto dst_tensor =
CreateCvCudaTensorWrapData(*(mat->output_cache), Layout::CHW);

cvcuda_reformat_op_(mat->Stream(), src_tensor, dst_tensor);

mat->SetTensor(mat->output_cache);
mat->mat_type = ProcLib::CVCUDA;
return true;
}
#endif

bool HWC2CHW::Run(Mat* mat, ProcLib lib) {
auto h = HWC2CHW();
return h(mat, lib);
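For reference, cvcuda::Reformat is asked to copy from the interleaved (HWC) wrap of the source into the planar (CHW) wrap of output_cache, which is equivalent to the plain transpose below. This CPU version is purely illustrative and not part of the commit:

// Reference semantics of HWC -> CHW for one image (CPU, illustrative only).
void HwcToChw(const uint8_t* src, uint8_t* dst, int h, int w, int c) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      for (int k = 0; k < c; ++k) {
        dst[k * h * w + y * w + x] = src[(y * w + x) * c + k];
      }
    }
  }
}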
12 changes: 12 additions & 0 deletions fastdeploy/vision/common/processors/hwc2chw.h
@@ -15,6 +15,11 @@
#pragma once

#include "fastdeploy/vision/common/processors/base.h"
#ifdef ENABLE_CVCUDA
#include <cvcuda/OpReformat.hpp>

#include "fastdeploy/vision/common/processors/cvcuda_utils.h"
#endif

namespace fastdeploy {
namespace vision {
@@ -24,10 +29,17 @@ class FASTDEPLOY_DECL HWC2CHW : public Processor {
bool ImplByOpenCV(Mat* mat);
#ifdef ENABLE_FLYCV
bool ImplByFlyCV(Mat* mat);
#endif
#ifdef ENABLE_CVCUDA
bool ImplByCvCuda(FDMat* mat);
#endif
std::string Name() { return "HWC2CHW"; }

static bool Run(Mat* mat, ProcLib lib = ProcLib::DEFAULT);
private:
#ifdef ENABLE_CVCUDA
cvcuda::Reformat cvcuda_reformat_op_;
#endif
};
} // namespace vision
} // namespace fastdeploy
1 change: 1 addition & 0 deletions fastdeploy/vision/common/processors/manager.cc
@@ -73,6 +73,7 @@ bool ProcessorManager::Run(std::vector<FDMat>* images,
}
(*images)[i].input_cache = &input_caches_[i];
(*images)[i].output_cache = &output_caches_[i];
(*images)[i].proc_lib = proc_lib_;
if ((*images)[i].mat_type == ProcLib::CUDA) {
// Make a copy of the input data ptr, so that the original data ptr of
// FDMat won't be modified.
22 changes: 14 additions & 8 deletions fastdeploy/vision/common/processors/mat.cc
@@ -272,6 +272,9 @@ std::vector<FDMat> WrapMat(const std::vector<cv::Mat>& images) {
}

bool CheckShapeConsistency(std::vector<Mat>* mats) {
if (mats == nullptr) {
return true;
}
for (size_t i = 1; i < mats->size(); ++i) {
if ((*mats)[i].Channels() != (*mats)[0].Channels() ||
(*mats)[i].Width() != (*mats)[0].Width() ||
@@ -285,21 +288,24 @@
FDTensor* CreateCachedGpuInputTensor(Mat* mat) {
#ifdef WITH_GPU
FDTensor* src = mat->Tensor();
// Need to make sure the tensor is pointed to the input_cache.
if (src->Data() == mat->output_cache->Data()) {
std::swap(mat->input_cache, mat->output_cache);
std::swap(mat->input_cache->name, mat->output_cache->name);
}
if (src->device == Device::GPU) {
if (src->Data() == mat->output_cache->Data()) {
std::swap(mat->input_cache, mat->output_cache);
std::swap(mat->input_cache->name, mat->output_cache->name);
}
return src;
} else if (src->device == Device::CPU) {
// Mats on CPU, we need copy these tensors from CPU to GPU
// Tensor on CPU, we need copy it from CPU to GPU
FDASSERT(src->Shape().size() == 3, "The CPU tensor must has 3 dims.")
mat->input_cache->Resize(src->Shape(), src->Dtype(), "input_cache",
Device::GPU);
mat->output_cache->Resize(src->Shape(), src->Dtype(), "output_cache",
Device::GPU);
FDASSERT(
cudaMemcpyAsync(mat->input_cache->Data(), src->Data(), src->Nbytes(),
cudaMemcpyAsync(mat->output_cache->Data(), src->Data(), src->Nbytes(),
cudaMemcpyHostToDevice, mat->Stream()) == 0,
"[ERROR] Error occurs while copy memory from CPU to GPU.");
std::swap(mat->input_cache, mat->output_cache);
std::swap(mat->input_cache->name, mat->output_cache->name);
return mat->input_cache;
} else {
FDASSERT(false, "FDMat is on unsupported device: %d", src->device);
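The swap above enforces a simple double-buffering contract: the tensor returned by CreateCachedGpuInputTensor always aliases input_cache, so a processor can resize and write output_cache without clobbering its own input. A hedged sketch of that pattern from a processor's point of view (ToyProcessor and the kernel launch are illustrative, not FastDeploy code):

bool ToyProcessor(FDMat* mat) {
  // Always aliases mat->input_cache after the call, resident on the GPU.
  FDTensor* src = CreateCachedGpuInputTensor(mat);
  // Safe to resize: output_cache is guaranteed not to back src.
  mat->output_cache->Resize(src->Shape(), src->Dtype(), "output_cache",
                            Device::GPU);
  // ... launch a kernel reading src->Data() and writing
  //     mat->output_cache->Data() on mat->Stream() ...
  mat->SetTensor(mat->output_cache);  // result becomes the mat's tensor
  return true;
}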
40 changes: 25 additions & 15 deletions fastdeploy/vision/common/processors/mat_batch.cc
@@ -29,10 +29,12 @@ FDTensor* FDMatBatch::Tensor() {
if (has_batched_tensor) {
return fd_tensor.get();
}
FDASSERT(CheckShapeConsistency(mats), "Mats shapes are not consistent.")
FDASSERT(mats != nullptr, "Failed to get batched tensor, Mats are empty.");
FDASSERT(CheckShapeConsistency(mats), "Mats shapes are not consistent.");
// Each mat has its own tensor,
// to get a batched tensor, we need copy these tensors to a batched tensor
FDTensor* src = (*mats)[0].Tensor();
device = src->device;
auto new_shape = src->Shape();
new_shape.insert(new_shape.begin(), mats->size());
input_cache->Resize(new_shape, src->Dtype(), "batch_input_cache", device);
@@ -51,26 +53,34 @@
void FDMatBatch::SetTensor(FDTensor* tensor) {
fd_tensor->SetExternalData(tensor->Shape(), tensor->Dtype(), tensor->Data(),
tensor->device, tensor->device_id);
device = tensor->device;
has_batched_tensor = true;
}

FDTensor* CreateCachedGpuInputTensor(FDMatBatch* mat_batch) {
#ifdef WITH_GPU
auto mats = mat_batch->mats;
FDASSERT(CheckShapeConsistency(mats), "Mats shapes are not consistent.")
FDTensor* src = (*mats)[0].Tensor();
if (mat_batch->device == Device::GPU) {
return mat_batch->Tensor();
} else if (mat_batch->device == Device::CPU) {
// Mats on CPU, we need copy them to GPU and then get a batched GPU tensor
for (size_t i = 0; i < mats->size(); ++i) {
FDTensor* tensor = CreateCachedGpuInputTensor(&(*mats)[i]);
(*mats)[i].SetTensor(tensor);
}
mat_batch->device = Device::GPU;
return mat_batch->Tensor();
// Get the batched tensor
FDTensor* src = mat_batch->Tensor();
// Need to make sure the returned tensor is pointed to the input_cache.
if (src->Data() == mat_batch->output_cache->Data()) {
std::swap(mat_batch->input_cache, mat_batch->output_cache);
std::swap(mat_batch->input_cache->name, mat_batch->output_cache->name);
}
if (src->device == Device::GPU) {
return src;
} else if (src->device == Device::CPU) {
// Batched tensor on CPU, we need copy it to GPU
mat_batch->output_cache->Resize(src->Shape(), src->Dtype(), "output_cache",
Device::GPU);
FDASSERT(cudaMemcpyAsync(mat_batch->output_cache->Data(), src->Data(),
src->Nbytes(), cudaMemcpyHostToDevice,
mat_batch->Stream()) == 0,
"[ERROR] Error occurs while copy memory from CPU to GPU.");
std::swap(mat_batch->input_cache, mat_batch->output_cache);
std::swap(mat_batch->input_cache->name, mat_batch->output_cache->name);
return mat_batch->input_cache;
} else {
FDASSERT(false, "FDMat is on unsupported device: %d", src->device);
FDASSERT(false, "FDMatBatch is on unsupported device: %d", src->device);
}
#else
FDASSERT(false, "FastDeploy didn't compile with WITH_GPU.");
2 changes: 1 addition & 1 deletion fastdeploy/vision/common/processors/mat_batch.h
@@ -56,7 +56,7 @@ struct FASTDEPLOY_DECL FDMatBatch {
void SetStream(cudaStream_t s);
#endif

std::vector<FDMat>* mats;
std::vector<FDMat>* mats = nullptr;
ProcLib mat_type = ProcLib::OPENCV;
FDMatBatchLayout layout = FDMatBatchLayout::NHWC;
Device device = Device::CPU;
116 changes: 116 additions & 0 deletions fastdeploy/vision/common/processors/normalize.cu
@@ -0,0 +1,116 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef WITH_GPU
#include "fastdeploy/vision/common/processors/normalize.h"

namespace fastdeploy {
namespace vision {

__global__ void NormalizeKernel(const uint8_t* src, float* dst,
const float* alpha, const float* beta,
int num_channel, bool swap_rb, int batch_size,
int edge) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx >= edge) return;

int img_size = edge / batch_size;
int n = idx / img_size; // batch index
int p = idx - (n * img_size); // pixel index within the image

for (int i = 0; i < num_channel; ++i) {
int j = i;
if (swap_rb) {
j = 2 - i;
}
dst[num_channel * idx + j] =
src[num_channel * idx + j] * alpha[i] + beta[i];
}
}

bool Normalize::ImplByCuda(FDMat* mat) {
if (mat->layout != Layout::HWC) {
FDERROR << "The input data must be NHWC format!" << std::endl;
return false;
}

// Prepare input tensor
FDTensor* src = CreateCachedGpuInputTensor(mat);
src->ExpandDim(0);
FDMatBatch mat_batch;
mat_batch.SetTensor(src);
mat_batch.mat_type = ProcLib::CUDA;
mat_batch.input_cache = mat->input_cache;
mat_batch.output_cache = mat->output_cache;

bool ret = ImplByCuda(&mat_batch);

FDTensor* dst = mat_batch.Tensor();
dst->Squeeze(0);
mat->SetTensor(dst);
mat->mat_type = ProcLib::CUDA;
return true;
}

bool Normalize::ImplByCuda(FDMatBatch* mat_batch) {
if (mat_batch->layout != FDMatBatchLayout::NHWC) {
FDERROR << "The input data must be NHWC format!" << std::endl;
return false;
}
// Prepare input tensor
FDTensor* src = CreateCachedGpuInputTensor(mat_batch);

// Prepare output tensor
mat_batch->output_cache->Resize(src->Shape(), FDDataType::FP32,
"batch_output_cache", Device::GPU);

// Copy alpha and beta to GPU
gpu_alpha_.Resize({1, 1, static_cast<int>(alpha_.size())}, FDDataType::FP32,
"alpha", Device::GPU);
cudaMemcpy(gpu_alpha_.Data(), alpha_.data(), gpu_alpha_.Nbytes(),
cudaMemcpyHostToDevice);

gpu_beta_.Resize({1, 1, static_cast<int>(beta_.size())}, FDDataType::FP32,
"beta", Device::GPU);
cudaMemcpy(gpu_beta_.Data(), beta_.data(), gpu_beta_.Nbytes(),
cudaMemcpyHostToDevice);

int jobs =
mat_batch->output_cache->Numel() / mat_batch->output_cache->shape[3];
int threads = 256;
int blocks = ceil(jobs / (float)threads);
NormalizeKernel<<<blocks, threads, 0, mat_batch->Stream()>>>(
reinterpret_cast<uint8_t*>(src->Data()),
reinterpret_cast<float*>(mat_batch->output_cache->Data()),
reinterpret_cast<float*>(gpu_alpha_.Data()),
reinterpret_cast<float*>(gpu_beta_.Data()),
mat_batch->output_cache->shape[3], swap_rb_,
mat_batch->output_cache->shape[0], jobs);
mat_batch->SetTensor(mat_batch->output_cache);
mat_batch->mat_type = ProcLib::CUDA;
return true;
}
#ifdef ENABLE_CVCUDA
bool Normalize::ImplByCvCuda(FDMat* mat) { return ImplByCuda(mat); }
bool Normalize::ImplByCvCuda(FDMatBatch* mat_batch) {
return ImplByCuda(mat_batch);
}
#endif
} // namespace vision
} // namespace fastdeploy
#endif
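NormalizeKernel launches one thread per pixel (jobs = N*H*W) and loops over the channels, mirroring the channel index when swap_rb is set so that the red and blue normalization constants are exchanged while the data stays in place. A CPU reference of the same arithmetic, useful only for sanity-checking the GPU output (not part of the commit):

// CPU reference of NormalizeKernel (NHWC layout; illustrative only).
void NormalizeReference(const uint8_t* src, float* dst, const float* alpha,
                        const float* beta, int num_pixels, int num_channel,
                        bool swap_rb) {
  for (int idx = 0; idx < num_pixels; ++idx) {   // idx enumerates N*H*W pixels
    for (int i = 0; i < num_channel; ++i) {
      int j = swap_rb ? 2 - i : i;               // same index mirroring as the kernel
      dst[num_channel * idx + j] =
          src[num_channel * idx + j] * alpha[i] + beta[i];
    }
  }
}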