Enable FP16 Clip and Handle Bias in FP16 Depthwise Conv (#21493)
- Improved accuracy for face-detection, image-classification, and
object-detection in the GeekBench ML benchmark on ARM64.
- Fixed issue #18992
yihonglyu authored Jul 30, 2024
1 parent 82036b0 commit 530a2d7
Showing 9 changed files with 531 additions and 20 deletions.
4 changes: 2 additions & 2 deletions docs/OperatorKernels.md
@@ -58,8 +58,8 @@ Do not modify directly.*
|Ceil|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float)|
|||[6, 12]|**T** = tensor(double), tensor(float)|
|Celu|*in* X:**T**<br> *out* Y:**T**|12+|**T** = tensor(float)|
|Clip|*in* input:**T**<br> *in* min:**T**<br> *in* max:**T**<br> *out* output:**T**<br><br>or<br><br>*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|||12|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|Clip|*in* input:**T**<br> *in* min:**T**<br> *in* max:**T**<br> *out* output:**T**<br><br>or<br><br>*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|||12|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|||11|**T** = tensor(float)|
|||[6, 10]|**T** = tensor(float)|
|Col2Im|*in* input:**T**<br> *in* image_shape:**tensor(int64)**<br> *in* block_shape:**tensor(int64)**<br> *out* output:**T**|18+|**T** = tensor(float)|
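The two updated rows above record that the CPU Clip kernel now also accepts `tensor(float16)` at opsets 12 and 13+. As a minimal sketch of the elementwise semantics this enables (the `float16.h` include path and the round-trip through `float` are assumptions, not necessarily how the registered kernel is implemented):

```cpp
#include <algorithm>
#include <vector>

#include "core/framework/float16.h"  // onnxruntime::MLFloat16 (assumed include path)

// Clip each fp16 element to [min_v, max_v] by converting through float,
// mirroring the operator semantics exercised by the new float16 registration.
std::vector<onnxruntime::MLFloat16> ClipFp16(const std::vector<onnxruntime::MLFloat16>& x,
                                             float min_v, float max_v) {
  std::vector<onnxruntime::MLFloat16> y;
  y.reserve(x.size());
  for (const auto& v : x) {
    y.push_back(onnxruntime::MLFloat16(std::clamp(v.ToFloat(), min_v, max_v)));
  }
  return y;
}
```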
2 changes: 2 additions & 0 deletions onnxruntime/core/mlas/inc/mlas.h
@@ -1751,6 +1751,7 @@ MlasSBGemmConvertPackB(size_t N, size_t K, const float* B, size_t ldb, void* Pac
* @brief Indirect Depthwise convolution for fp16
* @param Input Supplies the indirect buffer for NHWC input
* @param Filter Supplies the address for filter tensor
* @param Bias Supplies the address for 1D bias tensor B, has size of M
* @param Output Supplies the address for the result tensor
* @param Channels # of input channels
* @param OutputCount # of output pixels
@@ -1762,6 +1763,7 @@ MLASCALL
MlasConvDepthwise(
const MLAS_FP16* const* Input,
const MLAS_FP16* Filter,
const MLAS_FP16* Bias,
MLAS_FP16* Output,
size_t Channels,
size_t OutputCount,
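With the new `Bias` parameter, a caller-side sketch might look as follows. Buffer layouts are inferred from the kernel in `dwconv.cpp` below; the bare `mlas.h` include and the wrapper helper itself are illustrative assumptions, not part of the change:

```cpp
#include "mlas.h"  // assumed include for onnxruntime/core/mlas/inc/mlas.h

// Run the fp16 depthwise convolution. The indirection buffer supplies
// KernelSize input-row pointers per output pixel; Filter is laid out as
// [KernelSize x Channels]; Bias has Channels elements or may be nullptr,
// in which case the kernel starts from zero accumulators as before.
void RunDepthwiseFp16(const MLAS_FP16* const* indirection,  // OutputCount * KernelSize pointers
                      const MLAS_FP16* filter,              // KernelSize * Channels elements
                      const MLAS_FP16* bias,                // Channels elements, or nullptr
                      MLAS_FP16* output,                    // OutputCount * Channels elements
                      size_t channels, size_t output_count, size_t kernel_size) {
  MlasConvDepthwise(indirection, filter, bias, output,
                    channels, output_count, kernel_size,
                    /*PostProc=*/nullptr);  // nullptr skips the post-processing step
}
```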
32 changes: 17 additions & 15 deletions onnxruntime/core/mlas/lib/dwconv.cpp
@@ -14,7 +14,6 @@ Module Name:
--*/


#include "fp16_common.h"

#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
@@ -24,19 +23,20 @@ void
MlasConvDepthwiseKernel(
const _mlas_fp16_* const* Input,
const _mlas_fp16_* Filter,
const _mlas_fp16_* Bias,
_mlas_fp16_* Output,
size_t Channels,
size_t OutputCount,
size_t KernelSize,
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
)
)
{
while (OutputCount > 0) {
size_t ChannelOffset = 0;
size_t c = Channels;

while (c >= 8) {
MLAS_FLOAT16X8 Accumulator = MlasZeroFloat16x8();
MLAS_FLOAT16X8 Accumulator = Bias == nullptr ? MlasZeroFloat16x8() : MlasLoadFloat16x8(&Bias[ChannelOffset]);
size_t ChannelKernelOffset = ChannelOffset;

for (size_t k = 0; k < KernelSize; k++) {
@@ -54,7 +54,7 @@ MlasConvDepthwiseKernel(
}

if (c >= 4) {
MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4();
MLAS_FLOAT16X4 Accumulator = Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadFloat16x4(&Bias[ChannelOffset]);
size_t ChannelKernelOffset = ChannelOffset;

for (size_t k = 0; k < KernelSize; k++) {
@@ -72,7 +72,8 @@ }
}

if (c > 0) {
MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4();
MLAS_FLOAT16X4 Accumulator =
Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadPartialFloat16x4(&Bias[ChannelOffset], c);
size_t ChannelKernelOffset = ChannelOffset;

for (size_t k = 0; k < KernelSize; k++) {
@@ -86,8 +87,7 @@
Output += c;
}
if (PostProc) {
PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels,
Channels);
PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels, Channels);
}
Input += KernelSize;
OutputCount -= 1;
@@ -101,16 +101,17 @@ void
MlasConvDepthwiseKernel(
const _mlas_fp16_* const* Input,
const _mlas_fp16_* Filter,
const _mlas_fp16_* Bias,
_mlas_fp16_* Output,
size_t Channels,
size_t OutputCount,
size_t KernelSize,
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
)
)
{
while (OutputCount > 0) {
for (size_t ChannelOffset = 0; ChannelOffset < Channels; ChannelOffset++) {
float Accumulator = 0.0f;
float Accumulator = Bias == nullptr ? 0.0f : MLAS_Half2Float(Bias[ChannelOffset]);
size_t ChannelKernelOffset = ChannelOffset;

for (size_t k = 0; k < KernelSize; k++) {
@@ -120,35 +121,36 @@ MlasConvDepthwiseKernel(
*Output++ = MLAS_Float2Half(Accumulator);
}
if (PostProc) {
PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels,
Channels);
PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels, Channels);
}
Input += KernelSize;
OutputCount -= 1;
}
}

#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED

#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED

void
MLASCALL
MlasConvDepthwise(
const MLAS_FP16* const* Input,
const MLAS_FP16* Filter,
const MLAS_FP16* Bias,
MLAS_FP16* Output,
size_t Channels,
size_t OutputCount,
size_t KernelSize,
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
)
)
{
MlasConvDepthwiseKernel(
reinterpret_cast<const _mlas_fp16_* const*>(Input),
reinterpret_cast<const _mlas_fp16_*>(Filter),
reinterpret_cast<const _mlas_fp16_*>(Bias),
reinterpret_cast<_mlas_fp16_*>(Output),
Channels,
OutputCount,
KernelSize,
PostProc);
PostProc
);
}
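Both the vectorized and the scalar kernels above compute the same per-channel accumulation. With `K = KernelSize`, `Filter[k][c]` denoting the flattened element `Filter[k * Channels + c]`, and `Input[pK + k]` the corresponding pointer in the indirection buffer, the result for output pixel `p` and channel `c` is:

$$
\mathrm{Output}[p][c] \;=\; \mathrm{Bias}[c] \;+\; \sum_{k=0}^{K-1} \mathrm{Input}[pK+k][c]\cdot \mathrm{Filter}[k][c]
$$

where `Bias[c]` is taken as zero when `Bias == nullptr`, preserving the previous behavior.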
17 changes: 17 additions & 0 deletions onnxruntime/core/mlas/lib/fp16_common.h
@@ -64,6 +64,23 @@ MLAS_FORCEINLINE
MLAS_FLOAT16X4
MlasLoadFloat16x4(const _mlas_fp16_* Buffer) { return vreinterpret_f16_u16(vld1_u16(Buffer)); }

MLAS_FORCEINLINE
MLAS_FLOAT16X4
MlasLoadPartialFloat16x4(const _mlas_fp16_* Buffer, size_t len)
{
MLAS_FLOAT16X4 Vector = MlasZeroFloat16x4();
if ((len & 1) != 0) {
Vector = vreinterpret_f16_u16(vld1_lane_u16(Buffer + (len - 1), vreinterpret_u16_f16(Vector), 0));
}
if ((len & 2) != 0) {
Vector = vreinterpret_f16_f32(vdup_lane_f32(vreinterpret_f32_f16(Vector), 0));
Vector = vreinterpret_f16_f32(
vld1_lane_f32(reinterpret_cast<const float*>(Buffer), vreinterpret_f32_f16(Vector), 0)
);
}
return Vector;
}

MLAS_FORCEINLINE
void
MlasStoreFloat16x8(_mlas_fp16_* Buffer, MLAS_FLOAT16X8 Vector)
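The new `MlasLoadPartialFloat16x4` handles the bias tail when fewer than four channels remain, without reading past the end of the buffer: lane `i` receives `Buffer[i]` for `i < len` (1–3) and the remaining lanes stay zero. A scalar model of the observable result (a hypothetical helper written for illustration only; `uint16_t` stands in for the raw fp16 storage type):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Scalar model of MlasLoadPartialFloat16x4: copy the first `len` (1..3) fp16
// values into the low lanes and leave the rest of the 4-lane vector zeroed.
void LoadPartialFp16x4Model(const uint16_t* buffer, size_t len, uint16_t out[4]) {
  std::memset(out, 0, 4 * sizeof(uint16_t));         // unfilled lanes stay zero
  std::memcpy(out, buffer, len * sizeof(uint16_t));  // lanes [0, len) take buffer[0..len-1]
}
```

In the NEON version above, the `len == 3` case achieves this with lane loads: the odd trailing element goes into lane 0 first, is duplicated into the upper 32-bit half, and the leading pair then overwrites the lower half.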
4 changes: 3 additions & 1 deletion onnxruntime/core/providers/cpu/fp16/fp16_conv.cc
@@ -139,8 +139,9 @@ Status FusedConvFp16::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr

bool share_prepacked_weights = (prepacked_weights != nullptr);

const bool is_depthwise_conv = (group_input_channels == 1 && group_output_channels == 1);
// Don't pack the filter buffer if the MlasConvDepthwise path is used.
if (!(group_input_channels == 1 && group_output_channels == 1)) {
if (!is_depthwise_conv) {
packed_W_size_ = MlasHalfGemmPackBSize(group_output_channels, kernel_dim, false);
if (packed_W_size_ != 0) {
size_t packed_W_data_size = SafeInt<size_t>(group_count) * packed_W_size_;
@@ -472,6 +473,7 @@ Status FusedConvFp16::Compute(OpKernelContext* context) const {
MlasConvDepthwise(
worker_indirection_buffer,
reordered_W,
Bdata,
worker_output,
static_cast<size_t>(M),
static_cast<size_t>(output_count),
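The `is_depthwise_conv` flag introduced above names the existing condition: when every group maps a single input channel to a single output channel, the filter skips half-GEMM packing because Compute dispatches such convolutions to `MlasConvDepthwise`, which now receives `Bdata` as the bias. A hedged restatement (the standalone helper is hypothetical, not part of the change):

```cpp
// Hypothetical helper mirroring the depthwise check in FusedConvFp16::PrePack.
bool IsDepthwiseConv(int64_t group_input_channels, int64_t group_output_channels) {
  // One input channel and one output channel per group => depthwise convolution,
  // handled by MlasConvDepthwise rather than the packed half-GEMM path.
  return group_input_channels == 1 && group_output_channels == 1;
}
```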
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/cpu/math/clip.cc
@@ -23,7 +23,7 @@ ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES(
float);
ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES(
kCpuExecutionProvider, kOnnxDomain, Clip, 12, Input, 0,
float, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t);
float, MLFloat16, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t);
} // namespace op_kernel_type_control

using EnabledClip11Types = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST(
18 changes: 18 additions & 0 deletions onnxruntime/test/providers/cpu/math/clip_test.cc
@@ -119,6 +119,24 @@ TEST(MathOpTest, Clip_Default_uint64) {
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
}

TEST(MathOpTest, Clip_MLFloat16) {
OpTester test("Clip", 12);

std::vector<int64_t> dims{3, 3};
test.AddInput<MLFloat16>("X", dims,
{MLFloat16(-1.0f), MLFloat16(-2.0f), MLFloat16(-3.0f),
MLFloat16(-4.0f), MLFloat16(0.0f), MLFloat16(2.0f),
MLFloat16(4.0f), MLFloat16(6.0f), MLFloat16(8.0f)});
test.AddInput<MLFloat16>("min", {}, {MLFloat16(0.0f)});
test.AddInput<MLFloat16>("max", {}, {MLFloat16(6.0f)});
test.AddOutput<MLFloat16>("Y", dims,
{MLFloat16(0.0f), MLFloat16(0.0f), MLFloat16(0.0f),
MLFloat16(0.0f), MLFloat16(0.0f), MLFloat16(2.0f),
MLFloat16(4.0f), MLFloat16(6.0f), MLFloat16(6.0f)});

test.Run();
}

TEST(MathOpTest, Clip_int32) {
OpTester test("Clip", 12);
