From e210c5cee1922c820643967d94f39cc30325ff23 Mon Sep 17 00:00:00 2001
From: chunhuanMeng <105194461+chunhuanMeng@users.noreply.github.com>
Date: Mon, 29 Jul 2024 16:14:51 +0800
Subject: [PATCH 1/2] Enable aten::smooth_l1_loss forward/backward (#621)

---
 src/ATen/native/xpu/Loss.cpp                 | 58 +++++++++++++++++++
 src/ATen/native/xpu/XPUFallback.template     |  2 -
 .../native/xpu/sycl/BinaryMiscOpsKernels.cpp | 26 +++++++++
 .../native/xpu/sycl/BinaryMiscOpsKernels.h   |  2 +
 .../native/xpu/sycl/PointwiseOpsKernels.cpp  | 34 +++++++++++
 .../native/xpu/sycl/PointwiseOpsKernels.h    |  2 +
 test/xpu/xpu_test_utils.py                   |  1 +
 yaml/xpu_functions.yaml                      |  3 +
 8 files changed, 126 insertions(+), 2 deletions(-)

diff --git a/src/ATen/native/xpu/Loss.cpp b/src/ATen/native/xpu/Loss.cpp
index f09f68b8a..050ff07b9 100644
--- a/src/ATen/native/xpu/Loss.cpp
+++ b/src/ATen/native/xpu/Loss.cpp
@@ -80,6 +80,64 @@ Tensor& XPUNativeFunctions::mse_loss_backward_out(
   return grad_input;
 }
 
+Tensor& XPUNativeFunctions::smooth_l1_loss_out(
+    const Tensor& input,
+    const Tensor& target,
+    int64_t reduction,
+    double beta,
+    Tensor& result) {
+  if (reduction != Reduction::None) {
+    TORCH_INTERNAL_ASSERT(
+        reduction == Reduction::Mean || reduction == Reduction::Sum);
+    result.resize_({});
+    Tensor loss;
+    auto iter = TensorIterator::borrowing_binary_op(loss, input, target);
+    native::xpu::smooth_l1_kernel(iter, beta);
+    if (reduction == Reduction::Mean) {
+      at::mean_out(const_cast<Tensor&>(result), iter.output(), IntArrayRef{});
+    } else {
+      at::sum_out(const_cast<Tensor&>(result), iter.output(), IntArrayRef{});
+    }
+  } else {
+    auto iter = TensorIterator::borrowing_binary_op(result, input, target);
+    native::xpu::smooth_l1_kernel(iter, beta);
+  }
+  return result;
+}
+
+Tensor XPUNativeFunctions::smooth_l1_loss(
+    const Tensor& input,
+    const Tensor& target,
+    int64_t reduction,
+    double beta) {
+  Tensor result = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  result = XPUNativeFunctions::smooth_l1_loss_out(
+      input, target, reduction, beta, result);
+  return result;
+}
+
+Tensor& XPUNativeFunctions::smooth_l1_loss_backward_out(
+    const Tensor& grad_output,
+    const Tensor& input,
+    const Tensor& target,
+    int64_t reduction,
+    double beta,
+    Tensor& grad_input) {
+  auto norm = reduction == Reduction::Mean ? 1. / input.numel() : 1.;
+  auto iter = at::TensorIteratorConfig()
+                  .add_output(grad_input)
+                  .add_const_input(input)
+                  .add_const_input(target)
+                  .add_const_input(grad_output)
+                  .promote_inputs_to_common_dtype(true)
+                  .cast_common_dtype_to_outputs(true)
+                  .enforce_safe_casting_to_output(true)
+                  .build();
+  native::xpu::smooth_l1_backward_kernel(iter, norm, beta);
+  return grad_input;
+}
+
 Tensor XPUNativeFunctions::binary_cross_entropy(
     const Tensor& self,
     const Tensor& target,
diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template
index 3f2064653..93321f23d 100644
--- a/src/ATen/native/xpu/XPUFallback.template
+++ b/src/ATen/native/xpu/XPUFallback.template
@@ -257,8 +257,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
       "signbit.out",
       "sign.out",
      "sinc.out",
-      "smooth_l1_loss_backward.grad_input",
-      "smooth_l1_loss.out",
      "special_airy_ai.out",
      "special_bessel_j0.out",
      "special_bessel_j1.out",
diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp
index 00c5398af..5ac71c163 100644
--- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp
+++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp
@@ -23,6 +23,32 @@ void mse_kernel(TensorIteratorBase& iter) {
       [&]() { gpu_kernel(iter, MSEFunctor<scalar_t>()); });
 }
 
+template <typename scalar_t>
+struct SmoothL1Functor {
+  scalar_t operator()(scalar_t input, scalar_t target) const {
+    auto z = std::abs(input - target);
+    return z < beta_val ? scalar_t(0.5) * z * z / beta_val
+                        : z - scalar_t(0.5) * beta_val;
+  }
+  SmoothL1Functor(scalar_t beta_val) : beta_val(beta_val) {}
+
+ private:
+  scalar_t beta_val;
+};
+
+void smooth_l1_kernel(TensorIteratorBase& iter, double beta) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      iter.dtype(),
+      "smooth_l1_xpu",
+      [&iter, beta]() {
+        scalar_t beta_val(beta);
+        SmoothL1Functor<scalar_t> f(beta_val);
+        gpu_kernel(iter, f);
+      });
+}
+
 template <typename scalar_t>
 struct HuberFunctor {
   scalar_t operator()(scalar_t a, scalar_t b) const {
diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h
index 94cfb7c90..17672ec29 100644
--- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h
+++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h
@@ -6,6 +6,8 @@ namespace at::native::xpu {
 
 void mse_kernel(TensorIteratorBase& iter);
 
+void smooth_l1_kernel(TensorIteratorBase& iter, double beta);
+
 void huber_kernel(TensorIterator& iter, double delta);
 
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp
index 5dc06e25a..822a83e99 100644
--- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp
+++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp
@@ -125,6 +125,40 @@ void mse_backward_kernel(TensorIterator& iter, const Scalar& value) {
   });
 }
 
+template <typename scalar_t>
+struct SmoothL1BackwardFunctor {
+  scalar_t operator()(scalar_t input, scalar_t target, scalar_t grad_output)
+      const {
+    const auto x = input - target;
+    if (x < -beta_val)
+      return -norm_val * grad_output;
+    else if (x > beta_val)
+      return norm_val * grad_output;
+    else
+      return norm_val * x * grad_output / beta_val;
+  }
+  SmoothL1BackwardFunctor(scalar_t norm_val, scalar_t beta_val)
+      : norm_val(norm_val), beta_val(beta_val) {}
+
+ private:
+  scalar_t norm_val;
+  scalar_t beta_val;
+};
+
+void smooth_l1_backward_kernel(TensorIterator& iter, Scalar norm, double beta) {
+  AT_DISPATCH_ALL_TYPES_AND2(
+      kHalf,
+      kBFloat16,
+      iter.dtype(),
+      "smooth_l1_backward_xpu",
+      [&iter, &norm, beta] {
+        auto norm_val = norm.to<scalar_t>();
+        scalar_t beta_val(beta);
+        SmoothL1BackwardFunctor<scalar_t> f(norm_val, beta_val);
+        gpu_kernel(iter, f);
+      });
+}
+
 template <typename scalar_t>
 struct HuberBackwardFunctor {
   scalar_t operator()(scalar_t input, scalar_t target, scalar_t grad_output)
diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h
index 586a64f3c..613c3cca6 100644
--- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h
+++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h
@@ -10,6 +10,8 @@ void addcdiv_kernel(TensorIterator& iter, Scalar value);
 
 void mse_backward_kernel(TensorIterator& iter, const Scalar& value);
 
+void smooth_l1_backward_kernel(TensorIterator& iter, Scalar norm, double beta);
+
 void huber_backward_kernel(
     TensorIterator& iter,
     const Scalar& norm,
diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py
index 2a36ddfb0..c281747f2 100644
--- a/test/xpu/xpu_test_utils.py
+++ b/test/xpu/xpu_test_utils.py
@@ -179,6 +179,7 @@
     "nn.functional.upsample_bilinear",
     "nn.functional.upsample_nearest",
     "nn.functional.nll_loss",
+    "nn.functional.smooth_l1_loss",
     "nn.functional.mse_loss",
     "nn.functional.binary_cross_entropy",
     "nn.functional.huber_loss",
diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml
index 77d77d4f0..fd087c7bc 100644
--- a/yaml/xpu_functions.yaml
+++ b/yaml/xpu_functions.yaml
@@ -308,6 +308,9 @@ supported:
   - bitwise_and.Tensor_out
   - bitwise_or.Tensor_out
   - bitwise_xor.Tensor_out
+  - smooth_l1_loss
+  - smooth_l1_loss.out
+  - smooth_l1_loss_backward.grad_input
  - bitwise_not.out
  - where.self_out
  - where.self
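
A minimal sketch (not part of the patch) of what SmoothL1Functor computes: 0.5*z*z/beta for z = |input - target| < beta, and z - 0.5*beta otherwise. It cross-checks that piecewise form against torch.nn.functional.smooth_l1_loss on CPU; on a torch-xpu-ops build, moving the tensors with .to("xpu") would exercise the new kernels instead.

    import torch
    import torch.nn.functional as F

    beta = 1.5
    x = torch.randn(64, dtype=torch.float64)
    t = torch.randn(64, dtype=torch.float64)

    # Piecewise definition implemented by SmoothL1Functor above.
    z = (x - t).abs()
    expected = torch.where(z < beta, 0.5 * z * z / beta, z - 0.5 * beta)

    assert torch.allclose(expected, F.smooth_l1_loss(x, t, beta=beta, reduction="none"))
    assert torch.allclose(expected.mean(), F.smooth_l1_loss(x, t, beta=beta, reduction="mean"))
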
From 36dfe230dea6a737fe260b072276cbcca3ca3f9a Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Tue, 30 Jul 2024 08:40:20 +0800
Subject: [PATCH 2/2] Add aten::polar and its variants (#606)

Co-authored-by: yucai
Co-authored-by: Feng Yuan
---
 src/ATen/native/xpu/TensorFactories.cpp     | 15 +++++++++++++++
 src/ATen/native/xpu/XPUFallback.template    |  1 -
 src/ATen/native/xpu/sycl/ComplexKernels.cpp | 14 ++++++++++++++
 src/ATen/native/xpu/sycl/ComplexKernels.h   |  2 ++
 test/xpu/extended/run_test_with_skip.py     |  4 ++++
 test/xpu/run_test_with_skip.py              | 12 ++++++++++--
 test/xpu/xpu_test_utils.py                  |  1 +
 yaml/xpu_functions.yaml                     |  1 +
 8 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/src/ATen/native/xpu/TensorFactories.cpp b/src/ATen/native/xpu/TensorFactories.cpp
index 110590958..44da487f7 100644
--- a/src/ATen/native/xpu/TensorFactories.cpp
+++ b/src/ATen/native/xpu/TensorFactories.cpp
@@ -151,6 +151,21 @@ Tensor& XPUNativeFunctions::complex_out(
   return result;
 }
 
+Tensor& XPUNativeFunctions::polar_out(
+    const Tensor& abs,
+    const Tensor& angle,
+    Tensor& result) {
+  complex_check_dtype(result, abs, angle);
+  auto iter = TensorIteratorConfig()
+                  .add_output(result)
+                  .add_const_input(abs)
+                  .add_const_input(angle)
+                  .check_all_same_dtype(false)
+                  .build();
+  native::xpu::polar_kernel(iter);
+  return result;
+}
+
 Tensor& XPUNativeFunctions::randperm_out(
     int64_t n,
     c10::optional<Generator> generator,
diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template
index 93321f23d..4a4c96828 100644
--- a/src/ATen/native/xpu/XPUFallback.template
+++ b/src/ATen/native/xpu/XPUFallback.template
@@ -240,7 +240,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
      "ormqr",
      "_pdist_backward",
      "_pdist_forward",
-      "polar.out",
      "_prelu_kernel",
      "_prelu_kernel_backward",
      "prod",
diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.cpp b/src/ATen/native/xpu/sycl/ComplexKernels.cpp
index 56b25d0ef..87504bd5e 100644
--- a/src/ATen/native/xpu/sycl/ComplexKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ComplexKernels.cpp
@@ -21,4 +21,18 @@ void complex_kernel(TensorIterator& iter) {
   });
 }
 
+template <typename scalar_t>
+struct PolarFunctor {
+  c10::complex<scalar_t> operator()(scalar_t a, scalar_t b) const {
+    return c10::complex<scalar_t>(a * std::cos(b), a * std::sin(b));
+  }
+};
+
+void polar_kernel(TensorIterator& iter) {
+  AT_DISPATCH_FLOATING_TYPES(iter.input_dtype(0), "polar_xpu", [&]() {
+    PolarFunctor<scalar_t> f;
+    gpu_kernel(iter, f);
+  });
+}
+
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.h b/src/ATen/native/xpu/sycl/ComplexKernels.h
index 990bcd14e..d51556b4f 100644
--- a/src/ATen/native/xpu/sycl/ComplexKernels.h
+++ b/src/ATen/native/xpu/sycl/ComplexKernels.h
@@ -6,4 +6,6 @@ namespace at::native::xpu {
 
 void complex_kernel(TensorIterator& iter);
 
+void polar_kernel(TensorIterator& iter);
+
 } // namespace at::native::xpu
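
A quick sketch (again not part of the patch) of the identity PolarFunctor implements: torch.polar builds a complex tensor from magnitude and phase, abs * (cos(angle) + i*sin(angle)). The check below runs on CPU; on an XPU build the same call dispatches to the new polar_kernel.

    import torch

    r = torch.rand(8, dtype=torch.float64) + 0.1   # magnitudes (non-negative)
    theta = torch.randn(8, dtype=torch.float64)    # phases in radians

    out = torch.polar(r, theta)                    # complex128 result
    ref = torch.complex(r * torch.cos(theta), r * torch.sin(theta))
    assert torch.allclose(out, ref)
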
+ "test_dtypes_polar_xpu", + # implemented aten::histogram to align MPS operators coverage, CUDA doesn't support # but test_dtypes infrastructure leverage CUDA supported datatypes "test_dtypes_histogram_xpu", @@ -3016,8 +3020,12 @@ def launch_test(test_case, skip_list=None, exe_list=None): res += launch_test("nn/test_load_state_dict_xpu.py") # test_module_hooks - -res += launch_test("nn/test_module_hooks_xpu.py") +skip_list = ( + # TypeError: TestStateDictHooks.test_register_state_dict_post_hook() missing 1 required positional argument: 'private' + # https://github.com/intel/torch-xpu-ops/issues/658 + "test_register_state_dict_post_hook", +) +res += launch_test("nn/test_module_hooks_xpu.py", skip_list) # test_parametrization diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index c281747f2..823988488 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -208,6 +208,7 @@ "unique", "multinomial", "lerp", + "polar", "frac", "aminmax", "argmin", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index fd087c7bc..9d453d215 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -268,6 +268,7 @@ supported: - eye.m_out - _efficientzerotensor - complex.out + - polar.out - clone - fill_.Scalar - fill_.Tensor