diff --git a/src/ATen/native/xpu/Loss.cpp b/src/ATen/native/xpu/Loss.cpp
index f09f68b8a..050ff07b9 100644
--- a/src/ATen/native/xpu/Loss.cpp
+++ b/src/ATen/native/xpu/Loss.cpp
@@ -80,6 +80,64 @@ Tensor& XPUNativeFunctions::mse_loss_backward_out(
   return grad_input;
 }
 
+Tensor& XPUNativeFunctions::smooth_l1_loss_out(
+    const Tensor& input,
+    const Tensor& target,
+    int64_t reduction,
+    double beta,
+    Tensor& result) {
+  if (reduction != Reduction::None) {
+    TORCH_INTERNAL_ASSERT(
+        reduction == Reduction::Mean || reduction == Reduction::Sum);
+    result.resize_({});
+    Tensor loss;
+    auto iter = TensorIterator::borrowing_binary_op(loss, input, target);
+    native::xpu::smooth_l1_kernel(iter, beta);
+    if (reduction == Reduction::Mean) {
+      at::mean_out(const_cast<Tensor&>(result), iter.output(), IntArrayRef{});
+    } else {
+      at::sum_out(const_cast<Tensor&>(result), iter.output(), IntArrayRef{});
+    }
+  } else {
+    auto iter = TensorIterator::borrowing_binary_op(result, input, target);
+    native::xpu::smooth_l1_kernel(iter, beta);
+  }
+  return result;
+}
+
+Tensor XPUNativeFunctions::smooth_l1_loss(
+    const Tensor& input,
+    const Tensor& target,
+    int64_t reduction,
+    double beta) {
+  Tensor result = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  result = XPUNativeFunctions::smooth_l1_loss_out(
+      input, target, reduction, beta, result);
+  return result;
+}
+
+Tensor& XPUNativeFunctions::smooth_l1_loss_backward_out(
+    const Tensor& grad_output,
+    const Tensor& input,
+    const Tensor& target,
+    int64_t reduction,
+    double beta,
+    Tensor& grad_input) {
+  auto norm = reduction == Reduction::Mean ? 1. / input.numel() : 1.;
+  auto iter = at::TensorIteratorConfig()
+                  .add_output(grad_input)
+                  .add_const_input(input)
+                  .add_const_input(target)
+                  .add_const_input(grad_output)
+                  .promote_inputs_to_common_dtype(true)
+                  .cast_common_dtype_to_outputs(true)
+                  .enforce_safe_casting_to_output(true)
+                  .build();
+  native::xpu::smooth_l1_backward_kernel(iter, norm, beta);
+  return grad_input;
+}
+
 Tensor XPUNativeFunctions::binary_cross_entropy(
     const Tensor& self,
     const Tensor& target,
diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template
index 45e4db303..4a4c96828 100644
--- a/src/ATen/native/xpu/XPUFallback.template
+++ b/src/ATen/native/xpu/XPUFallback.template
@@ -256,8 +256,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
       "signbit.out",
       "sign.out",
       "sinc.out",
-      "smooth_l1_loss_backward.grad_input",
-      "smooth_l1_loss.out",
      "special_airy_ai.out",
      "special_bessel_j0.out",
      "special_bessel_j1.out",
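Note on the forward path: with a Mean or Sum reduction, smooth_l1_loss_out writes the elementwise loss into a temporary and then reduces it via at::mean_out / at::sum_out. A minimal Python sketch of the intended semantics, using only public torch ops (the helper name smooth_l1_reference is illustrative, and beta > 0 is assumed):

```python
import torch

def smooth_l1_reference(input, target, reduction="mean", beta=1.0):
    # Elementwise smooth L1: quadratic for |x| < beta, linear beyond,
    # continuous at |x| == beta -- the same piecewise rule as SmoothL1Functor.
    z = (input - target).abs()
    loss = torch.where(z < beta, 0.5 * z * z / beta, z - 0.5 * beta)
    # smooth_l1_loss_out applies the same reduction to the elementwise result.
    if reduction == "mean":
        return loss.mean()
    if reduction == "sum":
        return loss.sum()
    return loss
```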
diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp
index 00c5398af..5ac71c163 100644
--- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp
+++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp
@@ -23,6 +23,32 @@ void mse_kernel(TensorIteratorBase& iter) {
       [&]() { gpu_kernel(iter, MSEFunctor<scalar_t>()); });
 }
 
+template <typename scalar_t>
+struct SmoothL1Functor {
+  scalar_t operator()(scalar_t input, scalar_t target) const {
+    auto z = std::abs(input - target);
+    return z < beta_val ? scalar_t(0.5) * z * z / beta_val
+                        : z - scalar_t(0.5) * beta_val;
+  }
+  SmoothL1Functor(scalar_t beta_val) : beta_val(beta_val) {}
+
+ private:
+  scalar_t beta_val;
+};
+
+void smooth_l1_kernel(TensorIteratorBase& iter, double beta) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      iter.dtype(),
+      "smooth_l1_xpu",
+      [&iter, beta]() {
+        scalar_t beta_val(beta);
+        SmoothL1Functor<scalar_t> f(beta_val);
+        gpu_kernel(iter, f);
+      });
+}
+
 template <typename scalar_t>
 struct HuberFunctor {
   scalar_t operator()(scalar_t a, scalar_t b) const {
diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h
index 94cfb7c90..17672ec29 100644
--- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h
+++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h
@@ -6,6 +6,8 @@ namespace at::native::xpu {
 
 void mse_kernel(TensorIteratorBase& iter);
 
+void smooth_l1_kernel(TensorIteratorBase& iter, double beta);
+
 void huber_kernel(TensorIterator& iter, double delta);
 
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp
index 5dc06e25a..822a83e99 100644
--- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp
+++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp
@@ -125,6 +125,40 @@ void mse_backward_kernel(TensorIterator& iter, const Scalar& value) {
       });
 }
 
+template <typename scalar_t>
+struct SmoothL1BackwardFunctor {
+  scalar_t operator()(scalar_t input, scalar_t target, scalar_t grad_output)
+      const {
+    const auto x = input - target;
+    if (x < -beta_val)
+      return -norm_val * grad_output;
+    else if (x > beta_val)
+      return norm_val * grad_output;
+    else
+      return norm_val * x * grad_output / beta_val;
+  }
+  SmoothL1BackwardFunctor(scalar_t norm_val, scalar_t beta_val)
+      : norm_val(norm_val), beta_val(beta_val) {}
+
+ private:
+  scalar_t norm_val;
+  scalar_t beta_val;
+};
+
+void smooth_l1_backward_kernel(TensorIterator& iter, Scalar norm, double beta) {
+  AT_DISPATCH_ALL_TYPES_AND2(
+      kHalf,
+      kBFloat16,
+      iter.dtype(),
+      "smooth_l1_backward_xpu",
+      [&iter, &norm, beta] {
+        auto norm_val = norm.to<scalar_t>();
+        scalar_t beta_val(beta);
+        SmoothL1BackwardFunctor<scalar_t> f(norm_val, beta_val);
+        gpu_kernel(iter, f);
+      });
+}
+
 template <typename scalar_t>
 struct HuberBackwardFunctor {
   scalar_t operator()(scalar_t input, scalar_t target, scalar_t grad_output)
diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h
index 586a64f3c..613c3cca6 100644
--- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h
+++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h
@@ -10,6 +10,8 @@ void addcdiv_kernel(TensorIterator& iter, Scalar value);
 
 void mse_backward_kernel(TensorIterator& iter, const Scalar& value);
 
+void smooth_l1_backward_kernel(TensorIterator& iter, Scalar norm, double beta);
+
 void huber_backward_kernel(
     TensorIterator& iter,
     const Scalar& norm,
diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py
index ae04f241b..823988488 100644
--- a/test/xpu/xpu_test_utils.py
+++ b/test/xpu/xpu_test_utils.py
@@ -179,6 +179,7 @@
     "nn.functional.upsample_bilinear",
     "nn.functional.upsample_nearest",
     "nn.functional.nll_loss",
+    "nn.functional.smooth_l1_loss",
     "nn.functional.mse_loss",
     "nn.functional.binary_cross_entropy",
     "nn.functional.huber_loss",
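Note on the backward path: SmoothL1BackwardFunctor is the piecewise derivative of the loss above, -1 below -beta, +1 above beta, and x / beta in between, scaled by the reduction norm (1 / numel() for Mean) and by grad_output. That piecewise rule is equivalent to clamp(x / beta, -1, 1). A sketch under the same assumptions as before (illustrative helper name, beta > 0):

```python
import torch

def smooth_l1_backward_reference(grad_output, input, target,
                                 reduction="mean", beta=1.0):
    # clamp(x / beta, -1, 1) reproduces the three branches of the functor.
    x = input - target
    norm = 1.0 / input.numel() if reduction == "mean" else 1.0
    return norm * torch.clamp(x / beta, -1.0, 1.0) * grad_output
```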
diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml
index f5ecda76a..9d453d215 100644
--- a/yaml/xpu_functions.yaml
+++ b/yaml/xpu_functions.yaml
@@ -309,6 +309,9 @@ supported:
   - bitwise_and.Tensor_out
   - bitwise_or.Tensor_out
   - bitwise_xor.Tensor_out
+  - smooth_l1_loss
+  - smooth_l1_loss.out
+  - smooth_l1_loss_backward.grad_input
   - bitwise_not.out
   - where.self_out
   - where.self
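With the ops registered in xpu_functions.yaml and removed from the fallback list, smooth_l1_loss should dispatch to the new XPU kernels rather than falling back to CPU. A quick parity check one might run (assumes a PyTorch build with XPU support and an available "xpu" device):

```python
import torch
import torch.nn.functional as F

x = torch.randn(8, 16, requires_grad=True)
y = torch.randn(8, 16)
x_xpu = x.detach().to("xpu").requires_grad_()

loss_cpu = F.smooth_l1_loss(x, y, beta=0.5)
loss_xpu = F.smooth_l1_loss(x_xpu, y.to("xpu"), beta=0.5)
loss_cpu.backward()
loss_xpu.backward()

# Forward and backward should agree with the CPU reference implementation.
torch.testing.assert_close(loss_xpu.cpu(), loss_cpu)
torch.testing.assert_close(x_xpu.grad.cpu(), x.grad)
```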