diff --git a/src/ATen/native/xpu/UnaryOps.cpp b/src/ATen/native/xpu/UnaryOps.cpp index 4a27034dc..f79442bc2 100644 --- a/src/ATen/native/xpu/UnaryOps.cpp +++ b/src/ATen/native/xpu/UnaryOps.cpp @@ -530,6 +530,71 @@ Tensor& XPUNativeFunctions::sigmoid_out(const Tensor& self, Tensor& out) { return out; } +Tensor XPUNativeFunctions::sign(const Tensor& self) { + TORCH_CHECK( + !self.is_complex(), + "Unlike NumPy, torch.sign is not intended to support complex numbers. Please use torch.sgn instead."); + Tensor out; + TensorIterator iter; + iter.build_borrowing_unary_op(out, self); + native::xpu::sign_kernel(iter); + return iter.output(); +} + +Tensor& XPUNativeFunctions::sign_(Tensor& self) { + TORCH_CHECK( + !self.is_complex(), + "Unlike NumPy, torch.sign is not intended to support complex numbers. Please use torch.sgn instead."); + TensorIterator iter; + iter.build_borrowing_unary_op(self, self); + native::xpu::sign_kernel(iter); + return self; +} + +Tensor& XPUNativeFunctions::sign_out(const Tensor& self, Tensor& out) { + TORCH_CHECK( + !self.is_complex(), + "Unlike NumPy, torch.sign is not intended to support complex numbers. Please use torch.sgn instead."); + TensorIterator iter; + iter.build_borrowing_unary_op(out, self); + native::xpu::sign_kernel(iter); + return out; +} + +Tensor XPUNativeFunctions::signbit(const Tensor& self) { + TORCH_CHECK( + !self.is_complex(), "signbit is not implemented for complex tensors."); + + Tensor out; + TensorIterator iter; + iter.build_borrowing_unary_force_boolean_op(out, self); + + if (self.dtype() == at::kBool) { + iter.output().fill_(false); + } else { + native::xpu::signbit_kernel(iter); + } + return iter.output(); +} + +Tensor& XPUNativeFunctions::signbit_out(const Tensor& self, Tensor& out) { + TORCH_CHECK( + !self.is_complex(), "signbit is not implemented for complex tensors."); + TORCH_CHECK( + out.dtype() == at::kBool, + "signbit does not support non-boolean outputs."); + + TensorIterator iter; + iter.build_borrowing_unary_force_boolean_op(out, self); + + if (self.dtype() == at::kBool) { + out.fill_(false); + } else { + native::xpu::signbit_kernel(iter); + } + return out; +} + Tensor& XPUNativeFunctions::logit_out( const Tensor& self, std::optional eps, diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 3445f5e42..5b2d6e5ff 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -245,8 +245,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "_scaled_mm", "segment_reduce", "_segment_reduce_backward", - "signbit.out", - "sign.out", "sinc.out", "special_airy_ai.out", "special_bessel_j0.out", diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index 10f7f0eec..25acd873a 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -263,7 +263,8 @@ static inline void group_reduce( // uint32_t SIMD = sg.get_local_range()[0]; #pragma unroll for (int i = 1; i < SIMD; i <<= 1) { - val = bin_op(val, static_cast(sg.shuffle_down(val, i))); + val = bin_op( + val, static_cast(sycl::shift_group_left(sg, val, i))); } if (sub_group_num == 1) { if (lane_id == 0) { @@ -294,7 +295,8 @@ static inline void group_reduce( } #pragma unroll for (int i = 1; i < SIMD; i <<= 1) { - val = bin_op(val, static_cast(sg.shuffle_down(val, i))); + val = bin_op( + val, static_cast(sycl::shift_group_left(sg, val, i))); if (i >= ((sub_group_num + 1) >> 1)) break; } @@ -450,10 +452,10 @@ struct 
BatchNormCollectStatisticsKernelFunctor // one value per subgroup #pragma unroll for (int i = 1; i < SIMD; i <<= 1) { - stat_accscalar_t o_avg = sg.shuffle_xor(avg, i); - int o_n = sg.shuffle_xor(n, i); + stat_accscalar_t o_avg = sycl::permute_group_by_xor(sg, avg, i); + int o_n = sycl::permute_group_by_xor(sg, n, i); stat_accscalar_t factor = 1.0 / fmaxf(1.0, n + o_n); - var_n += sg.shuffle_xor(var_n, i) + + var_n += sycl::permute_group_by_xor(sg, var_n, i) + (avg - o_avg) * (avg - o_avg) * n * o_n * factor; avg = (n * avg + o_n * o_avg) * factor; n += o_n; @@ -481,10 +483,10 @@ struct BatchNormCollectStatisticsKernelFunctor } #pragma unroll for (int i = 1; i < SIMD; i <<= 1) { - stat_accscalar_t o_avg = sg.shuffle_xor(avg, i); - int o_n = sg.shuffle_xor(n, i); + stat_accscalar_t o_avg = sycl::permute_group_by_xor(sg, avg, i); + int o_n = sycl::permute_group_by_xor(sg, n, i); stat_accscalar_t factor = 1.0f / fmaxf(1.0f, n + o_n); - var_n += sg.shuffle_xor(var_n, i) + + var_n += sycl::permute_group_by_xor(sg, var_n, i) + (avg - o_avg) * (avg - o_avg) * n * o_n * factor; avg = (n * avg + o_n * o_avg) * factor; n += o_n; diff --git a/src/ATen/native/xpu/sycl/DistanceKernels.cpp b/src/ATen/native/xpu/sycl/DistanceKernels.cpp index 8bd61bdd3..eb0f1f50e 100644 --- a/src/ATen/native/xpu/sycl/DistanceKernels.cpp +++ b/src/ATen/native/xpu/sycl/DistanceKernels.cpp @@ -120,7 +120,7 @@ scalar_t subgroup_reduce_agg_without_broadcast_impl( #pragma unroll for (int offset = (SG_SIZE >> 1); offset > 0; offset >>= 1) { - F::agg(value, sg.shuffle_down(value, offset)); + F::agg(value, sycl::shift_group_left(sg, value, offset)); } return value; } diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index 622e99ffe..8dafdc8d2 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -3,10 +3,10 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -18,23 +18,23 @@ template < typename index_t, typename res_t> struct WelfordOpsXPU - : public at::native::WelfordOps { + : public WelfordOps { sycl::nd_item<1>& item; public: - using acc_t = typename at::native:: - WelfordOps::acc_t; + using acc_t = + typename WelfordOps::acc_t; inline acc_t shfl_down(acc_t acc, int offset) const { auto sg = item.get_sub_group(); return { - sg.shuffle_down(acc.mean, offset), - sg.shuffle_down(acc.m2, offset), - sg.shuffle_down(acc.n, offset), - sg.shuffle_down(acc.nf, offset)}; + sycl::shift_group_left(sg, acc.mean, offset), + sycl::shift_group_left(sg, acc.m2, offset), + sycl::shift_group_left(sg, acc.n, offset), + sycl::shift_group_left(sg, acc.nf, offset)}; } WelfordOpsXPU(acc_scalar_t correction, bool take_sqrt, sycl::nd_item<1>& item) - : at::native::WelfordOps( + : WelfordOps( correction, take_sqrt), item(item) {} @@ -43,7 +43,7 @@ struct WelfordOpsXPU template struct GNRowwiseMomentsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using T_ACC = acc_type_device; - using WelfordType = at::native::WelfordData; + using WelfordType = WelfordData; using WelfordOp = WelfordOpsXPU>; diff --git a/src/ATen/native/xpu/sycl/Norm.h b/src/ATen/native/xpu/sycl/Norm.h index 9aee941cb..36d1282a3 100644 --- a/src/ATen/native/xpu/sycl/Norm.h +++ b/src/ATen/native/xpu/sycl/Norm.h @@ -39,8 +39,10 @@ static inline void norm_group_reduce( // uint32_t SIMD = sg.get_local_range()[0]; #pragma unroll for (int i = 1; i < SIMD; i <<= 1) { - sum1 = bin_op(sum1, static_cast(sg.shuffle_down(sum1, i))); - 
sum2 = bin_op(sum2, static_cast(sg.shuffle_down(sum2, i))); + sum1 = bin_op( + sum1, static_cast(sycl::shift_group_left(sg, sum1, i))); + sum2 = bin_op( + sum2, static_cast(sycl::shift_group_left(sg, sum2, i))); } if (sub_group_num == 1) { sum1 = sycl::group_broadcast(sg, sum1, 0); @@ -73,8 +75,10 @@ static inline void norm_group_reduce( } #pragma unroll for (int i = 1; i < SIMD; i <<= 1) { - sum1 = bin_op(sum1, static_cast(sg.shuffle_down(sum1, i))); - sum2 = bin_op(sum2, static_cast(sg.shuffle_down(sum2, i))); + sum1 = bin_op( + sum1, static_cast(sycl::shift_group_left(sg, sum1, i))); + sum2 = bin_op( + sum2, static_cast(sycl::shift_group_left(sg, sum2, i))); if (i >= ((sub_group_num + 1) >> 1)) break; } diff --git a/src/ATen/native/xpu/sycl/Reduce.h b/src/ATen/native/xpu/sycl/Reduce.h index 1be3a5e93..276b21175 100644 --- a/src/ATen/native/xpu/sycl/Reduce.h +++ b/src/ATen/native/xpu/sycl/Reduce.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -50,7 +51,7 @@ inline at::detail::Array group_reduce( for (int offset = 1; offset < sg_size; offset <<= 1) { #pragma unroll(out_vec_sz) for (int i = 0; i < out_vec_sz; ++i) { - arg_t other = sg.shuffle_down(value[i], offset); + arg_t other = sycl::shift_group_left(sg, value[i], offset); value[i] = combine(value[i], other); } } @@ -71,7 +72,7 @@ inline at::detail::Array group_reduce( for (int offset = 1; offset < sg_range; offset <<= 1) { #pragma unroll(out_vec_sz) for (int i = 0; i < out_vec_sz; ++i) { - arg_t other = sg.shuffle_down(value[i], offset); + arg_t other = sycl::shift_group_left(sg, value[i], offset); value[i] = combine(value[i], other); } } @@ -132,7 +133,7 @@ inline at::detail::Array group_x_reduce( for (int offset = 1; offset < dim_x; offset <<= 1) { #pragma unroll(out_vec_sz) for (int i = 0; i < out_vec_sz; ++i) { - arg_t other = sg.shuffle_down(value[i], offset); + arg_t other = sycl::shift_group_left(sg, value[i], offset); value[i] = combine(value[i], other); } } @@ -541,11 +542,11 @@ struct ReduceOp { (const scalar_t*)((const char*)src + base_offsets1); value = item_reduce(pos, input_slice); } - // TODO: Currently, there are bugs with shuffle_down when the arg_t is a - // pair for half dtype, We temporarily workaround to do + // TODO: Currently, there are bugs with sycl::shift_group_left when the + // arg_t is a pair for half dtype, We temporarily workaround to do // "reduce_for_compound_dtype" function. constexpr bool is_pair = - std::is_same, arg_t>::value; + std::is_same, arg_t>::value; auto combine = [=](arg1_t value, arg2_t other) -> arg1_t { return ops.combine(value, other); @@ -832,8 +833,8 @@ struct ReduceOp { return value_list[0]; } - // TODO: Currently, there are bugs with shuffle_down when the arg_t is a - // pair with half dtype, We temporarily workaround to do + // TODO: Currently, there are bugs with sycl::shift_group_left when the arg_t + // is a pair with half dtype, We temporarily workaround to do // "reduce_for_compound_dtype" function. 
template at::detail::Array group_reduce_for_compound_dtype( @@ -850,7 +851,7 @@ struct ReduceOp { for (int offset = 1; offset < (int)sbgrpSize; offset <<= 1) { #pragma unroll(output_vec_size) for (int i = 0; i < output_vec_size; ++i) { - arg_t other = sg.shuffle_down(value[i], offset); + arg_t other = sycl::shift_group_left(sg, value[i], offset); value[i] = ops.combine(value[i], other); } } @@ -875,12 +876,13 @@ struct ReduceOp { #pragma unroll(output_vec_size) for (int i = 0; i < output_vec_size; ++i) { // Shuffle down separately for first and second pair. - std::pair - other = std::pair< - typename arg_t::first_type, - typename arg_t::second_type>( - sg.shuffle_down(value[i].first, offset), - sg.shuffle_down(value[i].second, offset)); + at::xpu:: + pair + other = at::xpu::pair< + typename arg_t::first_type, + typename arg_t::second_type>( + sycl::shift_group_left(sg, value[i].first, offset), + sycl::shift_group_left(sg, value[i].second, offset)); value[i] = ops.combine(value[i], other); } } @@ -907,8 +909,8 @@ struct ReduceOp { return value; } - // TODO: Currently, there are bugs with shuffle_down when the arg_t is a - // pair for half dtype, We temporarily workaround to do + // TODO: Currently, there are bugs with sycl::shift_group_left when the arg_t + // is a pair for half dtype, We temporarily workaround to do // "reduce_for_compound_dtype" function. template at::detail::Array group_x_reduce_for_compound_dtype( @@ -947,11 +949,11 @@ struct ReduceOp { for (int offset = 1; offset < dim_x; offset <<= 1) { #pragma unroll(output_vec_size) for (int i = 0; i < output_vec_size; ++i) { - std::pair - other = std:: + at::xpu::pair + other = xpu:: pair( - sg.shuffle_down(value[i].first, offset), - sg.shuffle_down(value[i].second, offset)); + sycl::shift_group_left(sg, value[i].first, offset), + sycl::shift_group_left(sg, value[i].second, offset)); value[i] = ops.combine(value[i], other); } } @@ -1028,7 +1030,8 @@ struct ReduceOp { // Currently implemented for max of two outputs template - void set_results(const std::pair x, const index_t base_offset) const { + void set_results(const at::xpu::pair x, const index_t base_offset) + const { if (noutputs >= 1) { auto res0 = (T1*)((char*)dst[0] + base_offset); *res0 = x.first; @@ -1121,9 +1124,10 @@ struct ReduceOp { decltype(combine), output_vec_size>(pos, shared_memory, value, combine); if (config.should_group_x_reduce()) { - // TODO: workaround because sg.shuffle_down will fail on `half` dtype. + // TODO: workaround because sycl::shift_group_left will fail on `half` + // dtype. 
constexpr bool is_pair = - std::is_same, arg_t>::value; + std::is_same, arg_t>::value; if constexpr (is_pair) { value = group_x_reduce_for_compound_dtype( pos, value, shared_memory); diff --git a/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp b/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp index 04c847a4d..121761053 100644 --- a/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp @@ -1,9 +1,9 @@ #include #include -#include #include #include #include +#include namespace at::native::xpu { @@ -12,7 +12,7 @@ void _min_max_values_kernel_xpu_impl(TensorIterator& iter) { gpu_reduce_kernel( iter, MinMaxOps{}, - std::pair( + at::xpu::pair( at::numeric_limits::upper_bound(), at::numeric_limits::lower_bound())); } diff --git a/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp b/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp index c41ad0c04..1e18b2e5b 100644 --- a/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp @@ -1,9 +1,9 @@ #include #include -#include #include #include #include +#include namespace at { namespace native { @@ -14,7 +14,8 @@ void argmax_kernel_impl(TensorIterator& iter) { gpu_reduce_kernel( iter, ArgMaxOps{}, - std::pair(at::numeric_limits::lower_bound(), 0)); + at::xpu::pair( + at::numeric_limits::lower_bound(), 0)); }; void argmax_kernel(TensorIterator& iter) { diff --git a/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp b/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp index 2f6c38152..3c9f8453d 100644 --- a/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp @@ -1,9 +1,9 @@ #include #include -#include #include #include #include +#include namespace at::native::xpu { @@ -12,7 +12,8 @@ void argmin_kernel_impl(TensorIterator& iter) { gpu_reduce_kernel( iter, ArgMinOps{}, - std::pair(at::numeric_limits::upper_bound(), 0)); + at::xpu::pair( + at::numeric_limits::upper_bound(), 0)); }; void argmin_kernel(TensorIterator& iter) { diff --git a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp index 0dfa7fd52..16095056c 100644 --- a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp @@ -1,7 +1,7 @@ #include #include -#include #include +#include #include #include @@ -38,7 +38,7 @@ void max_kernel(TensorIterator& iter) { gpu_reduce_kernel( iter, MaxOps{}, - std::pair( + at::xpu::pair( at::numeric_limits::lower_bound(), 0)); }); } diff --git a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp index fff7207c6..2a0ce889c 100644 --- a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp @@ -1,6 +1,6 @@ #include -#include #include +#include #include #include @@ -37,7 +37,7 @@ void min_kernel(TensorIterator& iter) { gpu_reduce_kernel( iter, MinOps{}, - std::pair( + at::xpu::pair( at::numeric_limits::upper_bound(), 0)); }); } diff --git a/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp index 54ae01273..6d0e75680 100644 --- a/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp @@ -1,9 +1,9 @@ #include #include -#include #include #include #include +#include namespace at { namespace native { @@ -18,7 +18,7 @@ void std_var_template( // This is necessary to lower register usage that leads to register spills. 
using accscalar_t = at::acc_type_device; using ops_t = - WelfordOps>; + WelfordOps>; ops_t ops(static_cast(correction_opt), take_sqrt); gpu_reduce_kernel(iter, ops, typename ops_t::acc_t{}); } diff --git a/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp b/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp index 4aac0cceb..658f2e21b 100644 --- a/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include diff --git a/src/ATen/native/xpu/sycl/SYCLGroupAlgorithm.h b/src/ATen/native/xpu/sycl/SYCLGroupAlgorithm.h index 35f1d54a5..8729f31dd 100644 --- a/src/ATen/native/xpu/sycl/SYCLGroupAlgorithm.h +++ b/src/ATen/native/xpu/sycl/SYCLGroupAlgorithm.h @@ -6,7 +6,7 @@ template inline T GroupReduceSumSGSizeEqualstoNumSG(sg_t& sg, T val) { auto sg_size = sg.get_local_range()[0]; for (int offset = (sg_size >> 1); offset > 0; offset >>= 1) { - val += sg.shuffle_down(val, offset); + val += sycl::shift_group_left(sg, val, offset); } return val; } diff --git a/src/ATen/native/xpu/sycl/SharedReduceOps.h b/src/ATen/native/xpu/sycl/SharedReduceOps.h new file mode 100644 index 000000000..0e63cc6ed --- /dev/null +++ b/src/ATen/native/xpu/sycl/SharedReduceOps.h @@ -0,0 +1,416 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX(X, Y) max_impl(X, Y) +#define MIN(X, Y) min_impl(X, Y) + +#define device_sqrt std::sqrt +#define compat_pow std::pow + +namespace at { +namespace native { +namespace xpu { + +template +struct WelfordData { + scalar_t mean; + scalar_t m2; + index_t n; + scalar_t nf; + + WelfordData() : mean(0), m2(0), n(0), nf(0) {} + + WelfordData(scalar_t mean, scalar_t m2, index_t n, scalar_t nf) + : mean(mean), m2(m2), n(n), nf(nf) {} +}; + +template < + typename scalar_t, + typename acc_scalar_t, + typename index_t, + typename res_t> +struct WelfordOps { + acc_scalar_t correction; + bool take_sqrt; + + public: + using acc_t = WelfordData; + inline acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const { + // We accumulate n in index_t to avoid cumulative rounding error, but still + // need nf for use in combine where int32 may overflow. + index_t new_n = acc.n + 1; + acc_scalar_t new_nf = static_cast(new_n); + acc_scalar_t delta = data - acc.mean; + acc_scalar_t new_mean = acc.mean + delta / new_nf; + acc_scalar_t new_delta = data - new_mean; + return { + new_mean, + acc.m2 + delta * new_delta, + new_n, + new_nf, + }; + } + inline acc_t combine(acc_t a, acc_t b) const { + if (a.nf == 0) { + return b; + } + if (b.nf == 0) { + return a; + } + acc_scalar_t delta = b.mean - a.mean; + acc_scalar_t new_count = a.nf + b.nf; + acc_scalar_t nb_over_n = b.nf / new_count; + return { + a.mean + delta * nb_over_n, + a.m2 + b.m2 + delta * delta * a.nf * nb_over_n, + // setting acc.n as -1 since acc.n might not be able to represent the + // count correctly within its range, setting it to -1 to avoid confusion + -1, + new_count}; + } + inline res_t project(acc_t acc) const __ubsan_ignore_float_divide_by_zero__ { + const auto mean = static_cast(acc.mean); + const auto divisor = acc.nf > correction ? acc.nf - correction : 0; + const auto var = acc.m2 / divisor; + res_t results(take_sqrt ? 
device_sqrt(var) : var, mean); + return results; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + + WelfordOps(acc_scalar_t correction, bool take_sqrt) + : correction(correction), take_sqrt(take_sqrt) {} +}; + +template < + typename scalar_t, + typename acc_t = scalar_t, + typename factor_t = acc_t, + typename out_t = acc_t> +struct MeanOps { + factor_t factor; + + inline acc_t reduce(acc_t a, scalar_t b, int64_t /*idx*/) const { + return combine(a, static_cast(b)); + } + + inline acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline out_t project(acc_t a) const { + return a * factor; + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + + MeanOps(factor_t factor) : factor(factor) {} +}; + +// This accumulator template is used to calculate the minimum absolute value of +// a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the +// accumulated value. These types differ for complex number input support. +template +struct AbsMinOps { + inline acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return MIN(acc, static_cast(std::abs(data))); + } + + inline acc_t combine(acc_t a, acc_t b) const { + return MIN(a, b); + } + + inline out_t project(acc_t a) const { + return a; + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } +}; + +// This accumulator template is used to calculate the maximum absolute value of +// a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the +// accumulated value. These types differ for complex number input support. +template +struct AbsMaxOps { + inline acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return MAX(acc, static_cast(std::abs(data))); + } + + inline acc_t combine(acc_t a, acc_t b) const { + return MAX(a, b); + } + + inline out_t project(acc_t a) const { + return a; + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } +}; + +// This accumulator template is used to calculate the norm of the absolute value +// of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the +// accumulated value. These types differ for complex number input support. +template +struct NormOps { + acc_t norm_; + + inline acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + compat_pow(static_cast(std::abs(data)), norm_); + } + + inline acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline out_t project(acc_t a) const { + return compat_pow(a, static_cast(1.0) / norm_); + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + + NormOps(acc_t norm_) : norm_(norm_) {} +}; + +// This accumulator template is used to calculate the order zero norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the +// accumulated value. These types differ for complex number input support. +template +struct NormZeroOps { + inline acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + + (data == static_cast(0) ? 
static_cast(0) + : static_cast(1)); + } + + inline acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline out_t project(acc_t a) const { + return a; + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } +}; + +// This accumulator template is used to calculate the order one norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the +// accumulated value. These types differ for complex number input support. +template +struct NormOneOps { + inline acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + static_cast(std::abs(data)); + } + + inline acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline out_t project(acc_t a) const { + return a; + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } +}; + +template +struct AbsSwitch {}; + +template +inline acc_t abs_if_complex(scalar_t data, AbsSwitch) { + return static_cast(data); +} + +template +inline acc_t abs_if_complex(std::complex data, AbsSwitch) { + return static_cast(std::abs(data)); +} + +template +inline acc_t abs_if_complex(c10::complex data, AbsSwitch) { + return static_cast(std::abs(data)); +} + +// This accumulator template is used to calculate the order two norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the +// accumulated value. These types differ for complex number input support. +template +struct NormTwoOps { + inline acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + acc_t data_ = abs_if_complex(data, AbsSwitch()); + return acc + data_ * data_; + } + + inline acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline out_t project(acc_t a) const { + return device_sqrt(a); + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } +}; + +template +struct NanSumOps { + inline acc_t reduce(acc_t a, data_t b, int64_t /*idx*/) const { + return a + (at::_isnan(b) ? acc_t{0.} : acc_t{b}); + } + + inline acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline data_t project(acc_t a) const { + return data_t{a}; + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } +}; + +namespace detail { + +template +struct LessOrNan { + bool operator()(scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const { + // If (a == b), then choose the one with lower idx, else min(a, b) + if (at::_isnan(a)) { + if (at::_isnan(b)) { + return idx_a < idx_b; + } + return true; + } + return (a == b) ? idx_a < idx_b : (a < b); + } +}; + +template +struct GreaterOrNan { + bool operator()(scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const { + // If (a == b), then choose the one with lower idx, else max(a, b) + if (at::_isnan(a)) { + if (at::_isnan(b)) { + return idx_a < idx_b; + } + return true; + } + return (a == b) ? idx_a < idx_b : (a > b); + } +}; + +template +struct MinMaxReductionOps { + using scalar_t = typename binary_function_traits::arg1_t; + using index_t = int64_t; + using arg_t = at::xpu::pair; + + static arg_t project(arg_t arg) { + return arg; + } + + static arg_t reduce(arg_t arg, scalar_t val, int64_t idx) { + return comp_t{}(arg.first, val, arg.second, idx) ? arg : arg_t(val, idx); + } + + static arg_t combine(arg_t a, arg_t b) { + return comp_t{}(a.first, b.first, a.second, b.second) ? 
a : b; + } + + static arg_t translate_idx(arg_t a, int64_t base_idx) { + return {a.first, a.second + base_idx}; + } +}; + +template +struct ArgReductionOps : public MinMaxReductionOps { + using typename MinMaxReductionOps::scalar_t; + using typename MinMaxReductionOps::index_t; + using typename MinMaxReductionOps::arg_t; + + static index_t project(arg_t arg) { + return arg.second; + } +}; + +} // namespace detail + +template +struct ArgMaxOps + : public detail::ArgReductionOps> {}; + +template +struct ArgMinOps : public detail::ArgReductionOps> { +}; + +template +struct MinOps : public detail::MinMaxReductionOps> { +}; + +template +struct MaxOps + : public detail::MinMaxReductionOps> {}; + +template +struct MinMaxOps { + using acc_t = at::xpu::pair; + inline acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const { + return combine(acc, {data, data}); + } + + inline acc_t combine(acc_t a, acc_t b) const { + auto min_val = + (at::_isnan(a.first) || a.first < b.first) ? a.first : b.first; + auto max_val = + (at::_isnan(a.second) || a.second > b.second) ? a.second : b.second; + + return {min_val, max_val}; + } + + inline acc_t project(acc_t acc) const { + return acc; + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } +}; + +} // namespace xpu +} // namespace native +} // namespace at + +#undef MAX +#undef MIN diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index 8db72165a..78062e21a 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -40,7 +40,8 @@ static inline void softmax_group_reduce( // uint32_t SIMD = sg.get_local_range()[0]; #pragma unroll for (int i = 1; i < SIMD; i <<= 1) { - val = bin_op(val, static_cast(sg.shuffle_down(val, i))); + val = bin_op( + val, static_cast(sycl::shift_group_left(sg, val, i))); } if (sub_group_num == 1) { val = sycl::group_broadcast(sg, val, 0); @@ -68,7 +69,8 @@ static inline void softmax_group_reduce( } #pragma unroll for (int i = 1; i < SIMD; i <<= 1) { - val = bin_op(val, static_cast(sg.shuffle_down(val, i))); + val = bin_op( + val, static_cast(sycl::shift_group_left(sg, val, i))); if (i >= ((sub_group_num + 1) >> 1)) break; } diff --git a/src/ATen/native/xpu/sycl/UnarySignKernels.cpp b/src/ATen/native/xpu/sycl/UnarySignKernels.cpp index fee36b973..b35aba437 100644 --- a/src/ATen/native/xpu/sycl/UnarySignKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnarySignKernels.cpp @@ -54,6 +54,35 @@ void sign_kernel(TensorIteratorBase& iter) { } } +template +struct SignbitIntFunctor { + bool operator()(scalar_t a) const { + return is_negative(a); + } +}; + +template +struct SignbitFunctor { + bool operator()(scalar_t a) const { + using opmath_t = at::opmath_type; + return std::signbit(opmath_t{a}); + } +}; + +void signbit_kernel(TensorIteratorBase& iter) { + // NOTE: signbit does not always support integral arguments. 
+ if (at::isIntegralType(iter.input_dtype(), /*includeBool=*/false)) { + AT_DISPATCH_INTEGRAL_TYPES(iter.input_dtype(), "signbit_xpu", [&]() { + gpu_kernel(iter, SignbitIntFunctor()); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, ScalarType::Half, iter.input_dtype(), "signbit_xpu", [&]() { + gpu_kernel(iter, SignbitFunctor()); + }); + } +} + template struct LogicalNotFunctor { scalar_t operator()(scalar_t a) const { diff --git a/src/ATen/native/xpu/sycl/UnarySignKernels.h b/src/ATen/native/xpu/sycl/UnarySignKernels.h index 1fefcf035..98b6eeb5f 100644 --- a/src/ATen/native/xpu/sycl/UnarySignKernels.h +++ b/src/ATen/native/xpu/sycl/UnarySignKernels.h @@ -12,4 +12,6 @@ void sgn_kernel(TensorIteratorBase& iter); void sign_kernel(TensorIteratorBase& iter); +void signbit_kernel(TensorIteratorBase& iter); + } // namespace at::native::xpu diff --git a/src/comm/XPUPair.h b/src/comm/XPUPair.h new file mode 100644 index 000000000..0c36f8625 --- /dev/null +++ b/src/comm/XPUPair.h @@ -0,0 +1,78 @@ +#pragma once + +#include + +namespace at::xpu { + +template +struct pair { + typedef T1 first_type; + typedef T2 second_type; + + first_type first; + second_type second; + + // default constructor + pair(void) : first(), second() {} + + inline pair(const T1& x, const T2& y) : first(x), second(y) {} + + template + inline pair(const pair& p) : first(p.first), second(p.second) {} + + template + pair(const std::pair& p) : first(p.first), second(p.second) {} +}; + +template +bool operator==(const pair& x, const pair& y) { + return x.first == y.first && x.second == y.second; +} + +template +inline bool operator<(const pair& x, const pair& y) { + return x.first < y.first || (!(y.first < x.first) && x.second < y.second); +} + +template +inline bool operator!=(const pair& x, const pair& y) { + return !(x == y); +} + +template +inline bool operator>(const pair& x, const pair& y) { + return y < x; +} + +template +bool operator<=(const pair& x, const pair& y) { + return !(y < x); +} + +template +bool operator>=(const pair& x, const pair& y) { + return !(x < y); +} + +template +inline pair make_pair(T1 x, T2 y) { + return pair(x, y); +} + +template +inline auto& get(pair& p) { + if constexpr (N == 0) + return p.first; + else + return p.second; +} + +template +inline const auto& get(const pair& p) { + if constexpr (N == 0) + return p.first; + else + return p.second; +} + +} // namespace at::xpu diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index ca698993e..f94a9cd5d 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -1264,7 +1264,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_RReLU_with_up_down_cuda", # AssertionError: Scalars are not close! "test_RReLU_with_up_down_scalar_cuda", - # lstm: AssertionError: Scalars are not equal! + # rnn fallback to cpu "test_cudnn_weight_format", # NotImplementedError: Could not run 'aten::_indices' with arguments from the 'SparseXPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). "test_EmbeddingBag_sparse_cuda", @@ -1289,16 +1289,12 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_rnn_retain_variables_xpu_float64", "test_transformerencoderlayer_xpu_float64", "test_variable_sequence_xpu_float64", - # CPU fallback fails - # AssertionError: Tensor-likes are not close! 
+ # native_group_norm : RuntimeError: Expected X.is_contiguous(memory_format) to be true, but got false. (Could this error message be improved? If so, please report an enhancement request to PyTorch.) "test_GroupNorm_memory_format_xpu", # AssertionError: Scalars are not close! "test_InstanceNorm1d_general_xpu", "test_InstanceNorm2d_general_xpu", "test_InstanceNorm3d_general_xpu", - # AssertionError: False is not true - "test_device_mask_xpu", - "test_overwrite_module_params_on_conversion_cpu_device_xpu", # AssertionError: RuntimeError not raised "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_False_num_channels_3_mode_bicubic_uint8_xpu_uint8", "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_False_num_channels_3_mode_bilinear_uint8_xpu_uint8", @@ -1308,8 +1304,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_3_mode_bilinear_uint8_xpu_uint8", "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_5_mode_bicubic_uint8_xpu_uint8", "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_5_mode_bilinear_uint8_xpu_uint8", - "test_grid_sample_error_checking", - # Failed: Unexpected success + #upsamplingNearest2d: Failed: Unexpected success "test_upsamplingNearest2d_launch_fail_xpu", # CPU fallback could not cover # NotImplementedError: Could not run 'aten::_thnn_fused_gru_cell' with arguments from the 'CPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build pro... @@ -1329,9 +1324,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_MultiLabelMarginLoss_no_batch_dim_mean_cuda_half", "test_MultiLabelMarginLoss_no_batch_dim_none_cuda_half", "test_MultiLabelMarginLoss_no_batch_dim_sum_cuda_half", - # align CUDA to skip, XPU implementation is not yet supporting uint8 - "test_upsamplingBiMode2d_consistency", - "test_upsamplingBiLinear2d_consistency_interp_size_bug", ) res += launch_test("test_nn_xpu.py", skip_list) @@ -2217,6 +2209,12 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_scaled_mm_vs_emulated_float16_xpu", "test_scaled_mm_vs_emulated_float32_xpu", "test_scaled_mm_vs_emulated_row_wise_bfloat16_xpu", + + # https://github.com/intel/torch-xpu-ops/issues/676 + # Mismatched elements: 9 / 1003002 (0.0%) + # Greatest absolute difference: 711.126220703125 at index (472, 999) (up to 0.1 allowed) + # Greatest relative difference: 2.7107455730438232 at index (472, 997) (up to 0.1 allowed) + "test_cublas_addmm_size_1000_xpu_float32", ) res += launch_test("test_matmul_cuda_xpu.py", skip_list=skip_list) diff --git a/test/xpu/test_nn_xpu.py b/test/xpu/test_nn_xpu.py index 02cd90e7c..fb05f2c78 100644 --- a/test/xpu/test_nn_xpu.py +++ b/test/xpu/test_nn_xpu.py @@ -12,6 +12,7 @@ import torch import torch.nn.functional as F from torch import nn +import torch.nn.utils.rnn as rnn_utils from torch.testing._internal.common_device_type import ( dtypes, instantiate_device_type_tests, @@ -1686,6 +1687,170 @@ def _batch_norm_stats(data, memory_format, mean_axes): _batch_norm_stats(torch.randn(1, 96, 112, 112, 112, dtype=torch.float, device='xpu'), torch.channels_last_3d, (0, 2, 3, 4)) TestNN.test_sync_batchnorm_accuracy_cuda=_test_sync_batchnorm_accuracy_xpu +@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last]) +@parametrize_test("mode", ["bilinear", "bicubic"]) +@parametrize_test("antialias", [True, False]) +@parametrize_test("align_corners", 
[True, False]) +@parametrize_test("num_channels", [3, 5]) +@parametrize_test("output_size", [32, 600]) +@parametrize_test("check_as_unsqueezed_3d_tensor", [True, False]) +@parametrize_test("non_contig", [False, "sliced", "restrided"]) +@parametrize_test("batch_size", [1, 5]) +def _test_upsamplingBiMode2d_consistency( + self, + device, + memory_format, + mode, + antialias, + align_corners, + num_channels, + output_size, + check_as_unsqueezed_3d_tensor, + non_contig, + batch_size, +): + # Check output value consistency between resized_input_uint8 and resized input_float + if torch.device(device).type == "xpu": + raise SkipTest("XPU implementation is not yet supporting uint8") + + torch.manual_seed(0) + + # - input range is set to [30, 220] for bicubic mode, because the bicubic kernel may create + # [intermediate] values outside of the [0, 255] range, which need + # to be clipped in uint8 path, but not in float path. This isn't + # an issue with bilinear kernel. + input_range = (30, 220) if mode == "bicubic" else (0, 256) + input_ui8 = torch.randint(*input_range, size=(batch_size, num_channels, 400, 400), dtype=torch.uint8, device=device) + input_ui8 = input_ui8.contiguous(memory_format=memory_format) + + if non_contig == "sliced": + input_ui8 = input_ui8[:, :, 10:-10, 10:-10] + elif non_contig == "restrided": + input_ui8 = input_ui8[:, :, ::2, ::2] + + if batch_size == 1 and check_as_unsqueezed_3d_tensor: + input_ui8 = input_ui8[0, ...] + input_ui8 = input_ui8[None, ...] + + input_f32 = input_ui8.float() + + output_f32 = F.interpolate( + input_f32, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias + ).round().clip(0, 255) + output_ui8 = F.interpolate( + input_ui8, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias + ) + + if non_contig is False: + self.assertTrue(input_ui8.is_contiguous(memory_format=memory_format)) + + # FIXME if-clause shows the current behaviour which is definitely unexpected. 
+ # Ideally we want to fix it such that both the ui8 and f32 outputs are also channels_last + # See for more details: https://github.com/pytorch/pytorch/pull/100373 + if batch_size == 1 and check_as_unsqueezed_3d_tensor and memory_format == torch.channels_last: + self.assertTrue(output_ui8.is_contiguous()) + self.assertTrue(output_f32.is_contiguous()) + else: + self.assertTrue(output_ui8.is_contiguous(memory_format=memory_format)) + self.assertTrue(output_f32.is_contiguous(memory_format=memory_format)) + + if mode == "bilinear": + torch.testing.assert_close(output_f32, output_ui8.float(), rtol=0, atol=1) + else: + diff = (output_f32 - output_ui8.float()).abs() + self.assertLess(diff.max(), 15) + + threshold = 2 + percent = 3 + self.assertLess((diff > threshold).float().mean(), percent / 100) + + threshold = 5 + percent = 1 + self.assertLess((diff > threshold).float().mean(), percent / 100) + + self.assertLess(diff.mean(), 0.4) +TestNNDeviceType.test_upsamplingBiMode2d_consistency=_test_upsamplingBiMode2d_consistency + +@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last]) +@parametrize_test("align_corners", [True, False]) +@parametrize_test("input_size, output_size", [(399, 437), (403, 377)]) +def _test_upsamplingBiLinear2d_consistency_interp_size_bug(self, device, memory_format, align_corners, input_size, output_size): + # Non-regression test for https://github.com/pytorch/pytorch/pull/101403 + + if torch.device(device).type == "xpu": + raise SkipTest("XPU implementation is not yet supporting uint8") + + mode = "bilinear" + input_ui8 = torch.randint(0, 256, size=(1, 3, input_size, input_size), dtype=torch.uint8, device=device) + input_ui8 = input_ui8.contiguous(memory_format=memory_format) + input_f32 = input_ui8.float() + + output_f32 = F.interpolate( + input_f32, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=False + ).round().to(torch.uint8) + output_ui8 = F.interpolate( + input_ui8, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=False + ) + torch.testing.assert_close(output_f32, output_ui8, atol=1, rtol=0) +TestNNDeviceType.test_upsamplingBiLinear2d_consistency_interp_size_bug=_test_upsamplingBiLinear2d_consistency_interp_size_bug + +def _test_device_mask(self, device): + def is_xpu(packed): + return packed.data.device.type=="xpu" + for enforce_sorted in [True, False]: + padded, lengths = self._padded_sequence('cpu', torch.float) + packed = rnn_utils.pack_padded_sequence( + padded, lengths, enforce_sorted=enforce_sorted) + self.assertFalse(is_xpu(packed)) + packed = packed.to(device) + self.assertTrue(is_xpu(packed)) + unpacked, _ = rnn_utils.pad_packed_sequence(packed) + self.assertTrue(is_xpu(unpacked)) + self.assertEqual(unpacked.dtype, torch.float) +TestNNDeviceType.test_device_mask=_test_device_mask + +def _test_overwrite_module_params_on_conversion_cpu_device(self, device): + # Test that under the current default settings + # (`torch.__future__.get_overwrite_module_params_on_conversion() == False`), + # a view to a module's parameters is not pointing to the same storage as + # its base variable after converting the module to a different device. + m = nn.Linear(20, 10) + mw = m.weight[:] + m.to(device) + with torch.no_grad(): + # Without using `torch.no_grad()`, this will leak CUDA memory. 
+ # (Issue is filed at https://github.com/pytorch/pytorch/issues/21875) + mw[0][0] = 5 + self.assertTrue(mw[0][0].device.type == "cpu") + self.assertTrue(mw._base[0][0].device.type == "xpu") + + try: + torch.__future__.set_overwrite_module_params_on_conversion(True) + + # Test that if `torch.__future__.get_overwrite_module_params_on_conversion() == True`, + # a view to a module's parameters is still pointing to the same storage as + # its base variable after converting the module to a different device. + m = nn.Linear(20, 10) + mw = m.weight[:] + m.to(device) + with torch.no_grad(): + mw[0][0] = 5 + self.assertTrue(mw[0][0] == mw._base[0][0]) + + # Test that if `torch.__future__.get_overwrite_module_params_on_conversion() == True`, + # `cpu_module.to("cuda")` doesn't preserve previous references to + # `cpu_module`'s parameters or gradients. + m = nn.Linear(20, 10) + m.weight.grad = torch.randn(10, 20) + weight_ref = m.weight + weight_grad_ref = m.weight.grad + m.to(device) + self.assertNotEqual(weight_ref.device, m.weight.device) + self.assertNotEqual(weight_grad_ref.device, m.weight.grad.device) + finally: + torch.__future__.set_overwrite_module_params_on_conversion(False) +TestNNDeviceType.test_overwrite_module_params_on_conversion_cpu_device=_test_overwrite_module_params_on_conversion_cpu_device + def _test_ctc_loss_xpu(self, device): batch_size = 16 input_length = 30 diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index d92474224..483af50f7 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -191,6 +191,8 @@ "sigmoid", "logsigmoid", "sgn", + "sign", + "signbit", "round", "nn.functional.embedding_bag", "bucketize", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 19aa0fe6c..db3df0667 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -473,6 +473,11 @@ supported: - sigmoid - sigmoid.out - sigmoid_ + - sign + - sign.out + - sign_ + - signbit + - signbit.out - sigmoid_backward.grad_input - sigmoid_backward - hardsigmoid.out