diff --git a/src/ATen/native/xpu/ForeachOpScalarList.cpp b/src/ATen/native/xpu/ForeachOpScalarList.cpp
index 05a5c9706..87c1f0ce3 100644
--- a/src/ATen/native/xpu/ForeachOpScalarList.cpp
+++ b/src/ATen/native/xpu/ForeachOpScalarList.cpp
@@ -6,12 +6,15 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
+#include
 #include
 #include
+#include
 #include
 #include
@@ -118,5 +121,38 @@ FOREACH_BINARY_OP_SCALARLIST(pow, true);
 FOREACH_POINTWISE_OP_SCALARLIST(addcmul)
 FOREACH_POINTWISE_OP_SCALARLIST(addcdiv)
 
+std::vector<Tensor> foreach_tensor_lerp_scalarlist_xpu(
+    TensorList tensors1,
+    TensorList tensors2,
+    at::ArrayRef<Scalar> scalars) {
+  check_foreach_api_restrictions(tensors1, tensors2, scalars);
+  if (!can_use_fast_route({tensors1, tensors2}, scalars, true)) {
+    return foreach_tensor_lerp_scalarlist_kernel_slow(
+        tensors1, tensors2, scalars);
+  }
+
+  std::vector<Tensor> vec_res;
+  vec_res.reserve(tensors1.size());
+  for (const auto& t : tensors1) {
+    vec_res.emplace_back(at::empty_like(t));
+  }
+
+  xpu::foreach_lerp_scalarlist_kernel(tensors1, tensors2, scalars, vec_res);
+  return vec_res;
+}
+
+void foreach_tensor_lerp_scalarlist_xpu_(
+    TensorList tensors1,
+    TensorList tensors2,
+    at::ArrayRef<Scalar> scalars) {
+  check_foreach_api_restrictions(tensors1, tensors2, scalars);
+  if (!can_use_fast_route({tensors1, tensors2}, scalars, true)) {
+    return foreach_tensor_lerp_scalarlist_kernel_slow_(
+        tensors1, tensors2, scalars);
+  }
+
+  xpu::foreach_lerp_scalarlist_kernel_(tensors1, tensors2, scalars);
+}
+
 }; // namespace native
 } // namespace at
diff --git a/src/ATen/native/xpu/ForeachReduceOp.cpp b/src/ATen/native/xpu/ForeachReduceOp.cpp
index a60eb600c..a9ef1ff44 100644
--- a/src/ATen/native/xpu/ForeachReduceOp.cpp
+++ b/src/ATen/native/xpu/ForeachReduceOp.cpp
@@ -1,6 +1,7 @@
 #include
 
 #include
+#include
 #include
 
 namespace at {
@@ -70,5 +71,23 @@ std::vector<Tensor> foreach_tensor_norm_xpu(
   return native::xpu::foreach_norm_kernel(tensors, ord, p, dtype);
 }
 
+
+std::vector<Tensor> foreach_tensor_max_xpu(TensorList tensors) {
+  check_foreach_api_restrictions(tensors);
+  if (!can_use_fast_route(tensors)) {
+    return foreach_tensor_max_slow(tensors);
+  }
+
+  // for parity with max in ReduceAllOps.cpp, as max(empty) is ???
+  TORCH_CHECK(
+      std::all_of(
+          tensors.begin(),
+          tensors.end(),
+          [](const auto& t) { return t.numel() > 0; }),
+      "max(): Expected reduction dim to be specified for input.numel() == 0. Specify the reduction dim with the 'dim' argument.");
+
+  return at::native::xpu::foreach_max_kernel(tensors);
+}
+
 } // namespace native
 } // namespace at
diff --git a/src/ATen/native/xpu/ForeachUnaryOp.cpp b/src/ATen/native/xpu/ForeachUnaryOp.cpp
index 83577707b..2672d5827 100644
--- a/src/ATen/native/xpu/ForeachUnaryOp.cpp
+++ b/src/ATen/native/xpu/ForeachUnaryOp.cpp
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -93,6 +94,7 @@ FOREACH_UNARY_OP(round);
 FOREACH_UNARY_OP(frac);
 FOREACH_UNARY_OP(reciprocal);
 FOREACH_UNARY_OP(sign);
+FOREACH_UNARY_OP(rsqrt);
 
 std::vector<Tensor> foreach_tensor_neg_xpu(TensorList tensors) {
   at::native::check_foreach_api_restrictions(tensors);
diff --git a/src/ATen/native/xpu/sycl/ForeachFunctors.h b/src/ATen/native/xpu/sycl/ForeachFunctors.h
index 9f0f33063..70a362fcd 100644
--- a/src/ATen/native/xpu/sycl/ForeachFunctors.h
+++ b/src/ATen/native/xpu/sycl/ForeachFunctors.h
@@ -834,6 +834,77 @@ struct TernaryOpScalarFunctor {
   }
 };
 
+template <typename T, int depth, int r_args_depth, int res_arg_index>
+struct TernaryOpScalarListFunctor {
+  using opmath_t = at::opmath_type<T>;
+
+  template <typename TLA, typename TLW, typename Op>
+  void operator()(
+      int chunk_size,
+      TLA tlAddress,
+      TLW tlWGMeta,
+      sycl::nd_item<1> item_id,
+      Op op) const {
+    static_assert(depth == 2 || depth == 3, "");
+    static_assert(depth >= r_args_depth, "");
+    static_assert(res_arg_index == depth - 1 || res_arg_index == 0, "");
+    auto item_idx = item_id.get_local_id(0);
+    auto item_range = item_id.get_local_range(0);
+    auto group_idx = item_id.get_group(0);
+    int tensor_loc = tlWGMeta[group_idx].wg_to_tensor;
+    int chunk_idx = tlWGMeta[group_idx].wg_to_chunk;
+    int64_t n = tlAddress[tensor_loc].numel_to_tensor;
+
+    T* args[depth];
+    const bool all_aligned =
+        init_args<depth>(args, tlAddress, chunk_idx, chunk_size, tensor_loc);
+    n -= chunk_idx * chunk_size;
+    T r_args[r_args_depth][kILP];
+    const opmath_t scalar = tlAddress[tensor_loc].scalar_vals;
+
+    // to make things simple, we put aligned case in a different code path
+    if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
+      for (int64_t i_start = item_idx;
+           i_start * kILP < n && i_start * kILP < chunk_size;
+           i_start += item_range) {
+        // load
+        load_store(r_args[0], args[0], 0, i_start);
+        load_store(r_args[1], args[1], 0, i_start);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] =
+              op(static_cast<opmath_t>(r_args[0][ii]),
+                 static_cast<opmath_t>(r_args[1][ii]),
+                 scalar);
+        }
+        // store
+        load_store(args[res_arg_index], r_args[0], i_start, 0);
+      }
+    } else {
+      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
+           i_start += item_range * kILP) {
+        load_args<r_args_depth>(
+            r_args, args, i_start, chunk_size, n, item_idx, item_range);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] =
+              op(static_cast<opmath_t>(r_args[0][ii]),
+                 static_cast<opmath_t>(r_args[1][ii]),
+                 scalar);
+        }
+        store_args(
+            args[res_arg_index],
+            r_args[0],
+            i_start,
+            chunk_size,
+            n,
+            item_idx,
+            item_range);
+      }
+    }
+  }
+};
+
 template <typename T>
 struct power_functor {
   T operator()(const T& a, const T& b) const {
diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp
index 9e982e4a0..19fda8472 100644
--- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp
@@ -5,14 +5,17 @@
 #include
 
 #include
+#include
 #include
 #include
 
+namespace at::native::xpu {
+
 enum class NormType { L1, L2, LInf };
 
 #define SIMD16 16
 #define SIMD32 32
-namespace at::native::xpu {
+
 template <
     typename T,
     NormType norm_type,
@@ -476,4 +479,274 @@ std::vector<Tensor> foreach_norm_kernel(
   return result;
 }
 
+template <typename T, int SIMD>
+struct LpMaxFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
+  template <typename TLA, typename TLW>
+  [[intel::reqd_sub_group_size(SIMD)]] void operator()(
+      int64_t chunk_size,
+      TLA tlAddressMeta,
+      TLW tlWGMeta,
+      sycl::nd_item<1> item,
+      T* output_per_tensor_ptr,
+      const int max_chunks_per_tensor) const {
+    auto workgroup_id = item.get_group(0);
+    auto item_id = item.get_local_id(0);
+    auto local_range = item.get_local_range(0);
+
+    const auto tensor_loc = tlWGMeta[workgroup_id].wg_to_tensor;
+    const auto chunk_idx = tlWGMeta[workgroup_id].wg_to_chunk;
+    auto n = tlAddressMeta[tensor_loc].numel_to_tensor;
+
+    T* x = (T*)tlAddressMeta[tensor_loc].addresses[0];
+    x += chunk_idx * chunk_size;
+    n -= chunk_idx * chunk_size;
+
+    T vals[kILP];
+    T r_x[kILP];
+    for (int64_t i = 0; i < kILP; i++) {
+      vals[i] = T(std::numeric_limits<T>::lowest());
+      r_x[i] = T(std::numeric_limits<T>::lowest());
+    }
+
+    if (n % kILP == 0 && (chunk_size & kILP) == 0 && is_aligned(x)) {
+      for (int64_t i_start = item_id;
+           i_start * kILP < n && i_start * kILP < chunk_size;
+           i_start += local_range) {
+        // load
+        load_store(r_x, x, 0, i_start);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          vals[ii] = max_impl(vals[ii], r_x[ii]);
+        }
+      }
+    } else {
+      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
+           i_start += local_range * kILP) {
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          int i = i_start + item_id + ii * local_range;
+          if (i < n && i < chunk_size) {
+            vals[ii] = max_impl(vals[ii], x[i]);
+          }
+        }
+      }
+    }
+
+    auto val = T(std::numeric_limits<T>::lowest());
+    for (int i = 0; i < kILP; i++) {
+      val = max_impl(val, vals[i]);
+    }
+    auto final_val =
+        GroupReduceMaxWithoutBroadcast<T, SIMD>(item, val, shared_);
+
+    if (item_id == 0) {
+      output_per_tensor_ptr[tensor_loc * max_chunks_per_tensor + chunk_idx] =
+          final_val;
+    }
+  }
+
+  void sycl_ker_config_convention(sycl::handler& cgh) {
+    shared_ = sycl_local_acc_t<T>(SIMD, cgh);
+  }
+
+ private:
+  sycl_local_acc_t<T> shared_;
+};
+
+template <typename T, int SIMD>
+struct LpmaxChunkReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
+  [[intel::reqd_sub_group_size(SIMD)]] void operator()(
+      sycl::nd_item<1> item_id) const {
+    auto local_range = item_id.get_local_range(0);
+    auto lid = item_id.get_local_linear_id();
+    auto group_id = item_id.get_group(0);
+
+    const T* output_this_tensor =
+        output_per_tensor_ + group_id * max_chunks_per_tensor_;
+    int chunks_this_tensor = chunks_per_tensor_[group_id];
+    T val = std::numeric_limits<T>::lowest();
+    for (int i = lid; i < chunks_this_tensor; i += local_range) {
+      val = max_impl(val, output_this_tensor[i]);
+    }
+    T final_value =
+        GroupReduceMaxWithoutBroadcast<T, SIMD>(item_id, val, shared_);
+    if (lid == 0) {
+      *(ret_per_tensor_[group_id]) = final_value;
+    }
+  }
+
+  void sycl_ker_config_convention(sycl::handler& cgh) {
+    shared_ = sycl_local_acc_t<T>(SIMD, cgh);
+  }
+
+  LpmaxChunkReduceKernelFunctor(
+      const T* output_per_tensor,
+      T** ret_per_tensor,
+      int* chunks_per_tensor,
+      int max_chunks_per_tensor)
+      : output_per_tensor_(output_per_tensor),
+        ret_per_tensor_(ret_per_tensor),
+        chunks_per_tensor_(chunks_per_tensor),
+        max_chunks_per_tensor_(max_chunks_per_tensor) {}
+
+ private:
+  const T* output_per_tensor_;
+  T** ret_per_tensor_;
+  int* chunks_per_tensor_;
+  int max_chunks_per_tensor_;
+  sycl_local_acc_t<T> shared_;
+};
+
+template <typename T, int SIMD>
+void launch_lpmax_chunk_reduce_kernel(
+    const T* output_per_tensor,
+    T** ret_per_tensor,
+    int* chunks_per_tensor,
+    int max_chunks_per_tensor,
+    int n_tensor) {
+  int wg_size = multi_tensor_apply_kernel_get_wg_size(SIMD);
+  LpmaxChunkReduceKernelFunctor<T, SIMD> kfn(
+      output_per_tensor,
+      ret_per_tensor,
+      chunks_per_tensor,
+      max_chunks_per_tensor);
+
+  sycl_kernel_submit(
+      sycl::range<1>(n_tensor * wg_size),
+      sycl::range<1>(wg_size),
+      getCurrentSYCLQueue(),
+      kfn);
+}
+
+std::vector<Tensor> foreach_max_kernel(TensorList tensors) {
+  const size_t ntensors = tensors.size();
+  const auto options = tensors[0].options();
+
+  auto& q = getCurrentSYCLQueue();
+  // Store output address for each tensor
+  auto addressStorage =
+      at::empty({(int)(sizeof(void*) * ntensors)}, options.dtype(at::kByte));
+  auto metaAddress = static_cast<void*>(addressStorage.mutable_data_ptr());
+  void** tensor_list_addresses = nullptr;
+  auto tensor_list_addresses_dptr =
+      at::xpu::HostAlloc(sizeof(void*) * ntensors);
+  tensor_list_addresses = (void**)tensor_list_addresses_dptr.get();
+
+  // Store thunks count for each tensor
+  auto countsStorage =
+      at::empty({(int)(sizeof(int) * ntensors)}, options.dtype(at::kByte));
+  auto metaCounts = static_cast<void*>(countsStorage.mutable_data_ptr());
+  int* thunk_counts = nullptr;
+  auto thunk_counts_dptr = at::xpu::HostAlloc(sizeof(int) * ntensors);
+  thunk_counts = (int*)thunk_counts_dptr.get();
+
+  int max_chunks_per_tensor = -1;
+  int64_t simd = syclMaxSubGroupSize();
+  int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(simd);
+  for (const auto t : c10::irange(ntensors)) {
+    int max_chunks_this_tensor =
+        (tensors[t].numel() + kChunkSize - 1) / kChunkSize;
+    thunk_counts[t] = max_chunks_this_tensor;
+    if (max_chunks_this_tensor > max_chunks_per_tensor) {
+      max_chunks_per_tensor = max_chunks_this_tensor;
+    }
+  }
+  auto output_per_tensor = at::zeros(
+      {static_cast<int64_t>(ntensors) * max_chunks_per_tensor}, options);
+
+  std::vector<Tensor> vec_res;
+  vec_res.reserve(ntensors);
+  for (const auto i : c10::irange(ntensors)) {
+    vec_res.push_back(at::detail::empty_xpu(
+        {},
+        optTypeMetaToScalarType(options.dtype_opt()),
+        options.layout_opt(),
+        options.device_opt(),
+        options.pinned_memory_opt(),
+        options.memory_format_opt()));
+  }
+
+  auto tensor_lists = std::vector<std::vector<Tensor>>{tensors.vec()};
+  AT_DISPATCH_ALL_TYPES_AND3(
+      kHalf,
+      kBFloat16,
+      kBool,
+      tensor_lists[0][0].scalar_type(),
+      "foreach_tensor_max_xpu_scalar_type",
+      [&]() {
+        if (simd == SIMD32) {
+          multi_tensor_apply<1>(
+              tensor_lists,
+              LpMaxFunctor<scalar_t, SIMD32>(),
+              output_per_tensor.mutable_data_ptr<scalar_t>(),
+              max_chunks_per_tensor);
+        } else if (simd == SIMD16) {
+          multi_tensor_apply<1>(
+              tensor_lists,
+              LpMaxFunctor<scalar_t, SIMD16>(),
+              output_per_tensor.mutable_data_ptr<scalar_t>(),
+              max_chunks_per_tensor);
+        } else {
+          TORCH_CHECK(
+              false,
+              "foreach_max_xpu_kernel does not support SIMD width: ",
+              simd);
+        }
+
+        for (int i = 0; i < ntensors; i++) {
+          tensor_list_addresses[i] = vec_res[i].mutable_data_ptr();
+        }
+        q.memcpy(
+            (void*)metaAddress,
+            (void*)tensor_list_addresses,
+            sizeof(void*) * ntensors);
+        at::xpu::CachingHostAllocator_recordEvent(
+            (void*)tensor_list_addresses,
+            tensor_list_addresses_dptr.get_context(),
+            at::xpu::getCurrentXPUStream());
+        q.memcpy(
+            (void*)metaCounts, (void*)thunk_counts, sizeof(int) * ntensors);
+        at::xpu::CachingHostAllocator_recordEvent(
+            (void*)thunk_counts,
+            thunk_counts_dptr.get_context(),
+            at::xpu::getCurrentXPUStream());
+        if (simd == SIMD32) {
+          launch_lpmax_chunk_reduce_kernel<scalar_t, SIMD32>(
+              output_per_tensor.mutable_data_ptr<scalar_t>(),
+              (scalar_t**)(metaAddress),
+              (int*)(metaCounts),
+              max_chunks_per_tensor,
+              ntensors);
+        } else {
+          launch_lpmax_chunk_reduce_kernel<scalar_t, SIMD16>(
+              output_per_tensor.mutable_data_ptr<scalar_t>(),
+              (scalar_t**)(metaAddress),
+              (int*)(metaCounts),
+              max_chunks_per_tensor,
+              ntensors);
+        }
+      });
+
+  // correctly assign values to only non-empty slots, as the empty slots should
+  // get skipped
+  std::vector<Tensor> result;
+  result.reserve(ntensors);
+  int i = 0;
+  for (const auto& t : tensors) {
+    if (t.numel() != 0) {
+      result.emplace_back(vec_res[i]);
+      i++;
+    } else {
+      result.emplace_back(at::detail::empty_xpu(
+          {},
+          optTypeMetaToScalarType(options.dtype_opt()),
+          options.layout_opt(),
+          options.device_opt(),
+          options.pinned_memory_opt(),
+          options.memory_format_opt()));
+    }
+  }
+  return result;
+}
+
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.h b/src/ATen/native/xpu/sycl/ForeachReduceKernels.h
index cefb3d840..e7413b50c 100644
--- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.h
+++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.h
@@ -9,4 +9,6 @@ TORCH_XPU_API std::vector<Tensor> foreach_norm_kernel(
     double p,
     c10::optional<ScalarType> dtype);
 
+TORCH_XPU_API std::vector<Tensor> foreach_max_kernel(TensorList tensors);
+
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp b/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp
index ed8e01653..53e937f0f 100644
--- a/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp
@@ -7,6 +7,7 @@
 #include
 #include
+#include
 
 namespace at::native::xpu {
@@ -123,4 +124,58 @@ void foreach_lerp_scalar_kernel_(
             weight.to<opmath_t>());
       });
 }
+
+void foreach_lerp_scalarlist_kernel(
+    TensorList tensors1,
+    TensorList tensors2,
+    at::ArrayRef<Scalar> scalars,
+    TensorList result) {
+  std::vector<std::vector<at::Tensor>> tensor_lists{
+      tensors1.vec(), tensors2.vec(), result.vec()};
+
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      tensors1[0].scalar_type(),
+      "foreach_tensor_lerp_scalarlist_xpu",
+      [&]() {
+        using opmath_t = typename at::opmath_type<scalar_t>;
+        multi_tensor_apply<3, opmath_t>(
+            tensor_lists,
+            scalars,
+            TernaryOpScalarListFunctor<
+                scalar_t,
+                /* depth */ 3,
+                /* r_args_depth */ 2,
+                /* res_arg_index */ 2>(),
+            LerpFunctor<opmath_t>());
+      });
+};
+
+void foreach_lerp_scalarlist_kernel_(
+    TensorList tensors1,
+    TensorList tensors2,
+    at::ArrayRef<Scalar> scalars) {
+  std::vector<std::vector<at::Tensor>> tensor_lists{
+      tensors1.vec(), tensors2.vec()};
+
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      tensors1[0].scalar_type(),
+      "foreach_tensor_lerp_scalarlist_xpu_",
+      [&]() {
+        using opmath_t = typename at::opmath_type<scalar_t>;
+        multi_tensor_apply<2, opmath_t>(
+            tensor_lists,
+            scalars,
+            TernaryOpScalarListFunctor<
+                scalar_t,
+                /* depth */ 2,
+                /* r_args_depth */ 2,
+                /* res_arg_index */ 0>(),
+            LerpFunctor<opmath_t>());
+      });
+};
+
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarListKernels.h b/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarListKernels.h
new file mode 100644
index 000000000..cd3afd665
--- /dev/null
+++ b/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarListKernels.h
@@ -0,0 +1,17 @@
+#pragma once
+#include
+
+namespace at::native::xpu {
+
+TORCH_XPU_API void foreach_lerp_scalarlist_kernel(
+    TensorList tensors1,
+    TensorList tensors2,
+    at::ArrayRef<Scalar> scalars,
+    TensorList result);
+
+TORCH_XPU_API void foreach_lerp_scalarlist_kernel_(
+    TensorList tensors1,
+    TensorList tensors2,
+    at::ArrayRef<Scalar> scalars);
+
+} // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp
index b24cd2c81..073ab8908 100644
--- a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include
 #include
 
 namespace at::native::xpu {
@@ -391,6 +392,29 @@ struct Sign {
   }
 };
 
+template <typename T>
+struct Rsqrt {
+  T operator()(T t) const {
+    return c10::xpu::compat::rsqrt(t);
+  }
+};
+
+template <>
+struct Rsqrt<c10::complex<float>> {
+  c10::complex<float> operator()(c10::complex<float> t) const {
+    const auto one = c10::complex<float>(1.0, 0);
+    return one / std::sqrt(t);
+  }
+};
+
+template <>
+struct Rsqrt<c10::complex<double>> {
+  c10::complex<double> operator()(c10::complex<double> t) const {
+    const auto one = c10::complex<double>(1.0, 0);
+    return one / std::sqrt(t);
+  }
+};
+
 FOREACH_UNARY_INPLACE_KERNEL(sigmoid) {
   return floating_complex_half_bfloat16_<Sigmoid>(tensors);
 }
@@ -426,6 +450,13 @@ FOREACH_UNARY_KERNEL(sign) {
   return floating_half_bfloat16<Sign>(tensors);
 }
 
+FOREACH_UNARY_INPLACE_KERNEL(rsqrt) {
+  return floating_complex_half_bfloat16_<Rsqrt>(tensors);
+}
+FOREACH_UNARY_KERNEL(rsqrt) {
+  return floating_complex_half_bfloat16<Rsqrt>(tensors);
+}
+
 FOREACH_UNARY_INPLACE_KERNEL(neg) {
   return all_types_half_complex_bfloat16_<Neg>(tensors);
 }
diff --git a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h
index 4af3ffede..8ac04f41e 100644
--- a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h
+++ b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h
@@ -94,6 +94,9 @@ TORCH_XPU_API FOREACH_UNARY_KERNEL(reciprocal);
 TORCH_XPU_API FOREACH_UNARY_INPLACE_KERNEL(sign);
 TORCH_XPU_API FOREACH_UNARY_KERNEL(sign);
 
+TORCH_XPU_API FOREACH_UNARY_INPLACE_KERNEL(rsqrt);
+TORCH_XPU_API FOREACH_UNARY_KERNEL(rsqrt);
+
 TORCH_XPU_API FOREACH_UNARY_INPLACE_KERNEL(neg);
 TORCH_XPU_API FOREACH_UNARY_KERNEL(neg);
 
diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py
index 5bd989848..02b9e69d9 100644
--- a/test/xpu/skip_list_common.py
+++ b/test/xpu/skip_list_common.py
@@ -679,6 +679,10 @@
         # FATAL: Unexpected page fault from GPU at 0x0, ctx_id: 1 (CCS) type: 0 (NotPresent), level: 3 (PML4), access: 0 (Read), banned: 1, aborting.
         # FATAL: Unexpected page fault from GPU at 0x0, ctx_id: 1 (CCS) type: 0 (NotPresent), level: 3 (PML4), access: 0 (Read), banned: 1, aborting.
         "test_dtypes__refs_nn_functional_pdist_xpu",
+
+        # 2025 bundle std::pow complex result is different on host and device
+        "test_python_ref__refs_square_xpu_complex64",
+        "test_python_ref_torch_fallback__refs_square_xpu_complex64",
     ),
 
     "test_binary_ufuncs_xpu.py": (
@@ -1245,6 +1249,15 @@
         # Compiler issue in handling tanh with real or imag inf.
         # https://github.com/intel/torch-xpu-ops/issues/184, https://jira.devtools.intel.com/browse/CMPLRLIBS-34974
         "test_reference_numerics_large__refs_tanh_xpu_complex32",
+
+        # 2025 bundle std::pow complex result is different on host and device
+        "test_exp_xpu_complex64",
+        "test_reference_numerics_extremal__refs_exp2_xpu_complex64",
+        "test_reference_numerics_extremal__refs_exp_xpu_complex64",
+        "test_reference_numerics_extremal_exp2_xpu_complex64",
+        "test_reference_numerics_extremal_exp_xpu_complex64",
+        "test_reference_numerics_large__refs_exp_xpu_complex32",
+        "test_reference_numerics_large_exp_xpu_complex32",
     ),
 
     "test_masked_xpu.py": (
diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml
index 64e94073b..6b0a1221d 100644
--- a/yaml/native/native_functions.yaml
+++ b/yaml/native/native_functions.yaml
@@ -3094,6 +3094,29 @@
   dispatch:
     CompositeExplicitAutograd: _foreach_copy
 
+- func: _foreach_max(Tensor[] self) -> Tensor[]
+  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: foreach_tensor_max_slow
+    XPU: foreach_tensor_max_xpu
+  autogen: _foreach_max.out
+
+- func: _foreach_rsqrt(Tensor[] self) -> Tensor[]
+  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: foreach_tensor_rsqrt_slow
+    XPU: foreach_tensor_rsqrt_xpu
+
+- func: _foreach_rsqrt_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: foreach_tensor_rsqrt_slow_
+    XPU: foreach_tensor_rsqrt_xpu_
+  autogen: _foreach_rsqrt.out
+
 - func: kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
   variants: function, method
   dispatch:
@@ -3688,6 +3711,22 @@
     XPU: foreach_tensor_lerp_list_xpu_
   autogen: _foreach_lerp.Scalar_out
 
+- func: _foreach_lerp.ScalarList(Tensor[] self, Tensor[] tensors1, Scalar[] weight) -> Tensor[]
+  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: foreach_tensor_lerp_scalarlist_kernel_slow
+    XPU: foreach_tensor_lerp_scalarlist_xpu
+  autogen: _foreach_lerp.ScalarList_out
+
+- func: _foreach_lerp_.ScalarList(Tensor(a!)[] self, Tensor[] tensors1, Scalar[] weight) -> ()
+  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: foreach_tensor_lerp_scalarlist_kernel_slow_
+    XPU: foreach_tensor_lerp_scalarlist_xpu_
+  autogen: _foreach_lerp.ScalarList_out
+
 - func: acos(Tensor self) -> Tensor
   device_check: NoCheck # TensorIterator
   variants: function, method
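
For reviewers who want to exercise the newly registered kernels from Python, the sketch below is a minimal, hypothetical usage example and is not part of the patch itself. It assumes a PyTorch build where torch-xpu-ops is loaded, that `torch.xpu.is_available()` reports a device, and that the private `torch._foreach_*` bindings are exposed the same way they are upstream; shapes and weights are arbitrary.

import torch

# Hypothetical smoke test for the ops added in this patch.
device = "xpu" if torch.xpu.is_available() else "cpu"
xs = [torch.rand(8, device=device) for _ in range(3)]
ys = [torch.rand(8, device=device) for _ in range(3)]

maxes = torch._foreach_max(xs)        # one 0-dim tensor per input tensor
rsqrts = torch._foreach_rsqrt(xs)     # out-of-place 1/sqrt(x) per tensor
torch._foreach_rsqrt_(xs)             # in-place variant

# ScalarList overload of lerp: one scalar weight per tensor pair.
lerped = torch._foreach_lerp(xs, ys, [0.1, 0.5, 0.9])
torch._foreach_lerp_(xs, ys, [0.1, 0.5, 0.9])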