diff --git a/src/ATen/native/xpu/ForeachOpList.cpp b/src/ATen/native/xpu/ForeachOpList.cpp
index 0ce7c9792..e7c25eb5c 100644
--- a/src/ATen/native/xpu/ForeachOpList.cpp
+++ b/src/ATen/native/xpu/ForeachOpList.cpp
@@ -7,11 +7,14 @@
 #include <ATen/ops/_foreach_add_native.h>
 #include <ATen/ops/_foreach_addcdiv_native.h>
 #include <ATen/ops/_foreach_addcmul_native.h>
+#include <ATen/ops/_foreach_copy.h>
+#include <ATen/ops/_foreach_copy_native.h>
 #include <ATen/ops/_foreach_div_native.h>
 #include <ATen/ops/_foreach_lerp_native.h>
 #include <ATen/ops/_foreach_mul_native.h>
 
 #include <ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h>
+#include <ATen/native/xpu/sycl/ForeachCopyKernels.h>
 #include <ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h>
 
 
@@ -147,5 +150,24 @@ void foreach_tensor_lerp_ternary_xpu_(
   }
 }
 
+void foreach_tensor_copy_list_kernel_xpu_(
+    TensorList self,
+    TensorList src,
+    bool non_blocking) {
+  check_foreach_api_restrictions(self, src);
+  if (!can_use_fast_route(
+          self, src, /* does_op_promote_integer_inputs_to_float */ false)) {
+    return foreach_tensor_copy_list_kernel_slow_(
+        self, src, non_blocking);
+  }
+
+  xpu::foreach_copy_list_kernel_(self, src);
+
+  // increment_version: the fused kernel bypasses the dispatcher's automatic bump
+  for (const auto& t : self) {
+    t.unsafeGetTensorImpl()->bump_version();
+  }
+}
+
 } // namespace native
 } // namespace at
diff --git a/src/ATen/native/xpu/sycl/ForeachCopyKernels.cpp b/src/ATen/native/xpu/sycl/ForeachCopyKernels.cpp
new file mode 100644
index 000000000..acc29f08f
--- /dev/null
+++ b/src/ATen/native/xpu/sycl/ForeachCopyKernels.cpp
@@ -0,0 +1,42 @@
+#include <ATen/Dispatch.h>
+#include <ATen/native/ForeachUtils.h>
+
+#include <ATen/native/xpu/sycl/ForeachFunctors.h>
+#include <ATen/native/xpu/sycl/MultiTensorApply.h>
+
+#include <ATen/native/xpu/sycl/ForeachCopyKernels.h>
+
+
+namespace at::native::xpu {
+template <typename T>
+struct Identity {
+  T operator()(const T& x) {
+    return x;
+  }
+};
+
+void foreach_copy_list_kernel_(
+    TensorList self,
+    TensorList src) {
+  std::vector<std::vector<at::Tensor>> tensor_lists{src.vec(), self.vec()};
+
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      at::ScalarType::Bool,
+      self[0].scalar_type(),
+      "foreach_tensor_copy",
+      [&]() {
+        using opmath_t = at::opmath_type<scalar_t>;
+        multi_tensor_apply<2>(
+            tensor_lists,
+            UnaryOpFunctor<
+                scalar_t,
+                /* depth */ 2,
+                /* r_args_depth */ 1,
+                /* res_arg_index */ 1>(),
+            Identity<opmath_t>());
+      });
+}
+
+} // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/ForeachCopyKernels.h b/src/ATen/native/xpu/sycl/ForeachCopyKernels.h
new file mode 100644
index 000000000..92c127607
--- /dev/null
+++ b/src/ATen/native/xpu/sycl/ForeachCopyKernels.h
@@ -0,0 +1,10 @@
+#pragma once
+#include <ATen/ATen.h>
+
+namespace at::native::xpu {
+
+TORCH_XPU_API void foreach_copy_list_kernel_(
+    TensorList self,
+    TensorList src);
+
+} // namespace at::native::xpu
diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml
index c79bbf899..1b5d5d01c 100644
--- a/yaml/native/native_functions.yaml
+++ b/yaml/native/native_functions.yaml
@@ -2749,6 +2749,20 @@
     XPU: foreach_tensor_zero_xpu_
   autogen: _foreach_zero, _foreach_zero.out
 
+- func: _foreach_copy_(Tensor(a!)[] self, Tensor[] src, bool non_blocking=False) -> ()
+  device_check: NoCheck   # foreach kernels fall back to the slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: foreach_tensor_copy_list_kernel_slow_
+    XPU: foreach_tensor_copy_list_kernel_xpu_
+  autogen: _foreach_copy.out
+
+- func: _foreach_copy(Tensor[] self, Tensor[] src, bool non_blocking=False) -> Tensor[] self_out
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: _foreach_copy
+
 - func: native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor)
   dispatch:
     XPU: layer_norm_xpu
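
Reviewer note: a minimal smoke-test sketch for the new op, not part of the patch itself. It assumes an XPU-enabled libtorch build; `main`, the shapes, and the `opts` alias are illustrative only. `at::_foreach_copy_` is the generated ATen entry point that the yaml entry above routes to `foreach_tensor_copy_list_kernel_xpu_` for XPU tensors.

```cpp
// Hypothetical smoke test (not part of this patch).
// Assumes an XPU-enabled libtorch build; on mismatched inputs the op
// falls back to foreach_tensor_copy_list_kernel_slow_.
#include <ATen/ATen.h>
#include <iostream>

int main() {
  const auto opts = at::device(at::kXPU).dtype(at::kFloat);
  std::vector<at::Tensor> dst{at::zeros({4}, opts), at::zeros({2, 3}, opts)};
  std::vector<at::Tensor> src{at::ones({4}, opts), at::ones({2, 3}, opts)};

  // In-place list copy: equivalent to dst[i].copy_(src[i]) for every i,
  // but dispatched as fused multi-tensor kernels on the fast route.
  at::_foreach_copy_(dst, src, /*non_blocking=*/false);

  std::cout << dst[0] << std::endl;  // expect all ones
  return 0;
}
```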
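On the explicit `bump_version` loop in `foreach_tensor_copy_list_kernel_xpu_`: the fused kernel writes through raw `TensorImpl` storage, so the dispatcher never records the in-place mutation, and autograd's saved-variable checks would otherwise miss it. The sketch below uses the existing c10 accessors `version_counter()` and `current_version()` to show the observable effect; the CPU tensors and `main` are illustrative (on CPU the Composite slow path bumps the version through the dispatcher instead).

```cpp
// Sketch: version counters are how autograd detects in-place mutation.
#include <ATen/ATen.h>
#include <iostream>

int main() {
  auto t = at::zeros({3});  // CPU exercises the Composite slow path
  const auto before =
      t.unsafeGetTensorImpl()->version_counter().current_version();
  at::_foreach_copy_({t}, {at::ones({3})}, /*non_blocking=*/false);
  const auto after =
      t.unsafeGetTensorImpl()->version_counter().current_version();
  std::cout << before << " -> " << after << std::endl;  // version increased
  return 0;
}
```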