foreach: Format codes for slow path (#948)
1. Take forward declarations from PyTorch's exposed headers instead of
   rewriting them.
2. Reformat code to follow the coding style.

Signed-off-by: Feng Yuan <[email protected]>
fengyuan14 authored Oct 15, 2024
1 parent 76ee14f commit 8e7d7eb
Showing 13 changed files with 42 additions and 205 deletions.
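
The core of change 1, using the mul/list variant from the first file below as an example: each XPU source file used to re-declare the CPU slow-path functions by hand, and those copies had to track PyTorch's signatures manually. The commit deletes the copies and takes the declarations from PyTorch's generated per-operator headers instead. A minimal sketch (the declaration is copied from the removed code; the include is the one this diff adds):

// Before: hand-written forward declaration, duplicated per .cpp file.
::std::vector<at::Tensor> foreach_tensor_mul_list_kernel_slow(
    at::TensorList self,
    at::TensorList other);

// After: the same declaration comes from the generated native header.
#include <ATen/ops/_foreach_mul_native.h>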
62 changes: 7 additions & 55 deletions src/ATen/native/xpu/ForeachOpList.cpp
@@ -1,4 +1,11 @@
 #include <ATen/native/ForeachUtils.h>
+#include <ATen/ops/_foreach_add_native.h>
+#include <ATen/ops/_foreach_addcdiv_native.h>
+#include <ATen/ops/_foreach_addcmul_native.h>
+#include <ATen/ops/_foreach_div_native.h>
+#include <ATen/ops/_foreach_lerp_native.h>
+#include <ATen/ops/_foreach_mul_native.h>
+
 #include <ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h>
 #include <ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h>
 #include <ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h>
@@ -8,29 +15,6 @@
 namespace at {
 namespace native {
 
-::std::vector<at::Tensor> foreach_tensor_mul_list_kernel_slow(
-    at::TensorList self,
-    at::TensorList other);
-void foreach_tensor_mul_list_kernel_slow_(
-    at::TensorList self,
-    at::TensorList other);
-
-::std::vector<at::Tensor> foreach_tensor_div_list_kernel_slow(
-    at::TensorList self,
-    at::TensorList other);
-void foreach_tensor_div_list_kernel_slow_(
-    at::TensorList self,
-    at::TensorList other);
-
-::std::vector<at::Tensor> foreach_tensor_add_list_kernel_slow(
-    at::TensorList self,
-    at::TensorList other,
-    const at::Scalar& alpha);
-void foreach_tensor_add_list_kernel_slow_(
-    at::TensorList self,
-    at::TensorList other,
-    const at::Scalar& alpha);
-
 #define FOREACH_BINARY_OP_LIST(NAME, DIVISION_OP) \
   void foreach_tensor_##NAME##_list_kernel_xpu_( \
       TensorList tensors1, TensorList tensors2) { \
@@ -81,28 +65,6 @@ FOREACH_BINARY_OP_LIST_ALPHA(add);
 FOREACH_BINARY_OP_LIST(mul, false);
 FOREACH_BINARY_OP_LIST(div, true);
 
-::std::vector<at::Tensor> foreach_tensor_addcmul_scalarlist_slow(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    at::ArrayRef<at::Scalar> scalars);
-void foreach_tensor_addcmul_scalarlist_slow_(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    at::ArrayRef<at::Scalar> scalars);
-
-::std::vector<at::Tensor> foreach_tensor_addcdiv_scalarlist_slow(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    at::ArrayRef<at::Scalar> scalars);
-void foreach_tensor_addcdiv_scalarlist_slow_(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    at::ArrayRef<at::Scalar> scalars);
-
 #define FOREACH_POINTWISE_OP_TENSOR(NAME) \
   std::vector<Tensor> foreach_tensor_##NAME##_list_kernel_xpu( \
       TensorList input, \
@@ -142,11 +104,6 @@ void foreach_tensor_addcdiv_scalarlist_slow_(
 FOREACH_POINTWISE_OP_TENSOR(addcmul)
 FOREACH_POINTWISE_OP_TENSOR(addcdiv)
 
-::std::vector<at::Tensor> foreach_tensor_ternary_lerp_slow(
-    at::TensorList self,
-    at::TensorList tensors1,
-    at::TensorList weights);
-
 std::vector<at::Tensor> foreach_tensor_lerp_ternary_xpu(
     TensorList tensors1,
     TensorList tensors2,
@@ -166,11 +123,6 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_xpu(
   return vec_res;
 }
 
-void foreach_tensor_ternary_lerp_slow_(
-    at::TensorList self,
-    at::TensorList tensors1,
-    at::TensorList weights);
-
 void foreach_tensor_lerp_ternary_xpu_(
     TensorList tensors1,
     TensorList tensors2,
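The macro bodies are truncated in this view. As orientation, here is a hypothetical expansion of FOREACH_BINARY_OP_LIST(mul, false), a sketch of the usual ATen foreach dispatch shape (validate the lists, try the fused fast route, otherwise fall back to the slow path). check_foreach_api_restrictions and can_use_fast_route come from ATen/native/ForeachUtils.h; the final kernel call is an assumed name, not necessarily the actual torch-xpu-ops symbol.

// Hypothetical expansion of FOREACH_BINARY_OP_LIST(mul, false); a sketch
// only, since the real macro body is not shown in this diff.
void foreach_tensor_mul_list_kernel_xpu_(
    TensorList tensors1,
    TensorList tensors2) {
  check_foreach_api_restrictions(tensors1, tensors2);
  // DIVISION_OP is false: mul does not promote integer inputs to float.
  if (!can_use_fast_route(
          tensors1,
          tensors2,
          /*does_op_promote_integer_inputs_to_float=*/false)) {
    // Mixed dtypes or devices disqualify the fused multi-tensor kernel, so
    // reuse the reference slow path, now declared in
    // <ATen/ops/_foreach_mul_native.h>.
    return foreach_tensor_mul_list_kernel_slow_(tensors1, tensors2);
  }
  xpu::foreach_binary_op_list_kernel_(tensors1, tensors2); // assumed name
}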
59 changes: 6 additions & 53 deletions src/ATen/native/xpu/ForeachOpScalar.cpp
@@ -1,34 +1,18 @@
 #include <ATen/native/ForeachUtils.h>
+#include <ATen/ops/_foreach_add_native.h>
+#include <ATen/ops/_foreach_addcdiv_native.h>
+#include <ATen/ops/_foreach_addcmul_native.h>
+#include <ATen/ops/_foreach_div_native.h>
+#include <ATen/ops/_foreach_lerp_native.h>
+#include <ATen/ops/_foreach_mul_native.h>
 
 #include <ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h>
 #include <ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h>
 #include <ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h>
 
 namespace at {
 
 namespace native {
-
-::std::vector<at::Tensor> foreach_tensor_add_scalar_kernel_slow(
-    at::TensorList self,
-    const at::Scalar& scalar);
-void foreach_tensor_add_scalar_kernel_slow_(
-    at::TensorList self,
-    const at::Scalar& scalar);
-
-::std::vector<at::Tensor> foreach_tensor_mul_scalar_kernel_slow(
-    at::TensorList self,
-    const at::Scalar& scalar);
-void foreach_tensor_mul_scalar_kernel_slow_(
-    at::TensorList self,
-    const at::Scalar& scalar);
-
-::std::vector<at::Tensor> foreach_tensor_div_scalar_kernel_slow(
-    at::TensorList self,
-    const at::Scalar& scalar);
-void foreach_tensor_div_scalar_kernel_slow_(
-    at::TensorList self,
-    const at::Scalar& scalar);
-
 #define FOREACH_BINARY_OP_SCALAR(NAME, DIV_OP) \
   void foreach_tensor_##NAME##_scalar_kernel_xpu_( \
       TensorList tensors, const Scalar& scalar) { \
@@ -54,28 +38,6 @@ FOREACH_BINARY_OP_SCALAR(add, /*div_op*/ false);
 FOREACH_BINARY_OP_SCALAR(mul, /*div_op*/ false);
 FOREACH_BINARY_OP_SCALAR(div, /*div_op*/ true);
 
-::std::vector<at::Tensor> foreach_tensor_addcmul_scalar_slow(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    const at::Scalar& value);
-void foreach_tensor_addcmul_scalar_slow_(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    const at::Scalar& value);
-
-::std::vector<at::Tensor> foreach_tensor_addcdiv_scalar_slow(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    const at::Scalar& value);
-void foreach_tensor_addcdiv_scalar_slow_(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    const at::Scalar& value);
-
 #define FOREACH_POINTWISE_OP_SCALAR(NAME) \
   std::vector<Tensor> foreach_tensor_##NAME##_scalar_xpu( \
       TensorList input, \
@@ -112,15 +74,6 @@ void foreach_tensor_addcdiv_scalar_slow_(
 FOREACH_POINTWISE_OP_SCALAR(addcmul)
 FOREACH_POINTWISE_OP_SCALAR(addcdiv)
 
-::std::vector<at::Tensor> foreach_tensor_lerp_list_kernel_slow(
-    at::TensorList self,
-    at::TensorList tensors1,
-    const at::Scalar& weight);
-void foreach_tensor_lerp_list_kernel_slow_(
-    at::TensorList self,
-    at::TensorList tensors1,
-    const at::Scalar& weight);
-
 std::vector<at::Tensor> foreach_tensor_lerp_list_xpu(
     TensorList tensors1,
     TensorList tensors2,
84 changes: 5 additions & 79 deletions src/ATen/native/xpu/ForeachOpScalarList.cpp
@@ -1,4 +1,9 @@
 #include <ATen/native/ForeachUtils.h>
+#include <ATen/ops/_foreach_add_native.h>
+#include <ATen/ops/_foreach_addcdiv_native.h>
+#include <ATen/ops/_foreach_addcmul_native.h>
+#include <ATen/ops/_foreach_div_native.h>
+#include <ATen/ops/_foreach_mul_native.h>
 
 #include <ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h>
 #include <ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h>
@@ -8,44 +13,6 @@
 
 namespace at {
 namespace native {
-::std::vector<at::Tensor> foreach_tensor_add_scalar_kernel_slow(
-    at::TensorList self,
-    const at::Scalar& scalar);
-void foreach_tensor_add_scalar_kernel_slow_(
-    at::TensorList self,
-    const at::Scalar& scalar);
-::std::vector<at::Tensor> foreach_tensor_mul_scalar_kernel_slow(
-    at::TensorList self,
-    const at::Scalar& scalar);
-void foreach_tensor_mul_scalar_kernel_slow_(
-    at::TensorList self,
-    const at::Scalar& scalar);
-
-::std::vector<at::Tensor> foreach_tensor_add_scalarlist_kernel_slow(
-    at::TensorList self,
-    at::ArrayRef<at::Scalar> scalars);
-void foreach_tensor_add_scalarlist_kernel_slow_(
-    at::TensorList self,
-    at::ArrayRef<at::Scalar> scalars);
-::std::vector<at::Tensor> foreach_tensor_mul_scalarlist_kernel_slow(
-    at::TensorList self,
-    at::ArrayRef<at::Scalar> scalars);
-void foreach_tensor_mul_scalarlist_kernel_slow_(
-    at::TensorList self,
-    at::ArrayRef<at::Scalar> scalars);
-
-::std::vector<at::Tensor> foreach_tensor_div_scalar_kernel_slow(
-    at::TensorList self,
-    const at::Scalar& scalar);
-void foreach_tensor_div_scalar_kernel_slow_(
-    at::TensorList self,
-    const at::Scalar& scalar);
-::std::vector<at::Tensor> foreach_tensor_div_scalarlist_kernel_slow(
-    at::TensorList self,
-    at::ArrayRef<at::Scalar> scalars);
-void foreach_tensor_div_scalarlist_kernel_slow_(
-    at::TensorList self,
-    at::ArrayRef<at::Scalar> scalars);
 
 #define FOREACH_BINARY_OP_SCALARLIST(NAME, DIV_OP) \
   void foreach_tensor_##NAME##_scalar_kernel_xpu_( \
@@ -74,47 +41,6 @@ FOREACH_BINARY_OP_SCALARLIST(add, /*div_op*/ false);
 FOREACH_BINARY_OP_SCALARLIST(mul, /*div_op*/ false);
 FOREACH_BINARY_OP_SCALARLIST(div, /*div_op*/ true);
 
-void foreach_tensor_addcmul_scalar_slow_(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    const at::Scalar& value = 1);
-::std::vector<at::Tensor> foreach_tensor_addcmul_scalar_slow(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    const at::Scalar& value = 1);
-::std::vector<at::Tensor> foreach_tensor_addcmul_scalarlist_slow(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    at::ArrayRef<at::Scalar> scalars);
-void foreach_tensor_addcmul_scalarlist_slow_(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    at::ArrayRef<at::Scalar> scalars);
-void foreach_tensor_addcdiv_scalar_slow_(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    const at::Scalar& value = 1);
-::std::vector<at::Tensor> foreach_tensor_addcdiv_scalar_slow(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    const at::Scalar& value = 1);
-::std::vector<at::Tensor> foreach_tensor_addcdiv_scalarlist_slow(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    at::ArrayRef<at::Scalar> scalars);
-void foreach_tensor_addcdiv_scalarlist_slow_(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    at::ArrayRef<at::Scalar> scalars);
-
 #define FOREACH_POINTWISE_OP_SCALARLIST(NAME) \
   std::vector<Tensor> foreach_tensor_##NAME##_scalarlist_xpu( \
       TensorList input, \
5 changes: 2 additions & 3 deletions src/ATen/native/xpu/ForeachUnaryOp.cpp
@@ -1,14 +1,13 @@
 #include <ATen/native/ForeachUtils.h>
+#include <ATen/ops/_foreach_sqrt_native.h>
+
 #include <ATen/native/xpu/sycl/ForeachUnaryKernels.h>
 
 namespace at {
 namespace native {
 // given a functor and a "dispatch function", creates the outplace and inplace
 // operations
 
-::std::vector<at::Tensor> foreach_tensor_sqrt_slow(at::TensorList self);
-void foreach_tensor_sqrt_slow_(at::TensorList self);
-
 #define FOREACH_UNARY_OP(op_name) \
   std::vector<Tensor> foreach_tensor_##op_name##_xpu(TensorList tensors) { \
     check_foreach_api_restrictions(tensors); \
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/Nonzero.cpp
@@ -5,7 +5,7 @@
 #include <ATen/native/xpu/sycl/OffsetCalculator.h>
 
 namespace at {
-namespace native{
+namespace native {
 Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) {
   TORCH_CHECK(
       self.numel() < std::numeric_limits<int>::max(),
@@ -38,5 +38,5 @@ Tensor nonzero_xpu(const Tensor& self) {
   nonzero_out_xpu(self, out);
   return out;
 }
-}
+} // namespace native
 } // namespace at
6 changes: 4 additions & 2 deletions src/ATen/native/xpu/RangeFactories.cpp
@@ -31,7 +31,8 @@ Tensor& arange_out_xpu(
 
   TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
   TORCH_CHECK(
-      std::isfinite(static_cast<double>(xstart)) && std::isfinite(static_cast<double>(xend)),
+      std::isfinite(static_cast<double>(xstart)) &&
+          std::isfinite(static_cast<double>(xend)),
       "unsupported range: ",
       xstart,
       " -> ",
@@ -99,7 +100,8 @@ Tensor& range_xpu_out(
 
   TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
   TORCH_CHECK(
-      std::isfinite(static_cast<double>(xstart)) && std::isfinite(static_cast<double>(xend)),
+      std::isfinite(static_cast<double>(xstart)) &&
+          std::isfinite(static_cast<double>(xend)),
       "unsupported range: ",
       xstart,
       " -> ",
1 change: 0 additions & 1 deletion src/ATen/native/xpu/sycl/ActivationGluKernels.cpp
@@ -2,7 +2,6 @@
 #include <ATen/OpMathType.h>
 #include <ATen/TensorIterator.h>
 
-#include <ATen/native/xpu/sycl/Loops.h>
 #include <ATen/native/xpu/sycl/Loops.h>
 #include <comm/SYCLContext.h>
 
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp
@@ -5,8 +5,8 @@
 #include <ATen/native/xpu/sycl/ForeachFunctors.h>
 #include <ATen/native/xpu/sycl/MultiTensorApply.h>
 
-#include <ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h>
 #include <ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h>
+#include <ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h>
 
 namespace at::native::xpu {
 
8 changes: 6 additions & 2 deletions src/ATen/native/xpu/sycl/PointwiseOpsKernels.h
@@ -4,9 +4,13 @@
 
 namespace at::native::xpu {
 
-TORCH_XPU_API void addcmul_kernel(TensorIteratorBase& iter, const Scalar& value);
+TORCH_XPU_API void addcmul_kernel(
+    TensorIteratorBase& iter,
+    const Scalar& value);
 
-TORCH_XPU_API void addcdiv_kernel(TensorIteratorBase& iter, const Scalar& value);
+TORCH_XPU_API void addcdiv_kernel(
+    TensorIteratorBase& iter,
+    const Scalar& value);
 
 TORCH_XPU_API void mse_backward_kernel(
     TensorIterator& iter,
3 changes: 2 additions & 1 deletion src/ATen/native/xpu/sycl/PowKernels.cpp
@@ -38,7 +38,8 @@ static inline c10::complex<T> pow_(c10::complex<T> base, c10::complex<T> exp) {
 } // namespace impl
 
 #ifdef _MSC_VER
-// Divergence for MSVC due to accuracy issue. https://github.com/intel/torch-xpu-ops/issues/842.
+// Divergence for MSVC due to accuracy issue.
+// https://github.com/intel/torch-xpu-ops/issues/842.
 template <typename scalar_t>
 struct PowTensorTensorCastFunctor {
   using opmath_t = at::opmath_type<scalar_t>;
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/ReduceNormKernel.cpp
@@ -1,8 +1,8 @@
 #include <ATen/Dispatch.h>
 
 #include <ATen/native/xpu/sycl/Reduce.h>
-#include <ATen/ops/imag.h>
 #include <ATen/native/xpu/sycl/SharedReduceOps.h>
+#include <ATen/ops/imag.h>
 
 #include <ATen/native/xpu/sycl/ReduceNormKernel.h>
 
(2 of the 13 changed files are not shown in this view.)
