Skip to content

Commit

Permalink
Add aten::range (#553)
Browse files Browse the repository at this point in the history
Co-authored-by: Feng Yuan <[email protected]>
  • Loading branch information
chunhuanMeng and fengyuan14 authored Jul 21, 2024
1 parent caa8a7c commit 6feb2e2
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 56 deletions.
96 changes: 96 additions & 0 deletions src/ATen/native/xpu/RangeFactories.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
#include <ATen/ExpandUtils.h>
#include <ATen/ScalarOps.h>
#include <ATen/core/Tensor.h>
Expand All @@ -14,7 +16,101 @@ Tensor& XPUNativeFunctions::arange_out(
const Scalar& end,
const Scalar& step,
Tensor& out) {
AT_DISPATCH_ALL_TYPES_AND2(
at::ScalarType::Half,
at::ScalarType::BFloat16,
out.scalar_type(),
"arange_xpu_preprocess",
[&]() {
using accscalar_t = at::acc_type<scalar_t, true>;
auto xstart = start.to<accscalar_t>();
auto xend = end.to<accscalar_t>();
auto xstep = step.to<accscalar_t>();

TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
TORCH_CHECK(
std::isfinite(xstart) && std::isfinite(xend),
"unsupported range: ",
xstart,
" -> ",
xend);
TORCH_CHECK(
((xstep > 0) && (xend >= xstart)) ||
((xstep < 0) && (xend <= xstart)),
"upper bound and larger bound inconsistent with step sign");

// We use double precision for (end - start) / step
// to compute size_d for consistency across devices.
// The problem with using accscalar_t is that accscalar_t might be
// float32 on GPU for a float32 scalar_t, but double on CPU for the
// same, and the effective output size then differs between CPU and GPU
// because of precision issues, which we don't want. The corner case we
// do want to take into account is int64_t, which has higher precision
// than double.
double size_d;
if constexpr (std::is_same_v<scalar_t, int64_t>) {
int64_t sgn = (xstep > 0) - (xstep < 0);
size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
} else {
size_d = std::ceil(
static_cast<double>(end.to<double>() - start.to<double>()) /
step.to<double>());
}

TORCH_CHECK(
size_d >= 0 &&
size_d <=
static_cast<double>(std::numeric_limits<int64_t>::max()),
"invalid size, possible overflow?");
int64_t size = static_cast<int64_t>(size_d);
int64_t numel = out.numel();

if (numel != size) {
if (numel > 0) {
TORCH_WARN(
"The number of elements in the out tensor of shape ",
out.sizes(),
" is ",
numel,
" which does not match the computed number of elements ",
size,
". Note that this may occur as a result of rounding error. "
"The out tensor will be resized to a tensor of shape (",
size,
",).");
}
out.resize_({size});
}
});

return at::native::xpu::arange_kernel(start, end, step, out);
}

// Validates the (start, end, step) triple, sizes `out` for the closed interval
// [start, end] walked in increments of `step`, and then delegates the actual
// fill to at::native::xpu::range_kernel.
//
// The element count is trunc((end - start) / step + 1), computed in double.
//
// Raises (via TORCH_CHECK) when:
//   - step is zero or NaN,
//   - start or end is not finite,
//   - the sign of step disagrees with the direction from start to end,
//   - the computed element count is NaN or does not fit in int64_t.
//
// Returns whatever range_kernel returns for `out`.
Tensor& XPUNativeFunctions::range_out(
    const Scalar& start,
    const Scalar& end,
    const Scalar& step,
    Tensor& out) {
  const auto xstart = start.to<double>();
  const auto xend = end.to<double>();
  const auto xstep = step.to<double>();

  // Two comparisons instead of `!= 0` so that a NaN step is rejected too.
  TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
  TORCH_CHECK(
      std::isfinite(xstart) && std::isfinite(xend),
      "unsupported range: ",
      xstart,
      " -> ",
      xend);
  TORCH_CHECK(
      ((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
      "upper bound and larger bound inconsistent with step sign");

  // Keep the count in double until it has been range-checked: converting an
  // out-of-range (or NaN) double to int64_t is undefined behaviour. That can
  // happen for a very small step, or when (xend - xstart) itself overflows.
  // The upper comparison is strict because int64_t's max rounds up to 2^63
  // when converted to double, and 2^63 is not representable in int64_t.
  const double size_d = ((xend - xstart) / xstep) + 1;
  TORCH_CHECK(
      size_d >= 0 &&
          size_d < static_cast<double>(std::numeric_limits<int64_t>::max()),
      "invalid size, possible overflow?");
  const int64_t size = static_cast<int64_t>(size_d);

  if (out.numel() != size) {
    out.resize_({size});
  }

  return at::native::xpu::range_kernel(start, end, step, out);
}

} // namespace at
98 changes: 42 additions & 56 deletions src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,64 +83,8 @@ Tensor& arange_kernel(
[&]() {
using accscalar_t = at::acc_type<scalar_t, true>;
auto xstart = start.to<accscalar_t>();
auto xend = end.to<accscalar_t>();
auto xstep = step.to<accscalar_t>();

TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
TORCH_CHECK(
std::isfinite(static_cast<double>(xstart)) &&
std::isfinite(static_cast<double>(xend)),
"unsupported range: ",
xstart,
" -> ",
xend);
TORCH_CHECK(
((xstep > 0) && (xend >= xstart)) ||
((xstep < 0) && (xend <= xstart)),
"upper bound and larger bound inconsistent with step sign");

// we use double precision for (start - end) / step
// to compute size_d for consistency across devices.
// The problem with using accscalar_t is that accscalar_t might be
// float32 on gpu for a float32 scalar_t, but double on cpu for the
// same, and the effective output size starts differing on CPU vs GPU
// because of precision issues, which we dont want. the corner-case we
// do want to take into account is int64_t, which has higher precision
// than double
double size_d;
if constexpr (std::is_same_v<scalar_t, int64_t>) {
int64_t sgn = (xstep > 0) - (xstep < 0);
size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
} else {
size_d = std::ceil(
static_cast<double>(end.to<double>() - start.to<double>()) /
step.to<double>());
}

TORCH_CHECK(
size_d >= 0 &&
size_d <=
static_cast<double>(std::numeric_limits<int64_t>::max()),
"invalid size, possible overflow?");
int64_t size = static_cast<int64_t>(size_d);
int64_t numel = result.numel();

if (numel != size) {
if (numel > 0) {
TORCH_WARN(
"The number of elements in the out tensor of shape ",
result.sizes(),
" is ",
numel,
" which does not match the computed number of elements ",
size,
". Note that this may occur as a result of rounding error. "
"The out tensor will be resized to a tensor of shape (",
size,
",).");
}
result.resize_({size});
}
bool is_contiguous = result.is_contiguous();
Tensor r = !is_contiguous
? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
Expand All @@ -157,6 +101,48 @@ Tensor& arange_kernel(
return result;
}

// Maps a linear element index to the value stored at that index:
//   value(i) = start + step * i
// The arithmetic is carried out in accscalar_t and the result is narrowed to
// scalar_t only at the very end.
template <typename scalar_t, typename accscalar_t>
struct RangeFunctor {
  // xstart: value produced for index 0.
  // xstep:  difference between values at consecutive indices.
  RangeFunctor(accscalar_t xstart, accscalar_t xstep)
      : base_(xstart), delta_(xstep) {}

  // Returns the value for element `ind`.
  scalar_t operator()(int64_t ind) const {
    const accscalar_t offset = delta_ * static_cast<accscalar_t>(ind);
    const accscalar_t value = base_ + offset;
    return static_cast<scalar_t>(value);
  }

 private:
  accscalar_t base_;
  accscalar_t delta_;
};

// Fills `result` using a RangeFunctor built from `start` and `step`
// (value at index i is start + step * i, evaluated in the accumulate type of
// the tensor's dtype) and returns `result`.
//
// `end` is part of the signature but is not read here; `result` is not
// resized by this function. Dispatch covers the standard types plus Half.
//
// A non-contiguous `result` is filled through a contiguous staging tensor
// which is then copied back into `result`.
Tensor& range_kernel(
    const Scalar& start,
    const Scalar& end,
    const Scalar& step,
    Tensor& result) {
  AT_DISPATCH_ALL_TYPES_AND(
      at::ScalarType::Half, result.scalar_type(), "range_xpu", [&]() {
        using accscalar_t = acc_type<scalar_t, true>;
        auto base = start.to<accscalar_t>();
        auto delta = step.to<accscalar_t>();

        if (result.is_contiguous()) {
          // Write straight into the caller's tensor.
          Tensor target = result;
          auto fill_op = RangeFunctor<scalar_t, accscalar_t>(base, delta);
          gpu_kernel_with_index(target, fill_op);
        } else {
          // Fill a contiguous scratch tensor, then copy it back.
          Tensor staging =
              at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
          auto fill_op = RangeFunctor<scalar_t, accscalar_t>(base, delta);
          gpu_kernel_with_index(staging, fill_op);
          result.copy_(staging);
        }
      });

  return result;
}

} // namespace xpu
} // namespace native
} // namespace at
6 changes: 6 additions & 0 deletions src/ATen/native/xpu/sycl/RangeFactoriesKernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ Tensor& arange_kernel(
const Scalar& step,
Tensor& result);

// XPU fill kernel behind range.out (defined in RangeFactoriesKernel.cpp).
//
// The definition writes start + step * index into `result` and returns
// `result`. It does not resize `result` and does not read `end`, so the caller
// is expected to have validated the arguments and sized `result` beforehand.
Tensor& range_kernel(
const Scalar& start,
const Scalar& end,
const Scalar& step,
Tensor& result);

} // namespace xpu
} // namespace native
} // namespace at
1 change: 1 addition & 0 deletions yaml/xpu_functions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -657,3 +657,4 @@ supported:
- norm.ScalarOpt_dim
- norm.out
- nan_to_num.out
- range.out

0 comments on commit 6feb2e2

Please sign in to comment.