Merge branch 'main' into majing/histogram
majing921201 authored Jul 23, 2024
2 parents 963cd91 + 6bb1633 commit 17c0315
Showing 40 changed files with 771 additions and 161 deletions.
23 changes: 23 additions & 0 deletions src/ATen/native/xpu/BinaryOps.cpp
@@ -14,6 +14,7 @@
#include <ATen/native/xpu/sycl/GcdLcmKernels.h>
#include <ATen/native/xpu/sycl/LogAddExpKernels.h>
#include <ATen/native/xpu/sycl/MaxMinElementwiseKernels.h>
#include <ATen/native/xpu/sycl/StepKernels.h>

namespace at {
Tensor XPUNativeFunctions::add(
@@ -383,6 +384,28 @@ Tensor& XPUNativeFunctions::gcd_out(
return out;
}

Tensor XPUNativeFunctions::nextafter(const Tensor& self, const Tensor& other) {
Tensor out;
auto iter = TensorIterator::borrowing_binary_op(out, self, other);
native::xpu::nextafter_kernel(iter);
return iter.output();
}

Tensor& XPUNativeFunctions::nextafter_(Tensor& self, const Tensor& other) {
auto iter = TensorIterator::borrowing_binary_op(self, self, other);
native::xpu::nextafter_kernel(iter);
return self;
}

Tensor& XPUNativeFunctions::nextafter_out(
const Tensor& self,
const Tensor& other,
Tensor& out) {
auto iter = TensorIterator::borrowing_binary_op(out, self, other);
native::xpu::nextafter_kernel(iter);
return out;
}

Tensor XPUNativeFunctions::hypot(const Tensor& self, const Tensor& other) {
Tensor out;
auto iter = TensorIterator::borrowing_binary_op(out, self, other);
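Note: the new nextafter bindings above wrap native::xpu::nextafter_kernel through TensorIterator, mirroring the existing hypot pattern. As a reminder of the op's semantics only (a standalone sketch using std::nextafter from the C standard library, not the ATen/SYCL kernel): nextafter(x, y) returns the next representable value after x in the direction of y.

#include <cmath>
#include <cstdio>

int main() {
  // Next representable float above and below 1.0f.
  float up = std::nextafterf(1.0f, 2.0f);
  float down = std::nextafterf(1.0f, 0.0f);
  std::printf("up - 1.0f   = %.9g\n", up - 1.0f);   // ~1.19e-07 (FLT_EPSILON)
  std::printf("1.0f - down = %.9g\n", 1.0f - down); // ~5.96e-08 (FLT_EPSILON / 2)
  return 0;
}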
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/RangeFactories.cpp
@@ -22,7 +22,7 @@ Tensor& XPUNativeFunctions::arange_out(
out.scalar_type(),
"arange_xpu_preprocess",
[&]() {
using accscalar_t = at::acc_type<scalar_t, true>;
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
auto xstart = start.to<accscalar_t>();
auto xend = end.to<accscalar_t>();
auto xstep = step.to<accscalar_t>();
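The recurring change in this and the following files swaps at::acc_type&lt;scalar_t, true&gt; for the device-tagged at::acc_type_device&lt;scalar_t, kXPU&gt;. Both resolve to an accumulation type: reduced-precision element types such as Half and BFloat16 are widened (typically to float) for intermediate arithmetic like the start/end/step conversion above. A minimal standalone sketch of that idea, using hypothetical names rather than the actual ATen trait:

#include <type_traits>

struct Half {};      // stand-in for at::Half
struct BFloat16 {};  // stand-in for at::BFloat16

// Map an element type to the wider type used for intermediate accumulation.
template <typename T> struct acc_type_sketch { using type = T; };
template <> struct acc_type_sketch<Half> { using type = float; };
template <> struct acc_type_sketch<BFloat16> { using type = float; };

template <typename T>
using acc_t = typename acc_type_sketch<T>::type;

static_assert(std::is_same_v<acc_t<Half>, float>, "Half accumulates in float");
static_assert(std::is_same_v<acc_t<double>, double>, "double is already wide enough");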
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/ReduceOps.cpp
@@ -987,7 +987,7 @@ Tensor& XPUNativeFunctions::norm_out(
impl_func_norm(self, p_, dim, keepdim, c10::nullopt, result);
return result;
}

TensorIterator meta_aminmax(
const Tensor& self,
std::optional<int64_t> dim_opt,
1 change: 1 addition & 0 deletions src/ATen/native/xpu/XPUFallback.template
@@ -239,6 +239,7 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
"nanmedian",
"nanmedian.dim_values",
"nansum",
"norm.out",
"nextafter.out",
"ormqr",
"_pdist_backward",
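For context, the strings in this hunk name ATen ops that the XPU backend still routes through its CPU fallback; "norm.out" is the entry added here. A hedged sketch of what per-op boxed fallback registration commonly looks like for an out-of-tree backend (the exact mechanism in XPUFallback.template may differ):

#include <ATen/native/CPUFallback.h>
#include <torch/library.h>

TORCH_LIBRARY_IMPL(aten, XPU, m) {
  // Route the op through the boxed CPU fallback until a native XPU kernel lands.
  m.impl("norm.out",
         torch::CppFunction::makeFromBoxedFunction<&at::native::cpu_fallback>());
}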
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/AveragePool2dKernels.cpp
@@ -672,7 +672,7 @@ void avg_pool2d_kernel(
if (count != 0) {
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, input.scalar_type(), "avg_pool2d_xpu", [&] {
using accscalar_t = acc_type<scalar_t, true>;
using accscalar_t = acc_type_device<scalar_t, kXPU>;

switch (memory_format) {
case MemoryFormat::ChannelsLast: {
@@ -775,7 +775,7 @@ void avg_pool2d_backward_kernel(
use_divisor ? divisor_override.value() : 0;
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, input.scalar_type(), "avg_pool2d_backward_xpu", [&] {
using accscalar_t = acc_type<scalar_t, true>;
using accscalar_t = acc_type_device<scalar_t, kXPU>;

AT_DISPATCH_INDEX_TYPES(
at::native::canUse32BitIndexMath(input, INT_MAX) ? ScalarType::Int
56 changes: 28 additions & 28 deletions src/ATen/native/xpu/sycl/BatchNormKernels.cpp
@@ -557,7 +557,7 @@ void batch_norm_stats_template(
const Tensor& out_invstd,
const Tensor& input_,
double epsilon) {
using accscalar_t = at::acc_type<scalar_t, true>;
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
int64_t n_input = input_.size(1);
Tensor dummy_mean_;
Tensor dummy_var_;
@@ -1047,7 +1047,7 @@ void batch_norm_stats_channels_last_template(
Tensor& out_invstd,
const Tensor& input,
double epsilon) {
using accscalar_t = acc_type<scalar_t, true>;
using accscalar_t = acc_type_device<scalar_t, kXPU>;

const auto stride = input.sizes()[1];
const auto reduction_size = input.numel() / stride;
@@ -1261,7 +1261,7 @@ void batch_norm_elemt_template(
const Tensor& bias_,
const Tensor& mean_,
const Tensor& invstd_) {
using stat_accscalar_t = acc_type<stat_scalar_t, true>;
using stat_accscalar_t = acc_type_device<stat_scalar_t, kXPU>;
auto input_reshaped = input_.reshape(
{input_.size(0),
input_.size(1),
@@ -1430,7 +1430,7 @@ void batch_norm_elemt_channels_last_template(
if (input.scalar_type() != second_dtype) {
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, input.scalar_type(), "batchnorm_forward_xpu", [&] {
using accscalar_t = at::acc_type<scalar_t, true>;
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
auto kfn = BatchNormTransformInputChannelsLastKernelFunctor<
scalar_t,
accscalar_t,
@@ -1459,7 +1459,7 @@ void batch_norm_elemt_channels_last_template(
}
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, input.scalar_type(), "batchnorm_forward_xpu", [&] {
using accscalar_t = at::acc_type<scalar_t, true>;
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
auto kfn = BatchNormTransformInputChannelsLastKernelFunctor<
scalar_t,
accscalar_t,
@@ -1512,7 +1512,7 @@ void batch_norm_elemt_kernel(
self.scalar_type(),
"batch_norm_elementwise_xpu",
[&] {
using accscalar_t = acc_type<scalar_t, true>;
using accscalar_t = acc_type_device<scalar_t, kXPU>;
const bool mixed_type = is_mixed_type(self, *weight, *bias);
if (mixed_type) {
batch_norm_elemt_template<scalar_t, accscalar_t, int32_t>(
@@ -1581,7 +1581,7 @@ void batch_norm_elemt_kernel(
self.scalar_type(),
"batch_norm_elementwise_xpu",
[&] {
using acc_t = acc_type<scalar_t, true>;
using acc_t = acc_type_device<scalar_t, kXPU>;
auto f = BatchNormElementwiseLoopsFunctor<scalar_t, acc_t>();
gpu_kernel(iter, f);
});
@@ -1731,7 +1731,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> batch_norm_backward_reduce_template(
const bool input_g,
const bool weight_g,
const bool bias_g) {
using stat_accscalar_t = acc_type<stat_scalar_t, true>;
using stat_accscalar_t = acc_type_device<stat_scalar_t, kXPU>;
int64_t n_input = input_.size(1);
Tensor sum_dy_;
Tensor sum_dy_xmu_;
@@ -2129,7 +2129,7 @@ batch_norm_backward_reduce_channels_last_template(
input.scalar_type(),
"batchnorm_backward_reduce_xpu",
[&] {
using accscalar_t = at::acc_type<scalar_t, true>;
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
accscalar_t* staging_data_ptr = nwg_y > 1
? staging_data.mutable_data_ptr<accscalar_t>()
: nullptr;
@@ -2170,7 +2170,7 @@ batch_norm_backward_reduce_channels_last_template(
input.scalar_type(),
"batchnorm_backward_reduce_xpu",
[&] {
using accscalar_t = at::acc_type<scalar_t, true>;
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
accscalar_t* staging_data_ptr = nwg_y > 1
? staging_data.mutable_data_ptr<accscalar_t>()
: nullptr;
@@ -2239,7 +2239,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> batch_norm_backward_reduce_kernel(
mean_st == invstd_st,
"mean and invstd need to have the same data types");
const bool mixed_type = is_mixed_type(input, weight);
using accscalar_t = acc_type<scalar_t, true>;
using accscalar_t = acc_type_device<scalar_t, kXPU>;

if (canUse32BitIndexMath(grad_output)) {
if (mixed_type) {
@@ -2462,7 +2462,7 @@ Tensor batch_norm_backward_elemt_template(
const Tensor& weight_,
const Tensor& sum_dy_,
const Tensor& sum_dy_xmu_) {
using stat_accscalar_t = acc_type<stat_scalar_t, true>;
using stat_accscalar_t = acc_type_device<stat_scalar_t, kXPU>;
int64_t n_input = input_.size(1);
auto input_reshaped = input_.reshape(
{input_.size(0),
@@ -2543,7 +2543,7 @@ Tensor batch_norm_backward_elemt_template(
const Tensor& sum_dy_,
const Tensor& sum_dy_xmu_,
const Tensor& count) {
using stat_accscalar_t = at::acc_type<stat_scalar_t, true>;
using stat_accscalar_t = at::acc_type_device<stat_scalar_t, kXPU>;
auto input_reshaped = input_.reshape(
{input_.size(0),
input_.size(1),
@@ -2748,7 +2748,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_template(
input.scalar_type(),
"batchnorm_backward_element_xpu",
[&] {
using accscalar_t = at::acc_type<scalar_t, true>;
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;

if (weight.defined() && weight.scalar_type() != input.scalar_type()) {
auto kfn = BatchNormBackwardElemtChannelsLastKernelFunctor<
@@ -2820,7 +2820,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_template(
input.scalar_type(),
"batchnorm_backward_element_xpu",
[&] {
using accscalar_t = acc_type<scalar_t, true>;
using accscalar_t = acc_type_device<scalar_t, kXPU>;
auto kfn = BatchNormBackwardElemtChannelsLastKernelFunctor<
ELEMENTS_PER_ITER,
scalar_t,
@@ -2857,7 +2857,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_template(
input.scalar_type(),
"batchnorm_backward_element_xpu",
[&] {
using accscalar_t = acc_type<scalar_t, true>;
using accscalar_t = acc_type_device<scalar_t, kXPU>;
auto kfn = BatchNormBackwardElemtChannelsLastKernelFunctor<
ELEMENTS_PER_ITER,
scalar_t,
@@ -2920,7 +2920,7 @@ Tensor batch_norm_backward_elemt_kernel(
std::is_same<scalar_t, at::Half>::value && mean_st == at::kFloat;
bool is_bfloat16_float = std::is_same<scalar_t, at::BFloat16>::value &&
mean_st == at::kFloat;
using accscalar_t = acc_type<scalar_t, true>;
using accscalar_t = acc_type_device<scalar_t, kXPU>;
if (canUse32BitIndexMath(self)) {
if (is_half_float || is_bfloat16_float) {
return batch_norm_backward_elemt_template<
@@ -3004,7 +3004,7 @@ void batch_norm_update_stats(
running_mean.scalar_type(),
"batch_norm_update_stats_xpu",
[&] {
using acc_t = acc_type<scalar_t, true>;
using acc_t = acc_type_device<scalar_t, kXPU>;
const auto bessel_correction_factor = static_cast<acc_t>(
static_cast<double>(N) / static_cast<double>(N - 1));
const auto momentum = static_cast<acc_t>(momentum_);
@@ -3151,7 +3151,7 @@ void batch_norm_update_stats_and_invert(
running_mean.scalar_type(),
"batch_norm_update_stats_and_invert_xpu",
[&] {
using acc_t = acc_type<scalar_t, true>;
using acc_t = acc_type_device<scalar_t, kXPU>;
const auto bessel_correction_factor = static_cast<acc_t>(
static_cast<double>(N) / static_cast<double>(N - 1));
const auto eps = static_cast<acc_t>(epsilon);
@@ -3191,7 +3191,7 @@ void batch_norm_calc_invstd(
running_var.scalar_type(),
"batch_norm_invert_std_xpu",
[&] {
using acc_t = at::acc_type<scalar_t, true>;
using acc_t = at::acc_type_device<scalar_t, kXPU>;
auto eps = static_cast<acc_t>(epsilon);
BatchNormCalcInvstdFunctor<scalar_t, acc_t> f(eps);
gpu_kernel(iter, f);
@@ -3229,7 +3229,7 @@ void batch_norm_elementwise(
self.scalar_type(),
"batch_norm_elementwise_xpu",
[&] {
using accscalar_t = at::acc_type<scalar_t, true>;
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
const bool mixed_type = is_mixed_type(self, *weight, *bias);
if (mixed_type) {
batch_norm_elemt_template<scalar_t, accscalar_t, int32_t>(
@@ -3298,7 +3298,7 @@ void batch_norm_elementwise(
self.scalar_type(),
"batch_norm_elementwise_xpu",
[&] {
using acc_t = at::acc_type<scalar_t, true>;
using acc_t = at::acc_type_device<scalar_t, kXPU>;
BatchNormElementwiseFunctor<scalar_t, acc_t> f;
gpu_kernel(iter, f);
});
@@ -3572,7 +3572,7 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_template(
bool train,
double epsilon,
std::array<bool, 3> grad_input_mask) {
using accscalar_t = acc_type<stat_scalar_t, true>;
using accscalar_t = acc_type_device<stat_scalar_t, kXPU>;
Tensor grad_input_;
Tensor grad_input_reshaped;
Tensor grad_weight_;
@@ -3739,7 +3739,7 @@ Tensor batch_norm_elementwise_backward_train(
input.scalar_type(),
"batch_norm_backward_elemt_xpu",
[&] {
using accscalar_t = at::acc_type<scalar_t, true>;
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
const bool mixed_type = is_mixed_type(input, weight);
if (mixed_type) {
return batch_norm_backward_elemt_template<
@@ -3803,7 +3803,7 @@ Tensor batch_norm_elementwise_backward_train(
grad_out.scalar_type(),
"batch_norm_eval_backward_xpu",
[&] {
using accscalar_t = at::acc_type<scalar_t, true>;
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
auto norm_fct =
static_cast<accscalar_t>(1.0 / (input.numel() / input.size(1)));
BatchNormElementwiseBackwardTrainFunctor<scalar_t, accscalar_t> f(
@@ -3863,7 +3863,7 @@ Tensor batch_norm_elementwise_backward_eval(
grad_out.scalar_type(),
"batch_norm_eval_backward_xpu",
[&] {
using accscalar_t = at::acc_type<scalar_t, true>;
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
BatchNormElementwiseBackwardEvalWithWeightfunctor<
scalar_t,
accscalar_t>
@@ -3885,7 +3885,7 @@ Tensor batch_norm_elementwise_backward_eval(
grad_out.scalar_type(),
"batch_norm_eval_backward_xpu",
[&] {
using accscalar_t = at::acc_type<scalar_t, true>;
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
BatchNormElementwiseBackwardEvalfunctor<scalar_t, accscalar_t> f;
gpu_kernel(iter, f);
});
@@ -3924,7 +3924,7 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_kernel(
canUse32BitIndexMath(input) && canUse32BitIndexMath(grad_out)) {
return AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, input.scalar_type(), "batch_norm_backward_xpu", [&] {
using accscalar_t = at::acc_type<scalar_t, true>;
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
const bool mixed_type =
is_mixed_type(input, *weight, *running_mean, *running_var);
if (mixed_type) {
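Most of the batch-norm hunks above only switch the accumulation-type alias; the underlying math is unchanged. For example, the invstd computed in batch_norm_calc_invstd amounts to 1 / sqrt(var + eps) per channel, sketched standalone below (not the SYCL functor):

#include <cmath>
#include <cstdio>

int main() {
  double var = 4.0;
  double eps = 1e-5;
  double invstd = 1.0 / std::sqrt(var + eps);  // ~0.5 for var = 4
  std::printf("invstd = %.6f\n", invstd);
  return 0;
}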
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/BinaryDivFloorKernel.cpp
@@ -76,7 +76,7 @@ void div_floor_kernel(TensorIteratorBase& iter) {
// precision compared to computing the division.
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, dtype, "div_floor_xpu", [&]() {
using accscalar_t = at::acc_type<scalar_t, true>;
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
auto b = iter.scalar_value<accscalar_t>(2);
if (C10_UNLIKELY(b == 0)) {
return div_true_kernel(iter);
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/BinaryDivTruncKernel.cpp
@@ -42,7 +42,7 @@ void div_trunc_kernel(TensorIteratorBase& iter) {
// precision compared to computing the division.
AT_DISPATCH_FLOATING_TYPES_AND2(
kHalf, kBFloat16, dtype, "div_trunc_xpu", [&]() {
using accscalar_t = at::acc_type<scalar_t, true>;
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
auto inv_b = accscalar_t(1.0) / iter.scalar_value<accscalar_t>(2);
iter.remove_operand(2);
gpu_kernel(iter, DivTruncScalarFunctor<scalar_t, accscalar_t>(inv_b));
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.cpp
@@ -83,7 +83,7 @@ void logit_backward_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar) {
iter.dtype(),
"logit_xpu",
[&]() {
using T_ACC = acc_type<scalar_t, true>;
using T_ACC = acc_type_device<scalar_t, kXPU>;
const T_ACC eps = eps_scalar.to<T_ACC>();
if (eps < T_ACC(0)) {
gpu_kernel(iter, LogitBackward0Functor<scalar_t>());
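The logit_backward hunk likewise only changes the accumulation alias. For reference, the backward of logit scales the incoming gradient by d/dx logit(x) = 1 / (x * (1 - x)); the eps >= 0 variant restricts this to x within [eps, 1 - eps]. A standalone sketch of the unclamped case (not the SYCL functor):

#include <cstdio>

// Gradient of logit(x) = log(x / (1 - x)) with respect to x, times grad_out.
static double logit_backward(double grad_out, double x) {
  return grad_out / (x * (1.0 - x));
}

int main() {
  std::printf("%f\n", logit_backward(1.0, 0.25));  // 1 / (0.25 * 0.75) ≈ 5.333333
  return 0;
}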
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/DistanceKernels.cpp
@@ -290,7 +290,7 @@ static void launch_cdist_forward_kernel(
const int64_t l2_size) {
const auto ngroups = result.numel();
auto wgroup_size = 32;
using accscalar_t = acc_type<scalar_t, true>;
using accscalar_t = acc_type_device<scalar_t, kXPU>;
auto p_val = static_cast<accscalar_t>(p);
auto out_data = result.data_ptr<scalar_t>();
auto x1_data = x1.data_ptr<scalar_t>();