
Commit

Merge branch 'hjhee/cpu_fallback_retest' of https://github.com/intel/torch-xpu-ops into hjhee/cpu_fallback_retest
hjhee committed Jul 30, 2024
2 parents 9218424 + 907a2a0 commit 6859bf5
Showing 13 changed files with 173 additions and 5 deletions.
58 changes: 58 additions & 0 deletions src/ATen/native/xpu/Loss.cpp
@@ -80,6 +80,64 @@ Tensor& XPUNativeFunctions::mse_loss_backward_out(
return grad_input;
}


Tensor& XPUNativeFunctions::smooth_l1_loss_out(
    const Tensor& input,
    const Tensor& target,
    int64_t reduction,
    double beta,
    Tensor& result) {
  if (reduction != Reduction::None) {
    TORCH_INTERNAL_ASSERT(
        reduction == Reduction::Mean || reduction == Reduction::Sum);
    result.resize_({});
    Tensor loss;
    auto iter = TensorIterator::borrowing_binary_op(loss, input, target);
    native::xpu::smooth_l1_kernel(iter, beta);
    if (reduction == Reduction::Mean) {
      at::mean_out(const_cast<Tensor&>(result), iter.output(), IntArrayRef{});
    } else {
      at::sum_out(const_cast<Tensor&>(result), iter.output(), IntArrayRef{});
    }
  } else {
    auto iter = TensorIterator::borrowing_binary_op(result, input, target);
    native::xpu::smooth_l1_kernel(iter, beta);
  }
  return result;
}

Tensor XPUNativeFunctions::smooth_l1_loss(
    const Tensor& input,
    const Tensor& target,
    int64_t reduction,
    double beta) {
  Tensor result = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  result = XPUNativeFunctions::smooth_l1_loss_out(
      input, target, reduction, beta, result);
  return result;
}

Tensor& XPUNativeFunctions::smooth_l1_loss_backward_out(
    const Tensor& grad_output,
    const Tensor& input,
    const Tensor& target,
    int64_t reduction,
    double beta,
    Tensor& grad_input) {
  auto norm = reduction == Reduction::Mean ? 1. / input.numel() : 1.;
  auto iter = at::TensorIteratorConfig()
                  .add_output(grad_input)
                  .add_const_input(input)
                  .add_const_input(target)
                  .add_const_input(grad_output)
                  .promote_inputs_to_common_dtype(true)
                  .cast_common_dtype_to_outputs(true)
                  .enforce_safe_casting_to_output(true)
                  .build();
  native::xpu::smooth_l1_backward_kernel(iter, norm, beta);
  return grad_input;
}

Tensor XPUNativeFunctions::binary_cross_entropy(
const Tensor& self,
const Tensor& target,
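For context (not part of the diff), a minimal Python sketch of how the reduction handling in smooth_l1_loss_out above behaves from the caller's side. It runs on CPU as written; moving the tensors to an XPU device would dispatch to the paths added in this file.

import torch
import torch.nn.functional as F

inp = torch.randn(4, 3)
tgt = torch.randn(4, 3)

# reduction="none" keeps the elementwise losses; "mean"/"sum" reduce to a 0-dim
# scalar, mirroring the mean_out/sum_out branches in smooth_l1_loss_out.
per_elem = F.smooth_l1_loss(inp, tgt, reduction="none", beta=1.0)
mean_loss = F.smooth_l1_loss(inp, tgt, reduction="mean", beta=1.0)
print(per_elem.shape, mean_loss.shape)            # torch.Size([4, 3]) torch.Size([])
print(torch.allclose(per_elem.mean(), mean_loss))  # True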
15 changes: 15 additions & 0 deletions src/ATen/native/xpu/TensorFactories.cpp
@@ -151,6 +151,21 @@ Tensor& XPUNativeFunctions::complex_out(
return result;
}

Tensor& XPUNativeFunctions::polar_out(
    const Tensor& abs,
    const Tensor& angle,
    Tensor& result) {
  complex_check_dtype(result, abs, angle);
  auto iter = TensorIteratorConfig()
                  .add_output(result)
                  .add_const_input(abs)
                  .add_const_input(angle)
                  .check_all_same_dtype(false)
                  .build();
  native::xpu::polar_kernel(iter);
  return result;
}

Tensor& XPUNativeFunctions::randperm_out(
int64_t n,
c10::optional<Generator> generator,
3 changes: 0 additions & 3 deletions src/ATen/native/xpu/XPUFallback.template
@@ -240,7 +240,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
"ormqr",
"_pdist_backward",
"_pdist_forward",
"polar.out",
"_prelu_kernel",
"_prelu_kernel_backward",
"prod",
@@ -257,8 +256,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
"signbit.out",
"sign.out",
"sinc.out",
"smooth_l1_loss_backward.grad_input",
"smooth_l1_loss.out",
"special_airy_ai.out",
"special_bessel_j0.out",
"special_bessel_j1.out",
26 changes: 26 additions & 0 deletions src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp
@@ -23,6 +23,32 @@ void mse_kernel(TensorIteratorBase& iter) {
[&]() { gpu_kernel(iter, MSEFunctor<scalar_t>()); });
}

template <typename scalar_t>
struct SmoothL1Functor {
  scalar_t operator()(scalar_t input, scalar_t target) const {
    auto z = std::abs(input - target);
    return z < beta_val ? scalar_t(0.5) * z * z / beta_val
                        : z - scalar_t(0.5) * beta_val;
  }
  SmoothL1Functor(scalar_t beta_val) : beta_val(beta_val) {}

 private:
  scalar_t beta_val;
};

void smooth_l1_kernel(TensorIteratorBase& iter, double beta) {
  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::Half,
      at::ScalarType::BFloat16,
      iter.dtype(),
      "smooth_l1_xpu",
      [&iter, beta]() {
        scalar_t beta_val(beta);
        SmoothL1Functor<scalar_t> f(beta_val);
        gpu_kernel(iter, f);
      });
}

template <typename scalar_t>
struct HuberFunctor {
scalar_t operator()(scalar_t a, scalar_t b) const {
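A small Python illustration (not from the commit) of the piecewise formula SmoothL1Functor implements, checked against the reference op on CPU.

import torch
import torch.nn.functional as F

beta = 0.5
inp, tgt = torch.randn(8), torch.randn(8)

# 0.5 * z^2 / beta for z < beta, otherwise z - 0.5 * beta, with z = |input - target|
z = (inp - tgt).abs()
manual = torch.where(z < beta, 0.5 * z * z / beta, z - 0.5 * beta)
print(torch.allclose(manual, F.smooth_l1_loss(inp, tgt, beta=beta, reduction="none")))  # True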
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h
@@ -6,6 +6,8 @@ namespace at::native::xpu {

void mse_kernel(TensorIteratorBase& iter);

void smooth_l1_kernel(TensorIteratorBase& iter, double beta);

void huber_kernel(TensorIterator& iter, double delta);

} // namespace at::native::xpu
14 changes: 14 additions & 0 deletions src/ATen/native/xpu/sycl/ComplexKernels.cpp
@@ -21,4 +21,18 @@ void complex_kernel(TensorIterator& iter) {
});
}

template <typename scalar_t>
struct PolarFunctor {
  c10::complex<scalar_t> operator()(scalar_t a, scalar_t b) const {
    return c10::complex<scalar_t>(a * std::cos(b), a * std::sin(b));
  }
};

void polar_kernel(TensorIterator& iter) {
  AT_DISPATCH_FLOATING_TYPES(iter.input_dtype(0), "polar_xpu", [&]() {
    PolarFunctor<scalar_t> f;
    gpu_kernel(iter, f);
  });
}

} // namespace at::native::xpu
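For reference, a short Python sketch (not part of this change) of the identity PolarFunctor computes: abs * cos(angle) + i * abs * sin(angle), i.e. what torch.polar exposes at the Python level.

import math
import torch

mag = torch.tensor([1.0, 2.0])
ang = torch.tensor([0.0, math.pi / 2])

# torch.polar builds the complex result from magnitude and phase.
z = torch.polar(mag, ang)
ref = torch.complex(mag * torch.cos(ang), mag * torch.sin(ang))
print(torch.allclose(z, ref))  # True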
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/ComplexKernels.h
@@ -6,4 +6,6 @@ namespace at::native::xpu {

void complex_kernel(TensorIterator& iter);

void polar_kernel(TensorIterator& iter);

} // namespace at::native::xpu
34 changes: 34 additions & 0 deletions src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp
@@ -125,6 +125,40 @@ void mse_backward_kernel(TensorIterator& iter, const Scalar& value) {
});
}

template <typename scalar_t>
struct SmoothL1BackwardFunctor {
  scalar_t operator()(scalar_t input, scalar_t target, scalar_t grad_output)
      const {
    const auto x = input - target;
    if (x < -beta_val)
      return -norm_val * grad_output;
    else if (x > beta_val)
      return norm_val * grad_output;
    else
      return norm_val * x * grad_output / beta_val;
  }
  SmoothL1BackwardFunctor(scalar_t norm_val, scalar_t beta_val)
      : norm_val(norm_val), beta_val(beta_val) {}

 private:
  scalar_t norm_val;
  scalar_t beta_val;
};

void smooth_l1_backward_kernel(TensorIterator& iter, Scalar norm, double beta) {
  AT_DISPATCH_ALL_TYPES_AND2(
      kHalf,
      kBFloat16,
      iter.dtype(),
      "smooth_l1_backward_xpu",
      [&iter, &norm, beta] {
        auto norm_val = norm.to<scalar_t>();
        scalar_t beta_val(beta);
        SmoothL1BackwardFunctor<scalar_t> f(norm_val, beta_val);
        gpu_kernel(iter, f);
      });
}

template <typename scalar_t>
struct HuberBackwardFunctor {
scalar_t operator()(scalar_t input, scalar_t target, scalar_t grad_output)
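An illustrative Python check (not part of the commit) of the gradient SmoothL1BackwardFunctor computes. For reduction='mean' the norm factor is 1/numel, as set up in smooth_l1_loss_backward_out earlier in this diff.

import torch
import torch.nn.functional as F

beta = 0.5
inp = torch.randn(6, requires_grad=True)
tgt = torch.randn(6)

F.smooth_l1_loss(inp, tgt, beta=beta, reduction="mean").backward()

# Outside [-beta, beta] the gradient saturates at +-norm; inside it is norm * x / beta.
x = (inp - tgt).detach()
norm = 1.0 / inp.numel()
manual = norm * torch.clamp(x / beta, -1.0, 1.0)
print(torch.allclose(inp.grad, manual))  # True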
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/PointwiseOpsKernels.h
@@ -10,6 +10,8 @@ void addcdiv_kernel(TensorIterator& iter, Scalar value);

void mse_backward_kernel(TensorIterator& iter, const Scalar& value);

void smooth_l1_backward_kernel(TensorIterator& iter, Scalar norm, double beta);

void huber_backward_kernel(
TensorIterator& iter,
const Scalar& norm,
4 changes: 4 additions & 0 deletions test/xpu/extended/run_test_with_skip.py
@@ -159,6 +159,10 @@
# Greatest relative difference: 0.00396728515625 at index (610,) (up to 0.001 allowed)
"test_compare_cpu_hypot_xpu_bfloat16",

# RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16.
# Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with the same error.
"test_compare_cpu_polar_xpu_bfloat16",

# Regressions due to PyTorch uplift (Numeric difference in float and bfloat)
# https://github.com/intel/torch-xpu-ops/issues/549
# Example fail log
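The skipped bfloat16 case can be reproduced directly; a minimal sketch (assuming a recent PyTorch build) of the RuntimeError quoted in the skip comment above:

import torch

try:
    torch.polar(
        torch.ones(2, dtype=torch.bfloat16),
        torch.zeros(2, dtype=torch.bfloat16),
    )
except RuntimeError as e:
    print(e)  # Expected both inputs to be Half, Float or Double tensors ...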
12 changes: 10 additions & 2 deletions test/xpu/run_test_with_skip.py
@@ -776,6 +776,10 @@ def launch_test(test_case, skip_list=None, exe_list=None):
"test_dtypes_unique_consecutive_xpu",
"test_dtypes_unique_xpu",

# RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16.
# Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with the same error.
"test_dtypes_polar_xpu",

# implemented aten::histogram to align MPS operators coverage, CUDA doesn't support
# but test_dtypes infrastructure leverage CUDA supported datatypes
"test_dtypes_histogram_xpu",
@@ -2996,8 +3000,12 @@ def launch_test(test_case, skip_list=None, exe_list=None):
res += launch_test("nn/test_load_state_dict_xpu.py")

# test_module_hooks

res += launch_test("nn/test_module_hooks_xpu.py")
skip_list = (
    # TypeError: TestStateDictHooks.test_register_state_dict_post_hook() missing 1 required positional argument: 'private'
    # https://github.com/intel/torch-xpu-ops/issues/658
    "test_register_state_dict_post_hook",
)
res += launch_test("nn/test_module_hooks_xpu.py", skip_list)

# test_parametrization

2 changes: 2 additions & 0 deletions test/xpu/xpu_test_utils.py
@@ -179,6 +179,7 @@
"nn.functional.upsample_bilinear",
"nn.functional.upsample_nearest",
"nn.functional.nll_loss",
"nn.functional.smooth_l1_loss",
"nn.functional.mse_loss",
"nn.functional.binary_cross_entropy",
"nn.functional.huber_loss",
@@ -207,6 +208,7 @@
"unique",
"multinomial",
"lerp",
"polar",
"frac",
"aminmax",
"argmin",
4 changes: 4 additions & 0 deletions yaml/xpu_functions.yaml
@@ -268,6 +268,7 @@ supported:
- eye.m_out
- _efficientzerotensor
- complex.out
- polar.out
- clone
- fill_.Scalar
- fill_.Tensor
@@ -308,6 +309,9 @@ supported:
- bitwise_and.Tensor_out
- bitwise_or.Tensor_out
- bitwise_xor.Tensor_out
- smooth_l1_loss
- smooth_l1_loss.out
- smooth_l1_loss_backward.grad_input
- bitwise_not.out
- where.self_out
- where.self
