Merge branch 'main' of https://github.com/intel/torch-xpu-ops into majing/histogram
majing921201 committed Jul 22, 2024
2 parents 967cba8 + e9516e6 · commit c7f7ccb
Showing 4 changed files with 97 additions and 62 deletions.
29 changes: 29 additions & 0 deletions src/ATen/native/xpu/EmbeddingBag.cpp
@@ -68,4 +68,33 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> XPUNativeFunctions::
include_last_offset,
padding_idx);
}

Tensor XPUNativeFunctions::_embedding_bag_backward(
const Tensor& grad,
const Tensor& indices,
const Tensor& offsets,
const Tensor& offset2bag,
const Tensor& bag_size,
const Tensor& maximum_indices,
int64_t num_weights,
bool scale_grad_by_freq,
int64_t mode,
bool sparse,
const c10::optional<Tensor>& per_sample_weights,
int64_t padding_idx) {
return at::native::_embedding_bag_backward_symint(
grad,
indices,
offsets,
offset2bag,
bag_size,
maximum_indices,
num_weights,
scale_grad_by_freq,
mode,
sparse,
per_sample_weights,
padding_idx);
}

} // namespace at
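The new entry does not add an XPU-specific kernel; it simply forwards to the shared at::native implementation. As a rough sketch of how the operator is reached from Python, assuming a PyTorch build with XPU support (shapes and values here are illustrative only):

import torch
import torch.nn.functional as F

# Minimal sketch: exercise _embedding_bag_backward through the public API.
# Assumes an XPU-enabled PyTorch build; falls back to CPU otherwise.
device = "xpu" if torch.xpu.is_available() else "cpu"

weight = torch.randn(10, 3, device=device, requires_grad=True)
indices = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], device=device)
offsets = torch.tensor([0, 4], device=device)  # two bags of four indices each

# The forward runs _embedding_bag; the backward of the sum dispatches to
# _embedding_bag_backward, which this commit routes to
# at::native::_embedding_bag_backward_symint.
out = F.embedding_bag(indices, weight, offsets, mode="mean")
out.sum().backward()
print(weight.grad.shape)  # torch.Size([10, 3])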
124 changes: 67 additions & 57 deletions src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp
@@ -416,68 +416,78 @@ void launch_max_pool2d_backward_kernel(
int dilation_h,
int dilation_w) {
auto& queue = at::xpu::getCurrentSYCLQueue();
int64_t gradOutputSize =
numBatch * numPlane * gradOutputSizeH * gradOutputSizeW;
int64_t gradInputSize = numBatch * numPlane * gradInputSizeH * gradInputSizeW;
auto out_cf_c_stride = gradOutputSizeH * gradOutputSizeW;
auto in_cf_c_stride = gradInputSizeH * gradInputSizeW;
auto out_n_stride = numPlane * out_cf_c_stride;
auto in_n_stride = numPlane * in_cf_c_stride;
if (globalContext().deterministicAlgorithms() ||
std::is_same_v<scalar_t, at::Half> ||
std::is_same_v<scalar_t, at::BFloat16>) {
using KernelClass =
MaxPool2dBackwardDeterministicKernelFunctor<scalar_t, is_channels_last>;
BatchKernelConfig cfg = {
1, gradInputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive};
cfg.template build<KernelClass>();
auto kfn = KernelClass(
gradInput,
gradOutput,
indices,
numPlane,
gradInputSizeH,
gradInputSizeW,
gradOutputSizeH,
gradOutputSizeW,
gradInputSize,
out_cf_c_stride,
in_cf_c_stride,
out_n_stride,
in_n_stride,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
cfg);
sycl_kernel_submit(cfg.global_size(), cfg.group_size(), queue, kfn);
} else {
using KernelClass =
MaxPool2dBackwardKernelFunctor<scalar_t, is_channels_last>;
BatchKernelConfig cfg = {
1, gradOutputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive};
cfg.template build<KernelClass>();
auto kfn = KernelClass(
gradInput,
gradOutput,
indices,
numPlane,
gradInputSizeH,
gradInputSizeW,
gradOutputSizeH,
gradOutputSizeW,
gradOutputSize,
out_cf_c_stride,
in_cf_c_stride,
out_n_stride,
in_n_stride,
cfg);
sycl_kernel_submit(cfg.global_size(), cfg.group_size(), queue, kfn);
}

#ifndef XPU_ALLOW_UNDETERMINISTIC
// [Deterministic Note]
//
// By default, we disable the non-deterministic path in this kernel to
// ensure there is no side effect on accuracy. In the future, we may
// re-enable the non-deterministic path to improve performance.
//
// Background: this kernel was observed to behave differently from CUDA
// on AlexNet. To avoid future problems, we decided to always use the
// deterministic path.

using KernelClass =
MaxPool2dBackwardDeterministicKernelFunctor<scalar_t, is_channels_last>;
BatchKernelConfig cfg = {
1, gradInputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive};
cfg.template build<KernelClass>();
auto kfn = KernelClass(
gradInput,
gradOutput,
indices,
numPlane,
gradInputSizeH,
gradInputSizeW,
gradOutputSizeH,
gradOutputSizeW,
gradInputSize,
out_cf_c_stride,
in_cf_c_stride,
out_n_stride,
in_n_stride,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
cfg);
sycl_kernel_submit(cfg.global_size(), cfg.group_size(), queue, kfn);
#else
int64_t gradOutputSize =
numBatch * numPlane * gradOutputSizeH * gradOutputSizeW;
using KernelClass =
MaxPool2dBackwardKernelFunctor<scalar_t, is_channels_last>;
BatchKernelConfig cfg = {
1, gradOutputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive};
cfg.template build<KernelClass>();
auto kfn = KernelClass(
gradInput,
gradOutput,
indices,
numPlane,
gradInputSizeH,
gradInputSizeW,
gradOutputSizeH,
gradOutputSizeW,
gradOutputSize,
out_cf_c_stride,
in_cf_c_stride,
out_n_stride,
in_n_stride,
cfg);
sycl_kernel_submit(cfg.global_size(), cfg.group_size(), queue, kfn);
#endif
}

void max_pool2d_with_indices_kernel(
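The rewrite turns the former runtime choice between the deterministic and non-deterministic backward kernels into a compile-time switch: unless XPU_ALLOW_UNDETERMINISTIC is defined at build time, the deterministic functor is always submitted, for every dtype. A minimal sketch of a Python call that reaches this kernel, assuming an XPU-enabled build:

import torch
import torch.nn.functional as F

# Sketch only: any max_pool2d backward on an XPU tensor lands in
# launch_max_pool2d_backward_kernel. With XPU_ALLOW_UNDETERMINISTIC left
# undefined (the default after this change), the deterministic kernel runs
# regardless of torch.use_deterministic_algorithms().
device = "xpu" if torch.xpu.is_available() else "cpu"

x = torch.randn(1, 3, 8, 8, device=device, requires_grad=True)
out = F.max_pool2d(x, kernel_size=2, stride=2)
out.sum().backward()
print(x.grad.shape)  # torch.Size([1, 3, 8, 8])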
5 changes: 0 additions & 5 deletions test/xpu/extended/run_test_with_skip.py
@@ -103,11 +103,6 @@
"test_compare_cpu_nn_functional_embedding_bag_xpu_float64",
"test_view_replay_nn_functional_embedding_bag_xpu_float32",

# Not implemented operators, aten::_embedding_bag_backward.
# To retrieve cases when the operators are supported.
# https://github.com/intel/torch-xpu-ops/issues/536
"test_backward_nn_functional_embedding_bag_xpu_float32",

# Double and complex datatype matmul is not supported in oneDNN
"test_compare_cpu_cdist_xpu_float64",

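The skip is removed because aten::_embedding_bag_backward is now implemented for XPU (see EmbeddingBag.cpp above). The re-enabled test compares XPU results against a CPU reference; a hedged sketch of that kind of check, with illustrative names and data rather than the actual test harness:

import torch
import torch.nn.functional as F

def embedding_bag_grad(w, device):
    # Copy the shared CPU weights to the target device as a fresh leaf tensor.
    weight = w.detach().to(device).requires_grad_()
    indices = torch.tensor([0, 2, 5, 1], device=device)
    offsets = torch.tensor([0, 2], device=device)
    F.embedding_bag(indices, weight, offsets, mode="sum").sum().backward()
    return weight.grad.cpu()

w = torch.randn(6, 4)
torch.testing.assert_close(embedding_bag_grad(w, "cpu"),
                           embedding_bag_grad(w, "xpu"))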
1 change: 1 addition & 0 deletions yaml/xpu_functions.yaml
@@ -529,6 +529,7 @@ supported:
- bincount
- _embedding_bag
- _embedding_bag_forward_only
- _embedding_bag_backward
- sgn
- sgn.out
- sgn_
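The yaml entry registers the operator with the torch-xpu-ops code generation, which is what makes the C++ binding above dispatchable for the XPU backend. A quick, hedged smoke test after rebuilding:

import torch

# Should print the aten op packet; before this commit, calling the op with
# XPU tensors failed with a NotImplementedError for the XPU backend.
print(torch.ops.aten._embedding_bag_backward)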
