diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml new file mode 100644 index 000000000..7cf2746c3 --- /dev/null +++ b/.github/workflows/_linux_ut.yml @@ -0,0 +1,123 @@ +name: inductor-xpu-ut-test + +on: + workflow_call: + inputs: + torch_xpu_ops_update: + required: false + type: string + default: 'true' + description: True means update xpu_ops when building pytorch, otherwise means not + ut_suite: + required: true + type: string + default: 'op_example,op_extended,op_ut,torch_xpu' + description: op_example,op_extended,op_ut,torch_xpu. Delimiter is comma + pytorch_branch: + required: false + type: string + default: 'main' + description: Set pytorch branch + runner: + required: true + type: string + default: 'linux.idc.xpu' + description: Set runner + + +jobs: + Inductor-XPU-UT-Tests: + runs-on: ${{ inputs.runner }} + timeout-minutes: 900 + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + cd ../ && rm -rf pytorch + git clone -b ${{ inputs.pytorch_branch }} https://github.com/pytorch/pytorch + cd pytorch && git log -n 1 && git submodule sync && git submodule update --init --recursive + if [ -z ${{ inputs.torch_xpu_ops_update }} ]; then + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + else + if [[ ${{ inputs.torch_xpu_ops_update }} == 'true' ]]; then + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + else + echo "Not update torch-xpu-ops" + fi + fi + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + - name: Build Pytorch XPU + run: | + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=3.10 cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + conda install -c intel mkl-static mkl-include -y + cd ../pytorch + pip install -r requirements.txt + export USE_XPU=1 + source /opt/intel/oneapi/compiler/latest/env/vars.sh + export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + python setup.py bdist_wheel + pip install --force-reinstall dist/*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + pip install -r .ci/docker/requirements-ci.txt + - name: Run XPU OP Examples + if: contains(inputs.ut_suite, 'op_example') + run: | + cd ${{ github.workspace }} + mkdir -p ut_log + xpu-smi discovery + source /opt/intel/oneapi/compiler/latest/env/vars.sh + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ${{ github.workspace }} + cd examples + pip install pytest + timeout 8000 pytest -v + - name: Run XPU OP Extended UT + if: contains(inputs.ut_suite, 'op_extended') + run: | + source /opt/intel/oneapi/compiler/latest/env/vars.sh + source activate xpu_op_${ZE_AFFINITY_MASK} + export PYTORCH_TEST_WITH_SLOW=1 + cd ../pytorch/third_party/torch-xpu-ops/test/xpu/extended/ + timeout 10000 python run_test_with_skip.py + - name: Run XPU OP UT + if: contains(inputs.ut_suite, 'op_ut') + run: | + source /opt/intel/oneapi/compiler/latest/env/vars.sh + source activate xpu_op_${ZE_AFFINITY_MASK} + export PYTORCH_ENABLE_XPU_FALLBACK=1 + export PYTORCH_TEST_WITH_SLOW=1 + cd ../pytorch/third_party/torch-xpu-ops/test/xpu + timeout 10000 python run_test_with_skip.py + # Cases run with a on-demand white list, since some suites are too + # slow to go through all operators on CPU. 
So add cases on-demand + # when XPU implementatoin is done. + # test_foreach, test_decomp + timeout 10000 python run_test_with_only.py + - name: Run Torch XPU UT + if: contains(inputs.ut_suite, 'torch_xpu') + run: | + source /opt/intel/oneapi/compiler/latest/env/vars.sh + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TEST_REPORTS_DIR=$(pwd)/test/test-reports + rm -rf "$TEST_REPORTS_DIR" && mkdir -p "$TEST_REPORTS_DIR" + # Run Pytorch XPU binary UT + for xpu_case in build/bin/*{xpu,sycl}*; do + if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then + case_name=$(basename "$xpu_case") + echo "Testing ${case_name} ..." + "$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml + fi + done + # Run Pytorch XPU python UT + export PYTORCH_ENABLE_XPU_FALLBACK=1 + sed -i 's/selected_tests = exclude_tests(XPU_BLOCKLIST.*/selected_tests = XPU_TEST/g' ./test/run_test.py + python test/run_test.py --xpu + diff --git a/.github/workflows/inductor_xpu_e2e_nightly.yml b/.github/workflows/inductor_xpu_e2e_nightly.yml index 8307edae7..a8d316580 100644 --- a/.github/workflows/inductor_xpu_e2e_nightly.yml +++ b/.github/workflows/inductor_xpu_e2e_nightly.yml @@ -41,6 +41,21 @@ on: type: string default: '' description: If set, will only launch this one + torch_xpu_ops_update: + required: false + type: string + default: 'true' + description: True means update xpu_ops when building pytorch, otherwise means not + ut_suite: + required: true + type: string + default: 'op_example,op_extended,op_ut,torch_xpu' + description: op_example,op_extended,op_ut,torch_xpu. Delimiter is comma + pytorch_branch: + required: false + type: string + default: 'main' + description: Set pytorch branch permissions: read-all @@ -244,6 +259,15 @@ jobs: name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} path: ${{ github.workspace }}/upload_files + Inductor-XPU-UT-Nightly-Tests: + if: ${{ inputs.ut_suite }} + name: Nightly Inductor XPU UT Test + uses: ./.github/workflows/_linux_ut.yml + with: + ut_suite: ${{ inputs.ut_suite }} + pytorch_branch: ${{ inputs.pytorch_branch }} + runner: linux.idc.xpu + Tests-Failure-And-Report: if: always() runs-on: pvc_e2e diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index bd65effa9..1bd635d1a 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -23,84 +23,8 @@ jobs: # Don't run on forked repos and draft PRs if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} name: preci-ut - runs-on: linux.idc.xpu - timeout-minutes: 240 - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v3 - - name: Prepare Stock Pytorch - run: | - pwd - cd ../ && rm -rf pytorch - git clone -b main https://github.com/pytorch/pytorch - cd pytorch && git log -n 1 && git submodule sync && git submodule update --init --recursive - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - - name: Build Pytorch XPU - run: | - which conda && conda clean -ay - conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ - rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} - conda create -n xpu_op_${ZE_AFFINITY_MASK} python=3.10 cmake ninja -y - source activate xpu_op_${ZE_AFFINITY_MASK} - conda install -c intel mkl-static mkl-include -y - cd ../pytorch - pip install -r requirements.txt - export USE_XPU=1 - source 
/opt/intel/oneapi/compiler/latest/env/vars.sh - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - python setup.py bdist_wheel - pip install --force-reinstall dist/*.whl - git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. - pip install -r .ci/docker/requirements-ci.txt - - name: Run XPU OP Examples - if: ${{ hashFiles('examples/') != '' }} - run: | - xpu-smi discovery - source /opt/intel/oneapi/compiler/latest/env/vars.sh - source activate xpu_op_${ZE_AFFINITY_MASK} - cd examples - pip install pytest - timeout 8000 pytest -v - - name: Run XPU OP Extended UT - if: ${{ hashFiles('test/xpu/') != '' }} - run: | - source /opt/intel/oneapi/compiler/latest/env/vars.sh - source activate xpu_op_${ZE_AFFINITY_MASK} - export PYTORCH_TEST_WITH_SLOW=1 - cd ../pytorch/third_party/torch-xpu-ops/test/xpu/extended/ - timeout 10000 python run_test_with_skip.py - - name: Run XPU OP UT - if: ${{ hashFiles('test/xpu/') != '' }} - run: | - source /opt/intel/oneapi/compiler/latest/env/vars.sh - source activate xpu_op_${ZE_AFFINITY_MASK} - export PYTORCH_ENABLE_XPU_FALLBACK=1 - export PYTORCH_TEST_WITH_SLOW=1 - cd ../pytorch/third_party/torch-xpu-ops/test/xpu - timeout 10000 python run_test_with_skip.py - # Cases run with a on-demand white list, since some suites are too - # slow to go through all operators on CPU. So add cases on-demand - # when XPU implementatoin is done. - # test_foreach, test_decomp - timeout 10000 python run_test_with_only.py - - name: Run Torch XPU UT - run: | - source /opt/intel/oneapi/compiler/latest/env/vars.sh - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../pytorch - TEST_REPORTS_DIR=$(pwd)/test/test-reports - rm -rf "$TEST_REPORTS_DIR" && mkdir -p "$TEST_REPORTS_DIR" - # Run Pytorch XPU binary UT - for xpu_case in build/bin/*{xpu,sycl}*; do - if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then - case_name=$(basename "$xpu_case") - echo "Testing ${case_name} ..." 
- "$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml - fi - done - # Run Pytorch XPU python UT - export PYTORCH_ENABLE_XPU_FALLBACK=1 - sed -i 's/selected_tests = exclude_tests(XPU_BLOCKLIST.*/selected_tests = XPU_TEST/g' ./test/run_test.py - python test/run_test.py --xpu + uses: ./.github/workflows/_linux_ut.yml + with: + ut_suite: op_example,op_extended,op_ut,torch_xpu + runner: linux.idc.xpu + \ No newline at end of file diff --git a/src/ATen/native/xpu/Bucketization.cpp b/src/ATen/native/xpu/Bucketization.cpp new file mode 100644 index 000000000..0d6c2a9f5 --- /dev/null +++ b/src/ATen/native/xpu/Bucketization.cpp @@ -0,0 +1,125 @@ +#include +#include +#include +#include + +namespace at { + +Tensor& XPUNativeFunctions::searchsorted_out( + const Tensor& sorted_sequence, + const Tensor& self, + bool out_int32, + bool right, + const std::optional side_opt, + const std::optional& sorter_opt, + Tensor& result) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned sorter_maybe_owned = + at::borrow_from_optional_tensor(sorter_opt); + const Tensor& sorter = *sorter_maybe_owned; + at::native::searchsorted_pre_check( + sorted_sequence, self, result, out_int32, right, side_opt, sorter); + at::native::resize_output(result, self.sizes()); + + if (self.numel() == 0) { + return result; + } + + // we have two inputs to set right, pre_check checks that they aren't set to + // opposites + bool is_right = (side_opt && *side_opt == "right") || right; + at::native::xpu::searchsorted_kernel( + result, self, sorted_sequence, out_int32, is_right, sorter); + return result; +} + +Tensor& XPUNativeFunctions::searchsorted_out( + const Tensor& sorted_sequence, + const Scalar& self, + bool out_int32, + bool right, + const std::optional side_opt, + const std::optional& sorter_opt, + Tensor& result) { + const Tensor& scalar_tensor = + at::native::searchsorted_scalar_tensor(self, sorted_sequence.device()); + return searchsorted_out( + sorted_sequence, + scalar_tensor, + out_int32, + right, + side_opt, + sorter_opt, + result); +} + +Tensor XPUNativeFunctions::searchsorted( + const Tensor& sorted_sequence, + const Tensor& self, + bool out_int32, + bool right, + const std::optional side_opt, + const std::optional& sorter) { + ScalarType scalar_type = out_int32 ? 
ScalarType::Int : ScalarType::Long; + c10::TensorOptions options = + TensorOptions().device(self.options().device()).dtype(scalar_type); + Tensor result = at::empty({0}, options, MemoryFormat::Contiguous); + searchsorted_out( + sorted_sequence, self, out_int32, right, side_opt, sorter, result); + return result; +} + +Tensor XPUNativeFunctions::searchsorted( + const Tensor& sorted_sequence, + const Scalar& self, + bool out_int32, + bool right, + const std::optional side_opt, + const std::optional& sorter) { + const Tensor& scalar_tensor = + at::native::searchsorted_scalar_tensor(self, sorted_sequence.device()); + return searchsorted( + sorted_sequence, scalar_tensor, out_int32, right, side_opt, sorter); +} + +Tensor& XPUNativeFunctions::bucketize_out( + const Tensor& self, + const Tensor& boundaries, + bool out_int32, + bool right, + Tensor& result) { + TORCH_CHECK( + boundaries.dim() == 1, + "boundaries tensor must be 1 dimension, but got dim(", + boundaries.dim(), + ")"); + searchsorted_out( + boundaries, self, out_int32, right, nullopt, nullopt, result); + return result; +} + +Tensor XPUNativeFunctions::bucketize( + const Tensor& self, + const Tensor& boundaries, + bool out_int32, + bool right) { + ScalarType scalar_type = out_int32 ? ScalarType::Int : ScalarType::Long; + c10::TensorOptions options = + TensorOptions().device(self.options().device()).dtype(scalar_type); + Tensor result = at::empty({0}, options, MemoryFormat::Contiguous); + bucketize_out(self, boundaries, out_int32, right, result); + return result; +} + +Tensor XPUNativeFunctions::bucketize( + const Scalar& self, + const Tensor& boundaries, + bool out_int32, + bool right) { + return bucketize( + at::native::searchsorted_scalar_tensor(self, boundaries.device()), + boundaries, + out_int32, + right); +} +} // namespace at diff --git a/src/ATen/native/xpu/LinearAlgebra.cpp b/src/ATen/native/xpu/LinearAlgebra.cpp index dfc7bd70b..2f857f18b 100644 --- a/src/ATen/native/xpu/LinearAlgebra.cpp +++ b/src/ATen/native/xpu/LinearAlgebra.cpp @@ -1,15 +1,9 @@ #include -#include -#include -#include -#include #include #include -#include -#include - #include #include +#include #include namespace at { diff --git a/src/ATen/native/xpu/PointwiseOps.cpp b/src/ATen/native/xpu/PointwiseOps.cpp index 210cec3e6..a01bdc391 100644 --- a/src/ATen/native/xpu/PointwiseOps.cpp +++ b/src/ATen/native/xpu/PointwiseOps.cpp @@ -6,6 +6,63 @@ namespace at { +TensorIterator addcdiv_meta( + const Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value, + Tensor& out) { + if (isIntegralType(tensor1.scalar_type(), /*includeBool=*/true) && + isIntegralType(tensor2.scalar_type(), /*includeBool=*/true)) { + TORCH_CHECK( + false, + "Integer division with addcdiv is no longer supported, and in a future ", + "release addcdiv will perform a true division of tensor1 and tensor2. ", + "The historic addcdiv behavior can be implemented as ", + "(input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) ", + "for integer inputs and as ", + "(input + value * tensor1 / tensor2) for float inputs. 
", + "The future addcdiv behavior is just the latter implementation: ", + "(input + value * tensor1 / tensor2), for all dtypes."); + } + + TensorIterator iter; + iter.build_ternary_op(out, self, tensor1, tensor2); + return iter; +} + +Tensor& XPUNativeFunctions::addcdiv_out( + const Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value, + Tensor& out) { + auto iter = addcdiv_meta(self, tensor1, tensor2, value, out); + native::xpu::addcdiv_kernel(iter, value); + return out; +} + +Tensor XPUNativeFunctions::addcdiv( + const Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value) { + Tensor out; + auto iter = addcdiv_meta(self, tensor1, tensor2, value, out); + native::xpu::addcdiv_kernel(iter, value); + return iter.output(); +} + +Tensor& XPUNativeFunctions::addcdiv_( + Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value) { + auto iter = addcdiv_meta(self, tensor1, tensor2, value, self); + native::xpu::addcdiv_kernel(iter, value); + return self; +} + TensorIterator addcmul_meta( const Tensor& self, const Tensor& tensor1, diff --git a/src/ATen/native/xpu/TensorFactories.cpp b/src/ATen/native/xpu/TensorFactories.cpp index d7b79902f..110590958 100644 --- a/src/ATen/native/xpu/TensorFactories.cpp +++ b/src/ATen/native/xpu/TensorFactories.cpp @@ -1,8 +1,8 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include -#include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -18,6 +18,25 @@ namespace at { +Tensor& XPUNativeFunctions::eye_out(int64_t n, Tensor& result) { + return XPUNativeFunctions::eye_out(n, n, result); +} + +Tensor& XPUNativeFunctions::eye_out(int64_t n, int64_t m, Tensor& result) { + TORCH_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); + TORCH_CHECK(m >= 0, "m must be greater or equal to 0, got ", m); + + result.resize_({n, m}); + result.zero_(); + + int64_t sz = std::min(n, m); + int64_t stride = result.stride(0) + result.stride(1); + + Tensor diag = result.as_strided({sz}, {stride}); + diag.fill_(1); + return result; +} + Tensor XPUNativeFunctions::empty( IntArrayRef size, c10::optional dtype_opt, diff --git a/src/ATen/native/xpu/TensorProperties.cpp b/src/ATen/native/xpu/TensorProperties.cpp new file mode 100644 index 000000000..428d18fcd --- /dev/null +++ b/src/ATen/native/xpu/TensorProperties.cpp @@ -0,0 +1,16 @@ +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#endif + +namespace at { + +bool XPUNativeFunctions::is_set_to(const Tensor& self, const Tensor& src) { + return at::native::is_set_to(self, src); +} + +} // namespace at diff --git a/src/ATen/native/xpu/UnaryOps.cpp b/src/ATen/native/xpu/UnaryOps.cpp index 419af3b8f..581ec0d54 100644 --- a/src/ATen/native/xpu/UnaryOps.cpp +++ b/src/ATen/native/xpu/UnaryOps.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -561,4 +562,52 @@ Tensor& XPUNativeFunctions::tan_out(const Tensor& self, Tensor& out) { return out; } +Tensor& XPUNativeFunctions::conj_physical_out(const Tensor& self, Tensor& out) { + auto iter = TensorIterator::unary_op(out, self); + native::xpu::conj_physical_kernel(iter); + return out; +} + +Tensor& XPUNativeFunctions::conj_physical_(Tensor& self) { + if (!self.is_complex()) + return self; + return XPUNativeFunctions::conj_physical_out(self, self); +} + +TensorIterator ceil_meta(const Tensor& self, Tensor& out) { + TORCH_CHECK(!self.is_complex(), "ceil is not supported for complex inputs"); + TensorIterator iter; + 
iter.build_borrowing_unary_op(out, self); + return iter; +} + +Tensor XPUNativeFunctions::ceil(const Tensor& self) { + if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { + return self.clone(); + } + Tensor out; + auto iter = ceil_meta(self, out); + native::xpu::ceil_kernel(iter); + return iter.output(); +} + +Tensor& XPUNativeFunctions::ceil_(Tensor& self) { + if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { + return self; + } + auto iter = ceil_meta(self, self); + native::xpu::ceil_kernel(iter); + return self; +} + +Tensor& XPUNativeFunctions::ceil_out(const Tensor& self, Tensor& out) { + if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { + out.copy_(self); + return out; + } + auto iter = ceil_meta(self, out); + native::xpu::ceil_kernel(iter); + return out; +} + } // namespace at diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 3c1d5b5d1..60af8f15e 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -163,7 +163,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "adaptive_max_pool2d.out", "adaptive_max_pool3d_backward.grad_input", "adaptive_max_pool3d.out", - "addcdiv.out", "aminmax.out", "angle", "argmin.out", @@ -178,15 +177,12 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "binary_cross_entropy_backward", "bitwise_left_shift.Tensor_out", "bitwise_right_shift.Tensor_out", - "bucketize.Tensor", "cauchy_", "_cdist_backward", - "ceil.out", "channel_shuffle", "cholesky", "cholesky_inverse", "_cholesky_solve_helper", - "conj_physical.out", "copysign.out", "cosh.out", "count_nonzero.dim_IntList", @@ -205,7 +201,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "exp2.out", "expm1.out", "exponential_", - "eye.m_out", "_fft_c2c", "_fft_c2r", "_fft_r2c", @@ -322,7 +317,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "__rshift__.Scalar", "_scaled_dot_product_efficient_attention", "_scaled_mm", - "searchsorted.Tensor", "segment_reduce", "_segment_reduce_backward", "signbit.out", diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp new file mode 100644 index 000000000..213283de0 --- /dev/null +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp @@ -0,0 +1,246 @@ +#include +#include +#include + +namespace at::native::xpu { + +// customized lower_bound func to ensure the low bound of 'nan', 'inf' etc. be +// the end of boundary and we can properly handle a sorter argument +// std::lower_bound can not be used here since its customized comparator need +// strict weak ordering and the customized comparators require both arguments to +// have the same type, which wouldn't happen when comparing val of input_t to an +// indexer value from sorter of int64 +template +int64_t cus_lower_bound( + int64_t start, + int64_t end, + const input_t val, + const input_t* bd, + const int64_t* sort) { + // sorter gives relative ordering for ND tensors, so we need to save and add + // the non-updated start as an offset i.e. the second row of a 3x3 tensors + // starts at element 3 but sorter's second row only contains 0, 1, or 2 + const int64_t orig_start = start; + while (start < end) { + const int64_t mid = start + ((end - start) >> 1); + const input_t mid_val = sort ? 
bd[sort[mid] + orig_start] : bd[mid]; + if (!(mid_val >= val)) { + start = mid + 1; + } else { + end = mid; + } + } + return start; +} + +// customized upper_bound func to ensure we can properly handle a sorter +// argument std::upper_bound can not be used here since its customized +// comparator requires both arguments to have the same type, which wouldn't +// happen when comparing val of input_t to an indexer value from sorter of int64 +template +int64_t cus_upper_bound( + int64_t start, + int64_t end, + const input_t val, + const input_t* bd, + const int64_t* sort) { + // sorter gives relative ordering for ND tensors, so we need to save and add + // the non-updated start as an offset i.e. the second row of a 3x3 tensors + // starts at element 3 but sorter's second row only contains 0, 1, or 2 + const int64_t orig_start = start; + while (start < end) { + const int64_t mid = start + ((end - start) >> 1); + const input_t mid_val = sort ? bd[sort[mid] + orig_start] : bd[mid]; + if (!(mid_val > val)) { + start = mid + 1; + } else { + end = mid; + } + } + return start; +} + +template +struct SearchsortedKernelFunctor { + void operator()(sycl::nd_item<1> item) const { + for (int64_t i = item.get_global_id(0); i < numel_in_; + i += item.get_global_range()[0]) { + // If boundaries tensor is 1d, we always search the entire boundary + // tensor + int64_t start_bd = is_1d_boundaries_ ? 0 : i / idim_in_ * idim_bd_; + int64_t end_bd = start_bd + idim_bd_; + + int64_t pos = !right_ + ? cus_lower_bound( + start_bd, end_bd, data_in_data_[i], data_bd_data_, data_st_) - + start_bd + : cus_upper_bound( + start_bd, end_bd, data_in_data_[i], data_bd_data_, data_st_) - + start_bd; + + // type conversion might happen here + data_out_data_[i] = pos; + } + } + + SearchsortedKernelFunctor( + const bool right, + int64_t numel_in, + int64_t idim_in, + int64_t idim_bd, + const int64_t* data_st, + output_t* data_out, + bool is_1d_boundaries, + input_t* data_in_data, + input_t* data_bd_data, + output_t* data_out_data) + : right_(right), + numel_in_(numel_in), + idim_in_(idim_in), + idim_bd_(idim_bd), + data_st_(data_st), + data_out_(data_out), + is_1d_boundaries_(is_1d_boundaries), + data_in_data_(data_in_data), + data_bd_data_(data_bd_data), + data_out_data_(data_out_data) {} + + private: + const bool right_; + int64_t numel_in_; + int64_t idim_in_; + int64_t idim_bd_; + const int64_t* data_st_; + output_t* data_out_; + bool is_1d_boundaries_; + input_t* data_in_data_; + input_t* data_bd_data_; + output_t* data_out_data_; +}; +template +void searchsorted_template( + Tensor& result, + const Tensor& input, + const Tensor& boundaries, + const bool& right, + const Tensor& sorter) { + int64_t numel_in = input.numel(); + int64_t rng, grng, tile_size; + tile_size = syclMaxWorkGroupSize(); + rng = numel_in; + if (rng == 0) { + rng = static_cast(1); + } + + grng = rng; + if (tile_size > grng) { + tile_size = grng; + } else if (grng > tile_size) { + int64_t xMode = static_cast(grng % tile_size); + if (xMode != 0) { + grng += static_cast(tile_size - xMode); + } + } + + bool is_scalar_input = input.dim() == 0 && numel_in == 1; + // inner most dim size of input and boundaries + int64_t idim_in = is_scalar_input ? 1 : input.sizes().back(); + int64_t idim_bd = boundaries.sizes().back(); + + const int64_t* data_st = + sorter.defined() ? 
sorter.data_ptr() : nullptr; + output_t* data_out = result.data_ptr(); + + bool is_1d_boundaries = boundaries.dim() == 1; + auto data_in_data = input.data_ptr(); + auto data_bd_data = boundaries.data_ptr(); + auto data_out_data = result.data_ptr(); + SearchsortedKernelFunctor kfn( + right, + numel_in, + idim_in, + idim_bd, + data_st, + data_out, + is_1d_boundaries, + data_in_data, + data_bd_data, + data_out_data); + + sycl_kernel_submit(grng, tile_size, getCurrentSYCLQueue(), kfn); +} + +void searchsorted_dispatch( + Tensor& result, + const Tensor& input, + const Tensor& boundaries, + bool out_int32, + bool right, + const Tensor& sorter) { + if (!out_int32) { + AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + input.scalar_type(), + "searchsorted_xpu", + [&] { + searchsorted_template( + result, input, boundaries, right, sorter); + }); + } else { + AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + input.scalar_type(), + "searchsorted_xpu", + [&] { + searchsorted_template( + result, input, boundaries, right, sorter); + }); + } +} + +void searchsorted_kernel( + Tensor& result, + const Tensor& input, + const Tensor& sorted_sequence, + bool out_int32, + bool right, + const Tensor& sorter) { + // for non-contiguous result tensors, we write the output to a contiguous copy + // so we can later copy back, maintaining the original result tensor + Tensor out = result; + if (!result.is_contiguous()) { + out = result.contiguous(); + } + if (sorted_sequence.is_contiguous() && input.is_contiguous() && + sorted_sequence.dtype() == input.dtype() && sorter.is_contiguous()) { + searchsorted_dispatch( + out, input, sorted_sequence, out_int32, right, sorter); + } else { + Tensor trimmed_input; + Tensor trimmed_boundaries; + Tensor trimmed_sorter; + at::native::searchsorted_maybe_trim_input_tensors( + trimmed_input, + trimmed_boundaries, + trimmed_sorter, + input, + sorted_sequence, + sorter); + const Tensor& final_input = trimmed_input.defined() ? trimmed_input : input; + const Tensor& final_boundaries = + trimmed_boundaries.defined() ? trimmed_boundaries : sorted_sequence; + const Tensor& final_sorter = + trimmed_sorter.defined() ? 
trimmed_sorter : sorter; + searchsorted_dispatch( + out, final_input, final_boundaries, out_int32, right, final_sorter); + } + + // if result is non-contiguous, we wrote the answer to a copied version, so we + // copy back to the original result tensor + if (!result.is_contiguous()) { + result.copy_(out); + } +} +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.h b/src/ATen/native/xpu/sycl/BucketizationKernels.h new file mode 100644 index 000000000..f47cea2af --- /dev/null +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.h @@ -0,0 +1,12 @@ +#pragma once +#include + +namespace at::native::xpu { +void searchsorted_kernel( + Tensor& result, + const Tensor& input, + const Tensor& sorted_sequence, + bool out_int32, + bool right, + const Tensor& sorter); +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.cpp b/src/ATen/native/xpu/sycl/ComplexKernels.cpp index 686f5e2d3..56b25d0ef 100644 --- a/src/ATen/native/xpu/sycl/ComplexKernels.cpp +++ b/src/ATen/native/xpu/sycl/ComplexKernels.cpp @@ -14,10 +14,11 @@ struct ComplexFunctor { }; void complex_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.input_dtype(0), "complex_xpu", [&]() { - ComplexFunctor f; - gpu_kernel(iter, f); - }); + AT_DISPATCH_FLOATING_TYPES_AND( + kHalf, iter.input_dtype(0), "complex_xpu", [&]() { + ComplexFunctor f; + gpu_kernel(iter, f); + }); } } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp index 6e117ce61..8d3128e9e 100644 --- a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp +++ b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp @@ -1,7 +1,4 @@ #include -#include -#include - #include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h index fcfdad46e..32f987a2e 100644 --- a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h +++ b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h @@ -1,6 +1,5 @@ #pragma once #include -#include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp index 7b00d09e3..d38f511d7 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp @@ -1,6 +1,6 @@ #include +#include #include -#include #include #include @@ -8,31 +8,98 @@ namespace at::native::xpu { template -struct AddcmulKernelFunctor { - using opmath_t = at::opmath_type; +struct AddcmulFunctor { + using accscalar_t = at::acc_type; scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const { - return static_cast(a) + - alpha_ * static_cast(b) * static_cast(c); + return static_cast(a) + + alpha_ * static_cast(b) * static_cast(c); } - AddcmulKernelFunctor(opmath_t alpha) : alpha_(alpha) {} + AddcmulFunctor(accscalar_t alpha) : alpha_(alpha) {} private: - opmath_t alpha_; + accscalar_t alpha_; +}; + +template +struct AddcmulComplexFunctor { + scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const { + return a + alpha_ * b * c; + } + + AddcmulComplexFunctor(scalar_t alpha) : alpha_(alpha) {} + + private: + scalar_t alpha_; }; void addcmul_kernel(TensorIterator& iter, Scalar value) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( - at::ScalarType::Half, - at::ScalarType::BFloat16, - iter.dtype(), - "addcmul_xpu", - [&]() { - using opmath_t = at::opmath_type; - 
auto alpha = value.to(); - AddcmulKernelFunctor f(alpha); - gpu_kernel(iter, f); - }); + auto dtype = iter.common_dtype(); + if (at::isComplexType(dtype)) { + AT_DISPATCH_COMPLEX_TYPES(dtype, "addcmul_xpu", [&]() { + auto alpha = value.to(); + gpu_kernel(iter, AddcmulComplexFunctor(alpha)); + }); + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "addcmul_xpu", + [&]() { + using accscalar_t = at::acc_type; + auto alpha = value.to(); + gpu_kernel(iter, AddcmulFunctor(alpha)); + }); + } +} + +template +struct AddcdivFunctor { + using accscalar_t = at::acc_type; + scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const { + return a + alpha_ * (b / static_cast(c)); + } + + AddcdivFunctor(accscalar_t alpha) : alpha_(alpha) {} + + private: + accscalar_t alpha_; +}; + +template +struct AddcdivComplexFunctor { + scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const { + return a + alpha_ * (b / c); + } + + AddcdivComplexFunctor(scalar_t alpha) : alpha_(alpha) {} + + private: + scalar_t alpha_; +}; + +void addcdiv_kernel(TensorIterator& iter, Scalar value) { + auto dtype = iter.common_dtype(); + if (at::isComplexType(dtype)) { + AT_DISPATCH_COMPLEX_TYPES(dtype, "addcdiv_xpu", [&]() { + auto alpha = value.to(); + AddcdivComplexFunctor f(alpha); + gpu_kernel(iter, f); + }); + } else { + AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "addcdiv_xpu", + [&]() { + using accscalar_t = at::acc_type; + auto alpha = value.to(); + AddcdivFunctor f(alpha); + gpu_kernel(iter, f); + }); + } } template diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h index fdb216dbd..c775b88e5 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h +++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h @@ -6,6 +6,8 @@ namespace at::native::xpu { void addcmul_kernel(TensorIterator& iter, Scalar value); +void addcdiv_kernel(TensorIterator& iter, Scalar value); + void mse_backward_kernel(TensorIterator& iter, const Scalar& value); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp b/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp index e082096c1..87de57a3a 100644 --- a/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp @@ -30,6 +30,32 @@ void conj_kernel(TensorIterator& iter) { })); } +template +struct ConjPhysicalFunctor { + scalar_t operator()(scalar_t z) const { + return std::conj(z); + } +}; + +template +struct ConjPhysicalFunctor> { + c10::complex operator()(c10::complex z) const { + return c10::complex(z.real(), -z.imag()); + } +}; + +void conj_physical_kernel(TensorIterator& iter) { + AT_DISPATCH_SWITCH( + iter.common_dtype(), + "conj_xpu", + AT_DISPATCH_CASE_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, [&] { + // Conj is a no-op for non-complex types + copy_kernel(iter); + }) AT_DISPATCH_CASE_COMPLEX_TYPES_AND(kComplexHalf, [&] { + gpu_kernel(iter, ConjPhysicalFunctor()); + })); +} + template struct NegConjScalarFunc { scalar_t operator()(scalar_t src_val) const { diff --git a/src/ATen/native/xpu/sycl/UnaryComplexKernels.h b/src/ATen/native/xpu/sycl/UnaryComplexKernels.h index 8d19381b3..d3ad4fe15 100644 --- a/src/ATen/native/xpu/sycl/UnaryComplexKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryComplexKernels.h @@ -6,6 +6,8 @@ namespace at::native::xpu { void conj_kernel(TensorIterator& iter); +void conj_physical_kernel(TensorIterator& iter); + 
void neg_conj_kernel(TensorIterator& iter); void neg_kernel(TensorIterator& iter); diff --git a/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp b/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp index b33be1a30..82bdc4c28 100644 --- a/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp @@ -55,4 +55,25 @@ void reciprocal_kernel(TensorIteratorBase& iter) { [&]() { gpu_kernel(iter, ReciprocalFunctor()); }); } +template +struct CeilFunctor { + scalar_t operator()(const scalar_t a) const { + return std::ceil(a); + } +}; + +template +struct CeilFunctor> { + c10::complex operator()(const c10::complex a) const { + return c10::complex(std::ceil(a.real()), std::ceil(a.imag())); + } +}; + +void ceil_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "ceil_xpu", [&]() { + gpu_kernel(iter, CeilFunctor()); + }); +} + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryFractionKernels.h b/src/ATen/native/xpu/sycl/UnaryFractionKernels.h index 5de711c78..7ab9baf32 100644 --- a/src/ATen/native/xpu/sycl/UnaryFractionKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryFractionKernels.h @@ -6,4 +6,6 @@ namespace at::native::xpu { void reciprocal_kernel(TensorIteratorBase& iter); +void ceil_kernel(TensorIteratorBase& iter); + } // namespace at::native::xpu diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index f4bf64aad..2404fdb9b 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -207,7 +207,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_python_ref_torch_fallback__refs_square_xpu_bool", "test_python_ref_torch_fallback__refs_vdot_xpu_complex128", "test_python_ref_torch_fallback__refs_vdot_xpu_complex64", - "test_variant_consistency_eager_conj_physical_xpu_complex64", "test_variant_consistency_eager_nn_functional_conv_transpose2d_xpu_complex64", "test_variant_consistency_eager_nn_functional_conv_transpose2d_xpu_float32", "test_variant_consistency_eager_nn_functional_conv_transpose3d_xpu_complex64", @@ -242,8 +241,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_python_ref_executor__refs_square_executor_aten_xpu_complex128", "test_python_ref_torch_fallback__refs_square_xpu_complex128", "test_python_ref_torch_fallback__refs_square_xpu_complex64", - "test_conj_view_conj_physical_xpu_complex64", - "test_neg_conj_view_conj_physical_xpu_complex128", # Skip list of new added when porting XPU operators. # See: https://github.com/intel/torch-xpu-ops/issues/128 @@ -2215,9 +2212,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): # torch.autograd.gradcheck.GradcheckError: Jacobian computed with forward mode mismatch for output 0 with respect to input 0, "test_fn_fwgrad_bwgrad_nn_functional_rrelu_xpu_float64", "test_forward_mode_AD_nn_functional_rrelu_xpu_float64", - # RuntimeError: DispatchStub: unsupported device typexpu - "test_inplace_forward_mode_AD_conj_physical_xpu_complex128", - # NotImplementedError: Could not run 'aten::_to_dense' with arguments from the 'SparseXPU' backend. +# NotImplementedError: Could not run 'aten::_to_dense' with arguments from the 'SparseXPU' backend. 
"test_fn_fwgrad_bwgrad_to_sparse_xpu_float64", "test_forward_mode_AD_to_sparse_xpu_float64", ) @@ -2753,9 +2748,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): ### Error #7 in TestBwdGradientsXPU , totally 2 , NotImplementedError: Could not run 'aten::_sparse_coo_tensor_with_dims_and_tensors' with arguments from the 'SparseXPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::_sparse_coo_tensor_with_dims_and_tensors' is only available for these backends: [XPU, Meta, SparseCPU, SparseMeta, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradHIP, AutogradXLA, AutogradMPS, AutogradIPU, AutogradXPU, AutogradHPU, AutogradVE, AutogradLazy, AutogradMTIA, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, AutogradMeta, AutogradNestedTensor, Tracer, AutocastCPU, AutocastXPU, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher]. "test_fn_grad_to_sparse_xpu_float64", "test_fn_gradgrad_to_sparse_xpu_float64", - ### Error #8 in TestBwdGradientsXPU , totally 2 , RuntimeError: DispatchStub: unsupported device typexpu - "test_inplace_grad_conj_physical_xpu_complex128", - "test_inplace_gradgrad_conj_physical_xpu_complex128", ) res += launch_test("test_ops_gradients_xpu.py", skip_list) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index befd27df1..1228bd607 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -19,6 +19,7 @@ _xpu_computation_op_list = [ "empty", + "eye", "fill", "zeros", "zeros_like", @@ -43,6 +44,7 @@ "bitwise_or", "bitwise_xor", "addcmul", + "addcdiv", "clamp", "clamp_max", "clamp_min", @@ -140,6 +142,8 @@ "sigmoid", "sgn", "nn.functional.embedding_bag", + "bucketize", + "searchsorted", "grid_sampler_2d", # "nn.functional.grid_sample", # Lack of XPU implementation of aten::grid_sampler_3d. "acos", @@ -154,6 +158,7 @@ "bincount", "renorm", "lerp", + "conj_physical", ] diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 13509c591..f88f3108c 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -196,6 +196,8 @@ supported: - exp_ - empty.memory_format - empty_strided + - eye.out + - eye.m_out - _efficientzerotensor - complex.out - clone @@ -459,6 +461,14 @@ supported: - _cdist_forward - _pin_memory - is_pinned + - is_set_to + - bucketize.Tensor + - bucketize.Tensor_out + - bucketize.Scalar + - searchsorted.Tensor + - searchsorted.Tensor_out + - searchsorted.Scalar + - searchsorted.Scalar_out - trace - reflection_pad2d - reflection_pad2d.out @@ -493,9 +503,17 @@ supported: - avg_pool2d.out - avg_pool2d_backward - avg_pool2d_backward.grad_input + - addcdiv.out + - addcdiv + - addcdiv_ - addcmul.out - addcmul - addcmul_ - randperm.generator_out - _amp_foreach_non_finite_check_and_unscale_ - _amp_update_scale_ + - conj_physical.out + - conj_physical_ + - ceil + - ceil_ + - ceil.out