Skip to content

Commit

Permalink
Merge branch 'main' into fy/sinh_tan
Browse files Browse the repository at this point in the history
  • Loading branch information
fengyuan14 committed Jul 9, 2024
2 parents 3689377 + 682d0e4 commit c4056fe
Show file tree
Hide file tree
Showing 24 changed files with 845 additions and 130 deletions.
123 changes: 123 additions & 0 deletions .github/workflows/_linux_ut.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# Reusable workflow: builds PyTorch with XPU support and runs the selected
# torch-xpu-ops unit-test suites. Invoked via `workflow_call` from other
# workflows (e.g. pull.yml, inductor_xpu_e2e_nightly.yml).
name: inductor-xpu-ut-test

on:
  workflow_call:
    inputs:
      torch_xpu_ops_update:
        required: false
        type: string
        default: 'true'
        description: If 'true', refresh third_party/torch-xpu-ops from this checkout when building PyTorch; otherwise keep PyTorch's pinned copy
      ut_suite:
        required: true
        type: string
        default: 'op_example,op_extended,op_ut,torch_xpu'
        description: op_example,op_extended,op_ut,torch_xpu. Delimiter is comma
      pytorch_branch:
        required: false
        type: string
        default: 'main'
        description: Set pytorch branch
      runner:
        required: true
        type: string
        default: 'linux.idc.xpu'
        description: Set runner

jobs:
  Inductor-XPU-UT-Tests:
    runs-on: ${{ inputs.runner }}
    timeout-minutes: 900
    steps:
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
      - name: Prepare Stock Pytorch
        run: |
          pwd
          cd ../ && rm -rf pytorch
          git clone -b ${{ inputs.pytorch_branch }} https://github.com/pytorch/pytorch
          cd pytorch && git log -n 1 && git submodule sync && git submodule update --init --recursive
          # Quote the expansion: if the input were empty, an unquoted
          # `[ -z ]` is malformed and the test would misbehave.
          if [ -z "${{ inputs.torch_xpu_ops_update }}" ]; then
            rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
          else
            if [[ "${{ inputs.torch_xpu_ops_update }}" == 'true' ]]; then
              rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
            else
              echo "Not update torch-xpu-ops"
            fi
          fi
          # Workaround for torch-xpu-ops ci test
          sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
      - name: Build Pytorch XPU
        run: |
          which conda && conda clean -ay
          conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \
          rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK}
          conda create -n xpu_op_${ZE_AFFINITY_MASK} python=3.10 cmake ninja -y
          source activate xpu_op_${ZE_AFFINITY_MASK}
          conda install -c intel mkl-static mkl-include -y
          cd ../pytorch
          pip install -r requirements.txt
          export USE_XPU=1
          source /opt/intel/oneapi/compiler/latest/env/vars.sh
          export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
          python setup.py bdist_wheel
          pip install --force-reinstall dist/*.whl
          git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd ..
          pip install -r .ci/docker/requirements-ci.txt
      - name: Run XPU OP Examples
        if: contains(inputs.ut_suite, 'op_example')
        run: |
          cd ${{ github.workspace }}
          mkdir -p ut_log
          xpu-smi discovery
          source /opt/intel/oneapi/compiler/latest/env/vars.sh
          source activate xpu_op_${ZE_AFFINITY_MASK}
          cd examples
          pip install pytest
          timeout 8000 pytest -v
      - name: Run XPU OP Extended UT
        if: contains(inputs.ut_suite, 'op_extended')
        run: |
          source /opt/intel/oneapi/compiler/latest/env/vars.sh
          source activate xpu_op_${ZE_AFFINITY_MASK}
          export PYTORCH_TEST_WITH_SLOW=1
          cd ../pytorch/third_party/torch-xpu-ops/test/xpu/extended/
          timeout 10000 python run_test_with_skip.py
      - name: Run XPU OP UT
        if: contains(inputs.ut_suite, 'op_ut')
        run: |
          source /opt/intel/oneapi/compiler/latest/env/vars.sh
          source activate xpu_op_${ZE_AFFINITY_MASK}
          export PYTORCH_ENABLE_XPU_FALLBACK=1
          export PYTORCH_TEST_WITH_SLOW=1
          cd ../pytorch/third_party/torch-xpu-ops/test/xpu
          timeout 10000 python run_test_with_skip.py
          # Cases run with an on-demand white list, since some suites are too
          # slow to go through all operators on CPU. So add cases on-demand
          # when XPU implementation is done.
          # test_foreach, test_decomp
          timeout 10000 python run_test_with_only.py
      - name: Run Torch XPU UT
        if: contains(inputs.ut_suite, 'torch_xpu')
        run: |
          source /opt/intel/oneapi/compiler/latest/env/vars.sh
          source activate xpu_op_${ZE_AFFINITY_MASK}
          cd ../pytorch
          TEST_REPORTS_DIR=$(pwd)/test/test-reports
          rm -rf "$TEST_REPORTS_DIR" && mkdir -p "$TEST_REPORTS_DIR"
          # Run Pytorch XPU binary UT
          for xpu_case in build/bin/*{xpu,sycl}*; do
            if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then
              case_name=$(basename "$xpu_case")
              echo "Testing ${case_name} ..."
              "$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml
            fi
          done
          # Run Pytorch XPU python UT
          export PYTORCH_ENABLE_XPU_FALLBACK=1
          sed -i 's/selected_tests = exclude_tests(XPU_BLOCKLIST.*/selected_tests = XPU_TEST/g' ./test/run_test.py
          python test/run_test.py --xpu
24 changes: 24 additions & 0 deletions .github/workflows/inductor_xpu_e2e_nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,21 @@ on:
type: string
default: ''
description: If set, will only launch this one
torch_xpu_ops_update:
required: false
type: string
default: 'true'
description: If 'true', update torch-xpu-ops when building PyTorch; otherwise keep the pinned copy
ut_suite:
required: true
type: string
default: 'op_example,op_extended,op_ut,torch_xpu'
description: op_example,op_extended,op_ut,torch_xpu. Delimiter is comma
pytorch_branch:
required: false
type: string
default: 'main'
description: Set pytorch branch


permissions: read-all
Expand Down Expand Up @@ -244,6 +259,15 @@ jobs:
name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
path: ${{ github.workspace }}/upload_files

Inductor-XPU-UT-Nightly-Tests:
if: ${{ inputs.ut_suite }}
name: Nightly Inductor XPU UT Test
uses: ./.github/workflows/_linux_ut.yml
with:
ut_suite: ${{ inputs.ut_suite }}
pytorch_branch: ${{ inputs.pytorch_branch }}
runner: linux.idc.xpu

Tests-Failure-And-Report:
if: always()
runs-on: pvc_e2e
Expand Down
86 changes: 5 additions & 81 deletions .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,84 +23,8 @@ jobs:
# Don't run on forked repos and draft PRs
if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }}
name: preci-ut
runs-on: linux.idc.xpu
timeout-minutes: 240
steps:
- name: Checkout torch-xpu-ops
uses: actions/checkout@v3
- name: Prepare Stock Pytorch
run: |
pwd
cd ../ && rm -rf pytorch
git clone -b main https://github.com/pytorch/pytorch
cd pytorch && git log -n 1 && git submodule sync && git submodule update --init --recursive
rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
# Workaround for torch-xpu-ops ci test
sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
- name: Build Pytorch XPU
run: |
which conda && conda clean -ay
conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \
rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK}
conda create -n xpu_op_${ZE_AFFINITY_MASK} python=3.10 cmake ninja -y
source activate xpu_op_${ZE_AFFINITY_MASK}
conda install -c intel mkl-static mkl-include -y
cd ../pytorch
pip install -r requirements.txt
export USE_XPU=1
source /opt/intel/oneapi/compiler/latest/env/vars.sh
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
python setup.py bdist_wheel
pip install --force-reinstall dist/*.whl
git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd ..
pip install -r .ci/docker/requirements-ci.txt
- name: Run XPU OP Examples
if: ${{ hashFiles('examples/') != '' }}
run: |
xpu-smi discovery
source /opt/intel/oneapi/compiler/latest/env/vars.sh
source activate xpu_op_${ZE_AFFINITY_MASK}
cd examples
pip install pytest
timeout 8000 pytest -v
- name: Run XPU OP Extended UT
if: ${{ hashFiles('test/xpu/') != '' }}
run: |
source /opt/intel/oneapi/compiler/latest/env/vars.sh
source activate xpu_op_${ZE_AFFINITY_MASK}
export PYTORCH_TEST_WITH_SLOW=1
cd ../pytorch/third_party/torch-xpu-ops/test/xpu/extended/
timeout 10000 python run_test_with_skip.py
- name: Run XPU OP UT
if: ${{ hashFiles('test/xpu/') != '' }}
run: |
source /opt/intel/oneapi/compiler/latest/env/vars.sh
source activate xpu_op_${ZE_AFFINITY_MASK}
export PYTORCH_ENABLE_XPU_FALLBACK=1
export PYTORCH_TEST_WITH_SLOW=1
cd ../pytorch/third_party/torch-xpu-ops/test/xpu
timeout 10000 python run_test_with_skip.py
# Cases run with a on-demand white list, since some suites are too
# slow to go through all operators on CPU. So add cases on-demand
# when XPU implementation is done.
# test_foreach, test_decomp
timeout 10000 python run_test_with_only.py
- name: Run Torch XPU UT
run: |
source /opt/intel/oneapi/compiler/latest/env/vars.sh
source activate xpu_op_${ZE_AFFINITY_MASK}
cd ../pytorch
TEST_REPORTS_DIR=$(pwd)/test/test-reports
rm -rf "$TEST_REPORTS_DIR" && mkdir -p "$TEST_REPORTS_DIR"
# Run Pytorch XPU binary UT
for xpu_case in build/bin/*{xpu,sycl}*; do
if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then
case_name=$(basename "$xpu_case")
echo "Testing ${case_name} ..."
"$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml
fi
done
# Run Pytorch XPU python UT
export PYTORCH_ENABLE_XPU_FALLBACK=1
sed -i 's/selected_tests = exclude_tests(XPU_BLOCKLIST.*/selected_tests = XPU_TEST/g' ./test/run_test.py
python test/run_test.py --xpu
uses: ./.github/workflows/_linux_ut.yml
with:
ut_suite: op_example,op_extended,op_ut,torch_xpu
runner: linux.idc.xpu

125 changes: 125 additions & 0 deletions src/ATen/native/xpu/Bucketization.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#include <ATen/native/BucketizationUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/xpu/sycl/BucketizationKernels.h>
#include <ATen/xpu/XPUNativeFunctions.h>

namespace at {

// Out-variant of searchsorted for a Tensor of search values: validates the
// arguments, resizes `result` to self.sizes(), then dispatches the XPU kernel.
// `side_opt` ("left"/"right") and the `right` flag both select the insertion
// side; pre_check rejects contradictory combinations (see comment below).
// Returns `result` to allow call chaining.
Tensor& XPUNativeFunctions::searchsorted_out(
    const Tensor& sorted_sequence,
    const Tensor& self,
    bool out_int32,
    bool right,
    const std::optional<c10::string_view> side_opt,
    const std::optional<Tensor>& sorter_opt,
    Tensor& result) {
  // See [Note: hacky wrapper removal for optional tensor]
  c10::MaybeOwned<Tensor> sorter_maybe_owned =
      at::borrow_from_optional_tensor(sorter_opt);
  const Tensor& sorter = *sorter_maybe_owned;
  // Argument validation must run before resize_output so a bad call never
  // touches `result`.
  at::native::searchsorted_pre_check(
      sorted_sequence, self, result, out_int32, right, side_opt, sorter);
  at::native::resize_output(result, self.sizes());

  // Nothing to search: result was already resized to an empty shape above.
  if (self.numel() == 0) {
    return result;
  }

  // we have two inputs to set right, pre_check checks that they aren't set to
  // opposites
  bool is_right = (side_opt && *side_opt == "right") || right;
  at::native::xpu::searchsorted_kernel(
      result, self, sorted_sequence, out_int32, is_right, sorter);
  return result;
}

// Out-variant of searchsorted for a Scalar search value: wraps the scalar in
// a tensor on the sequence's device and delegates to the Tensor overload.
Tensor& XPUNativeFunctions::searchsorted_out(
    const Tensor& sorted_sequence,
    const Scalar& self,
    bool out_int32,
    bool right,
    const std::optional<c10::string_view> side_opt,
    const std::optional<Tensor>& sorter_opt,
    Tensor& result) {
  auto self_tensor =
      at::native::searchsorted_scalar_tensor(self, sorted_sequence.device());
  return searchsorted_out(
      sorted_sequence,
      self_tensor,
      out_int32,
      right,
      side_opt,
      sorter_opt,
      result);
}

// Functional searchsorted for a Tensor of search values: allocates an empty
// output of the requested index dtype (int32 or int64) on self's device and
// lets searchsorted_out resize and fill it.
Tensor XPUNativeFunctions::searchsorted(
    const Tensor& sorted_sequence,
    const Tensor& self,
    bool out_int32,
    bool right,
    const std::optional<c10::string_view> side_opt,
    const std::optional<Tensor>& sorter) {
  const ScalarType idx_type = out_int32 ? ScalarType::Int : ScalarType::Long;
  Tensor result = at::empty(
      {0},
      TensorOptions().device(self.options().device()).dtype(idx_type),
      MemoryFormat::Contiguous);
  searchsorted_out(
      sorted_sequence, self, out_int32, right, side_opt, sorter, result);
  return result;
}

// Functional searchsorted for a Scalar search value: wraps the scalar in a
// tensor on the sequence's device and reuses the Tensor overload.
Tensor XPUNativeFunctions::searchsorted(
    const Tensor& sorted_sequence,
    const Scalar& self,
    bool out_int32,
    bool right,
    const std::optional<c10::string_view> side_opt,
    const std::optional<Tensor>& sorter) {
  auto self_tensor =
      at::native::searchsorted_scalar_tensor(self, sorted_sequence.device());
  return searchsorted(
      sorted_sequence, self_tensor, out_int32, right, side_opt, sorter);
}

// Out-variant of bucketize: searchsorted with the roles of the two tensors
// swapped (boundaries is the sorted sequence) and no side/sorter arguments.
// Requires a 1-D boundaries tensor.
Tensor& XPUNativeFunctions::bucketize_out(
    const Tensor& self,
    const Tensor& boundaries,
    bool out_int32,
    bool right,
    Tensor& result) {
  TORCH_CHECK(
      boundaries.dim() == 1,
      "boundaries tensor must be 1 dimension, but got dim(",
      boundaries.dim(),
      ")");
  return searchsorted_out(
      boundaries, self, out_int32, right, nullopt, nullopt, result);
}

// Functional bucketize for a Tensor input: allocates an empty output of the
// requested index dtype on self's device; bucketize_out performs the
// boundaries-rank check and resizes/fills the result.
Tensor XPUNativeFunctions::bucketize(
    const Tensor& self,
    const Tensor& boundaries,
    bool out_int32,
    bool right) {
  const ScalarType idx_type = out_int32 ? ScalarType::Int : ScalarType::Long;
  auto opts = TensorOptions().device(self.options().device()).dtype(idx_type);
  Tensor result = at::empty({0}, opts, MemoryFormat::Contiguous);
  bucketize_out(self, boundaries, out_int32, right, result);
  return result;
}

// Functional bucketize for a Scalar input: materializes the scalar as a
// tensor on boundaries' device, then reuses the Tensor overload.
Tensor XPUNativeFunctions::bucketize(
    const Scalar& self,
    const Tensor& boundaries,
    bool out_int32,
    bool right) {
  auto self_tensor =
      at::native::searchsorted_scalar_tensor(self, boundaries.device());
  return bucketize(self_tensor, boundaries, out_int32, right);
}
} // namespace at
8 changes: 1 addition & 7 deletions src/ATen/native/xpu/LinearAlgebra.cpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
#include <ATen/ATen.h>
#include <ATen/ExpandUtils.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/core/Tensor.h>
#include <ATen/core/op_registration/adaption.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/native/utils/ParamUtils.h>
#include <ATen/xpu/XPUNativeFunctions.h>

#include <ATen/native/xpu/sycl/LinearAlgebraKernels.h>
#include <ATen/native/xpu/sycl/ReduceNormKernel.h>
#include <ATen/xpu/XPUNativeFunctions.h>
#include <comm/RegisterUtils.h>

namespace at {
Expand Down
Loading

0 comments on commit c4056fe

Please sign in to comment.