Skip to content

Commit

Permalink
Merge branch 'main' into hjhee/asinh
Browse files — browse the repository at this point in the history
  • Loading branch information
fengyuan14 authored Jul 8, 2024
2 parents 1c64c60 + 1951fce commit 22faf93
Show file tree
Hide file tree
Showing 10 changed files with 236 additions and 83 deletions.
123 changes: 123 additions & 0 deletions .github/workflows/_linux_ut.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
name: inductor-xpu-ut-test

on:
  workflow_call:
    inputs:
      torch_xpu_ops_update:
        required: false
        type: string
        default: 'true'
        description: True means update xpu_ops when building pytorch, otherwise means not
      ut_suite:
        required: true
        type: string
        default: 'op_example,op_extended,op_ut,torch_xpu'
        description: op_example,op_extended,op_ut,torch_xpu. Delimiter is comma
      pytorch_branch:
        required: false
        type: string
        default: 'main'
        description: Set pytorch branch
      runner:
        required: true
        type: string
        default: 'linux.idc.xpu'
        description: Set runner

jobs:
  Inductor-XPU-UT-Tests:
    runs-on: ${{ inputs.runner }}
    timeout-minutes: 900
    steps:
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
      - name: Prepare Stock Pytorch
        run: |
          pwd
          cd ../ && rm -rf pytorch
          git clone -b ${{ inputs.pytorch_branch }} https://github.com/pytorch/pytorch
          cd pytorch && git log -n 1 && git submodule sync && git submodule update --init --recursive
          # Quote the expansion: unquoted, an empty input turned `[ -z ]` into a
          # one-argument test of the literal string "-z", which passed only by
          # accident of the POSIX test grammar.
          if [ -z "${{ inputs.torch_xpu_ops_update }}" ]; then
            rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
          else
            if [[ "${{ inputs.torch_xpu_ops_update }}" == 'true' ]]; then
              rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
            else
              echo "Not update torch-xpu-ops"
            fi
          fi
          # Workaround for torch-xpu-ops ci test
          sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
      - name: Build Pytorch XPU
        run: |
          which conda && conda clean -ay
          conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \
            rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK}
          conda create -n xpu_op_${ZE_AFFINITY_MASK} python=3.10 cmake ninja -y
          source activate xpu_op_${ZE_AFFINITY_MASK}
          conda install -c intel mkl-static mkl-include -y
          cd ../pytorch
          pip install -r requirements.txt
          export USE_XPU=1
          source /opt/intel/oneapi/compiler/latest/env/vars.sh
          export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
          python setup.py bdist_wheel
          pip install --force-reinstall dist/*.whl
          git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd ..
          pip install -r .ci/docker/requirements-ci.txt
      - name: Run XPU OP Examples
        if: contains(inputs.ut_suite, 'op_example')
        run: |
          cd ${{ github.workspace }}
          mkdir -p ut_log
          xpu-smi discovery
          source /opt/intel/oneapi/compiler/latest/env/vars.sh
          source activate xpu_op_${ZE_AFFINITY_MASK}
          cd ${{ github.workspace }}
          cd examples
          pip install pytest
          timeout 8000 pytest -v
      - name: Run XPU OP Extended UT
        if: contains(inputs.ut_suite, 'op_extended')
        run: |
          source /opt/intel/oneapi/compiler/latest/env/vars.sh
          source activate xpu_op_${ZE_AFFINITY_MASK}
          export PYTORCH_TEST_WITH_SLOW=1
          cd ../pytorch/third_party/torch-xpu-ops/test/xpu/extended/
          timeout 10000 python run_test_with_skip.py
      - name: Run XPU OP UT
        if: contains(inputs.ut_suite, 'op_ut')
        run: |
          source /opt/intel/oneapi/compiler/latest/env/vars.sh
          source activate xpu_op_${ZE_AFFINITY_MASK}
          export PYTORCH_ENABLE_XPU_FALLBACK=1
          export PYTORCH_TEST_WITH_SLOW=1
          cd ../pytorch/third_party/torch-xpu-ops/test/xpu
          timeout 10000 python run_test_with_skip.py
          # Cases run with an on-demand white list, since some suites are too
          # slow to go through all operators on CPU. So add cases on-demand
          # when XPU implementation is done.
          # test_foreach, test_decomp
          timeout 10000 python run_test_with_only.py
      - name: Run Torch XPU UT
        if: contains(inputs.ut_suite, 'torch_xpu')
        run: |
          source /opt/intel/oneapi/compiler/latest/env/vars.sh
          source activate xpu_op_${ZE_AFFINITY_MASK}
          cd ../pytorch
          TEST_REPORTS_DIR=$(pwd)/test/test-reports
          rm -rf "$TEST_REPORTS_DIR" && mkdir -p "$TEST_REPORTS_DIR"
          # Run Pytorch XPU binary UT
          for xpu_case in build/bin/*{xpu,sycl}*; do
            if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then
              case_name=$(basename "$xpu_case")
              echo "Testing ${case_name} ..."
              "$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml
            fi
          done
          # Run Pytorch XPU python UT
          export PYTORCH_ENABLE_XPU_FALLBACK=1
          sed -i 's/selected_tests = exclude_tests(XPU_BLOCKLIST.*/selected_tests = XPU_TEST/g' ./test/run_test.py
          python test/run_test.py --xpu
24 changes: 24 additions & 0 deletions .github/workflows/inductor_xpu_e2e_nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,21 @@ on:
type: string
default: ''
description: If set, will only launch this one
torch_xpu_ops_update:
required: false
type: string
default: 'true'
description: True means update xpu_ops when building pytorch, otherwise means not
ut_suite:
required: true
type: string
default: 'op_example,op_extended,op_ut,torch_xpu'
description: op_example,op_extended,op_ut,torch_xpu. Delimiter is comma
pytorch_branch:
required: false
type: string
default: 'main'
description: Set pytorch branch


permissions: read-all
Expand Down Expand Up @@ -244,6 +259,15 @@ jobs:
name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
path: ${{ github.workspace }}/upload_files

Inductor-XPU-UT-Nightly-Tests:
  # A non-empty ut_suite string is truthy; skip the job when no suite is set.
  if: ${{ inputs.ut_suite }}
  name: Nightly Inductor XPU UT Test
  uses: ./.github/workflows/_linux_ut.yml
  with:
    # Forward torch_xpu_ops_update: it is declared as a workflow input above
    # but was not passed through, so the reusable workflow always fell back
    # to its own default regardless of the dispatch value.
    torch_xpu_ops_update: ${{ inputs.torch_xpu_ops_update }}
    ut_suite: ${{ inputs.ut_suite }}
    pytorch_branch: ${{ inputs.pytorch_branch }}
    runner: linux.idc.xpu

Tests-Failure-And-Report:
if: always()
runs-on: pvc_e2e
Expand Down
86 changes: 5 additions & 81 deletions .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,84 +23,8 @@ jobs:
# Don't run on forked repos and draft PRs
if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }}
name: preci-ut
runs-on: linux.idc.xpu
timeout-minutes: 240
steps:
- name: Checkout torch-xpu-ops
uses: actions/checkout@v3
- name: Prepare Stock Pytorch
run: |
pwd
cd ../ && rm -rf pytorch
git clone -b main https://github.com/pytorch/pytorch
cd pytorch && git log -n 1 && git submodule sync && git submodule update --init --recursive
rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
# Workaround for torch-xpu-ops ci test
sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
- name: Build Pytorch XPU
run: |
which conda && conda clean -ay
conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \
rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK}
conda create -n xpu_op_${ZE_AFFINITY_MASK} python=3.10 cmake ninja -y
source activate xpu_op_${ZE_AFFINITY_MASK}
conda install -c intel mkl-static mkl-include -y
cd ../pytorch
pip install -r requirements.txt
export USE_XPU=1
source /opt/intel/oneapi/compiler/latest/env/vars.sh
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
python setup.py bdist_wheel
pip install --force-reinstall dist/*.whl
git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd ..
pip install -r .ci/docker/requirements-ci.txt
- name: Run XPU OP Examples
if: ${{ hashFiles('examples/') != '' }}
run: |
xpu-smi discovery
source /opt/intel/oneapi/compiler/latest/env/vars.sh
source activate xpu_op_${ZE_AFFINITY_MASK}
cd examples
pip install pytest
timeout 8000 pytest -v
- name: Run XPU OP Extended UT
if: ${{ hashFiles('test/xpu/') != '' }}
run: |
source /opt/intel/oneapi/compiler/latest/env/vars.sh
source activate xpu_op_${ZE_AFFINITY_MASK}
export PYTORCH_TEST_WITH_SLOW=1
cd ../pytorch/third_party/torch-xpu-ops/test/xpu/extended/
timeout 10000 python run_test_with_skip.py
- name: Run XPU OP UT
if: ${{ hashFiles('test/xpu/') != '' }}
run: |
source /opt/intel/oneapi/compiler/latest/env/vars.sh
source activate xpu_op_${ZE_AFFINITY_MASK}
export PYTORCH_ENABLE_XPU_FALLBACK=1
export PYTORCH_TEST_WITH_SLOW=1
cd ../pytorch/third_party/torch-xpu-ops/test/xpu
timeout 10000 python run_test_with_skip.py
# Cases run with a on-demand white list, since some suites are too
# slow to go through all operators on CPU. So add cases on-demand
# when XPU implementatoin is done.
# test_foreach, test_decomp
timeout 10000 python run_test_with_only.py
- name: Run Torch XPU UT
run: |
source /opt/intel/oneapi/compiler/latest/env/vars.sh
source activate xpu_op_${ZE_AFFINITY_MASK}
cd ../pytorch
TEST_REPORTS_DIR=$(pwd)/test/test-reports
rm -rf "$TEST_REPORTS_DIR" && mkdir -p "$TEST_REPORTS_DIR"
# Run Pytorch XPU binary UT
for xpu_case in build/bin/*{xpu,sycl}*; do
if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then
case_name=$(basename "$xpu_case")
echo "Testing ${case_name} ..."
"$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml
fi
done
# Run Pytorch XPU python UT
export PYTORCH_ENABLE_XPU_FALLBACK=1
sed -i 's/selected_tests = exclude_tests(XPU_BLOCKLIST.*/selected_tests = XPU_TEST/g' ./test/run_test.py
python test/run_test.py --xpu
uses: ./.github/workflows/_linux_ut.yml
with:
ut_suite: op_example,op_extended,op_ut,torch_xpu
runner: linux.idc.xpu

19 changes: 19 additions & 0 deletions src/ATen/native/xpu/TensorFactories.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,25 @@

namespace at {

// Square identity-matrix overload (eye.out): fills `result` with the
// n x n identity by delegating to the general (n, m) overload below.
Tensor& XPUNativeFunctions::eye_out(int64_t n, Tensor& result) {
  return XPUNativeFunctions::eye_out(n, n, result);
}

// Rectangular identity matrix (eye.m_out): resizes `result` to (n, m),
// zeroes it, then writes 1 along the main diagonal.
// Raises (TORCH_CHECK) if n or m is negative. Returns `result`.
Tensor& XPUNativeFunctions::eye_out(int64_t n, int64_t m, Tensor& result) {
  TORCH_CHECK(n >= 0, "n must be greater or equal to 0, got ", n);
  TORCH_CHECK(m >= 0, "m must be greater or equal to 0, got ", m);

  result.resize_({n, m});
  result.zero_();

  // The diagonal has min(n, m) entries; with n == 0 or m == 0 this is 0
  // and the fill below is a no-op.
  int64_t sz = std::min<int64_t>(n, m);
  // Advancing by stride(0) + stride(1) moves one step down AND one step
  // right, i.e. walks the main diagonal of the 2-D result.
  int64_t stride = result.stride(0) + result.stride(1);

  // View the diagonal as a 1-D strided tensor and set every entry to 1.
  Tensor diag = result.as_strided({sz}, {stride});
  diag.fill_(1);
  return result;
}

Tensor XPUNativeFunctions::empty(
IntArrayRef size,
c10::optional<ScalarType> dtype_opt,
Expand Down
36 changes: 36 additions & 0 deletions src/ATen/native/xpu/UnaryOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -630,4 +630,40 @@ Tensor& XPUNativeFunctions::cosh_out(const Tensor& self, Tensor& out) {
return out;
}

// Shared meta step for the ceil variants below: validates the input dtype
// and builds a borrowing unary TensorIterator mapping `self` -> `out`.
// Complex inputs are rejected here, so the dispatch path that follows only
// ever sees real dtypes.
TensorIterator ceil_meta(const Tensor& self, Tensor& out) {
  TORCH_CHECK(!self.is_complex(), "ceil is not supported for complex inputs");
  TensorIterator iter;
  iter.build_borrowing_unary_op(out, self);
  return iter;
}

// Functional ceil: returns a new tensor with every element rounded up to
// the nearest integer.
Tensor XPUNativeFunctions::ceil(const Tensor& self) {
  // ceil of an integral tensor is the identity, so a clone suffices and
  // the kernel launch is skipped entirely (bool excluded from this path).
  if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) {
    return self.clone();
  }
  // `out` is left undefined; the iterator allocates the output, which is
  // then retrieved via iter.output().
  Tensor out;
  auto iter = ceil_meta(self, out);
  native::xpu::ceil_kernel(iter);
  return iter.output();
}

// In-place ceil: rounds every element of `self` up and returns `self`.
Tensor& XPUNativeFunctions::ceil_(Tensor& self) {
  // Integral tensors are already their own ceil; nothing to do.
  if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) {
    return self;
  }
  // Passing `self` as both input and output makes the iterator operate
  // in place.
  auto iter = ceil_meta(self, self);
  native::xpu::ceil_kernel(iter);
  return self;
}

// Out-variant ceil (ceil.out): writes the elementwise ceiling of `self`
// into the caller-provided `out` and returns `out`.
Tensor& XPUNativeFunctions::ceil_out(const Tensor& self, Tensor& out) {
  // For integral inputs ceil is the identity; copy into `out` so the
  // caller's tensor identity is preserved, and skip the kernel.
  if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) {
    out.copy_(self);
    return out;
  }
  auto iter = ceil_meta(self, out);
  native::xpu::ceil_kernel(iter);
  return out;
}

} // namespace at
2 changes: 0 additions & 2 deletions src/ATen/native/xpu/XPUFallback.template
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
"bitwise_right_shift.Tensor_out",
"cauchy_",
"_cdist_backward",
"ceil.out",
"channel_shuffle",
"cholesky",
"cholesky_inverse",
Expand All @@ -198,7 +197,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
"exp2.out",
"expm1.out",
"exponential_",
"eye.m_out",
"_fft_c2c",
"_fft_c2r",
"_fft_r2c",
Expand Down
21 changes: 21 additions & 0 deletions src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,25 @@ void reciprocal_kernel(TensorIteratorBase& iter) {
[&]() { gpu_kernel(iter, ReciprocalFunctor<scalar_t>()); });
}

// Elementwise ceiling functor: rounds a single value up to the nearest
// integral value via std::ceil.
template <typename scalar_t>
struct CeilFunctor {
  scalar_t operator()(const scalar_t value) const {
    return std::ceil(value);
  }
};

// Complex specialization: applies ceil independently to the real and
// imaginary components.
// NOTE(review): ceil_meta in UnaryOps.cpp rejects complex inputs and
// ceil_kernel below dispatches over floating types plus Half/BFloat16 only,
// so this specialization appears unreachable through the current ceil path —
// confirm whether another caller needs it before relying on it.
template <typename T>
struct CeilFunctor<c10::complex<T>> {
  c10::complex<T> operator()(const c10::complex<T> a) const {
    return c10::complex<T>(std::ceil(a.real()), std::ceil(a.imag()));
  }
};

// Launches the elementwise ceil GPU kernel for the iterator's dtype.
// Dispatch covers the floating-point types plus Half and BFloat16; integral
// inputs are short-circuited by the callers in UnaryOps.cpp and never reach
// this kernel.
void ceil_kernel(TensorIteratorBase& iter) {
  AT_DISPATCH_FLOATING_TYPES_AND2(
      ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "ceil_xpu", [&]() {
        gpu_kernel(iter, CeilFunctor<scalar_t>());
      });
}

} // namespace at::native::xpu
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/UnaryFractionKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@ namespace at::native::xpu {

void reciprocal_kernel(TensorIteratorBase& iter);

void ceil_kernel(TensorIteratorBase& iter);

} // namespace at::native::xpu
1 change: 1 addition & 0 deletions test/xpu/xpu_test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

_xpu_computation_op_list = [
"empty",
"eye",
"fill",
"zeros",
"zeros_like",
Expand Down
5 changes: 5 additions & 0 deletions yaml/xpu_functions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,8 @@ supported:
- exp_
- empty.memory_format
- empty_strided
- eye.out
- eye.m_out
- _efficientzerotensor
- complex.out
- clone
Expand Down Expand Up @@ -518,3 +520,6 @@ supported:
- randperm.generator_out
- _amp_foreach_non_finite_check_and_unscale_
- _amp_update_scale_
- ceil
- ceil_
- ceil.out

0 comments on commit 22faf93

Please sign in to comment.