merge main (#1051)

chunhuanMeng authored Nov 6, 2024
2 parents 69ebbed + fda86cf, commit 0c0c6cd
Showing 101 changed files with 6,843 additions and 227 deletions.
4 changes: 0 additions & 4 deletions .github/scripts/apply_torch_pr.py
@@ -11,10 +11,6 @@
default=[
# Fallback to CPU for XPU FP64
"https://github.com/pytorch/pytorch/pull/126516",
# Enable deterministic for mkldnn ops
"https://github.com/pytorch/pytorch/pull/127277",
# [Inductor][Intel GPU] Support reduction split.
"https://github.com/pytorch/pytorch/pull/129120",
# Modify the tolerance level in TIMM benchmark
"https://github.com/pytorch/pytorch/pull/129735",
# [Intel GPU] Allow XPU device in cdist and pdist operators
4 changes: 3 additions & 1 deletion .github/workflows/_linux_ut.yml
@@ -66,7 +66,7 @@ jobs:
source activate xpu_op_${ZE_AFFINITY_MASK}
cd ../ && rm -rf pytorch
git clone https://github.com/pytorch/pytorch pytorch
cd pytorch && git checkout $(echo ${{ env.pytorch }} |sed 's/^nightly_wheel$/nightly/')
cd pytorch && git checkout $(echo ${{ inputs.pytorch }} |sed 's/^nightly_wheel$/nightly/')
# apply PRs for stock pytorch
pip install requests
python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
@@ -125,6 +125,8 @@ jobs:
cd ..
python pytorch/torch/utils/collect_env.py
rm -rf /tmp/torchinductor_*
rm -rf ~/.triton/cache
- name: Run XPU OP Examples
if: contains(inputs.ut, 'op_regression') || github.event_name == 'schedule'
run: |
1 change: 1 addition & 0 deletions .github/workflows/nightly_ondemand.yml
@@ -186,6 +186,7 @@ jobs:
echo "$GITHUB_ENV"
rm -rf ../pytorch/inductor_log
rm -rf /tmp/torchinductor_*
rm -rf ~/.triton/cache
# Nightly launch
- name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test
3 changes: 2 additions & 1 deletion cmake/BuildFlags.cmake
@@ -50,7 +50,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
if(USE_PER_OPERATOR_HEADERS)
list(APPEND SYCL_HOST_FLAGS -DAT_PER_OPERATOR_HEADERS)
endif()

list(APPEND SYCL_HOST_FLAGS -D__INTEL_LLVM_COMPILER_VERSION=${__INTEL_LLVM_COMPILER})
# -- Kernel flags (SYCL_KERNEL_OPTIONS)
# The fast-math will be enabled by default in SYCL compiler.
# Refer to [https://clang.llvm.org/docs/UsersManual.html#cmdoption-fno-fast-math]
@@ -89,6 +89,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D__INTEL_PREVIEW_BREAKING_CHANGES)
set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI})
endif()
set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D__INTEL_LLVM_COMPILER_VERSION=${__INTEL_LLVM_COMPILER})

CHECK_SYCL_FLAG("-fsycl-fp64-conv-emu" SUPPORTS_FP64_CONV_EMU)
if(SUPPORTS_FP64_CONV_EMU)
57 changes: 46 additions & 11 deletions cmake/Modules/FindSYCLToolkit.cmake
@@ -35,6 +35,7 @@ endif()
if(SYCLTOOLKIT_FOUND)
return()
endif()

set(SYCLTOOLKIT_FOUND TRUE)

include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake)
@@ -77,7 +78,7 @@ endif()

# Function to write a test case to verify SYCL features.

function(SYCL_CMPLR_TEST_WRITE src)
function(SYCL_CMPLR_TEST_WRITE src macro_name)

set(cpp_macro_if "#if")
set(cpp_macro_endif "#endif")
@@ -88,8 +89,8 @@ function(SYCL_CMPLR_TEST_WRITE src)

# Feature tests goes here

string(APPEND SYCL_CMPLR_TEST_CONTENT "${cpp_macro_if} defined(SYCL_LANGUAGE_VERSION)\n")
string(APPEND SYCL_CMPLR_TEST_CONTENT "cout << \"SYCL_LANGUAGE_VERSION=\"<<SYCL_LANGUAGE_VERSION<<endl;\n")
string(APPEND SYCL_CMPLR_TEST_CONTENT "${cpp_macro_if} defined(${macro_name})\n")
string(APPEND SYCL_CMPLR_TEST_CONTENT "cout << \"${macro_name}=\"<<${macro_name}<<endl;\n")
string(APPEND SYCL_CMPLR_TEST_CONTENT "${cpp_macro_endif}\n")

string(APPEND SYCL_CMPLR_TEST_CONTENT "return 0;}\n")
@@ -103,6 +104,7 @@ endfunction()
function(SYCL_CMPLR_TEST_BUILD error TEST_SRC_FILE TEST_EXE)

set(SYCL_CXX_FLAGS_LIST "${SYCL_CXX_FLAGS}")
string(REPLACE "-Wno-stringop-overflow" "" SYCL_CXX_FLAGS_LIST "${SYCL_CXX_FLAGS_LIST}")
separate_arguments(SYCL_CXX_FLAGS_LIST)

execute_process(
@@ -150,19 +152,19 @@ function(SYCL_CMPLR_TEST_RUN error TEST_EXE)

endfunction()

function(SYCL_CMPLR_TEST_EXTRACT test_output)
function(SYCL_CMPLR_TEST_EXTRACT test_output macro_name)

string(REGEX REPLACE "\n" ";" test_output_list "${test_output}")

set(SYCL_LANGUAGE_VERSION "")
set(${macro_name} "")
foreach(strl ${test_output_list})
if(${strl} MATCHES "^SYCL_LANGUAGE_VERSION=([A-Za-z0-9_]+)$")
string(REGEX REPLACE "^SYCL_LANGUAGE_VERSION=" "" extracted_sycl_lang "${strl}")
set(SYCL_LANGUAGE_VERSION ${extracted_sycl_lang})
if(${strl} MATCHES "^${macro_name}=([A-Za-z0-9_]+)$")
string(REGEX REPLACE "^${macro_name}=" "" extracted_sycl_lang "${strl}")
set(${macro_name} ${extracted_sycl_lang})
endif()
endforeach()

set(SYCL_LANGUAGE_VERSION "${SYCL_LANGUAGE_VERSION}" PARENT_SCOPE)
set(${macro_name} "${extracted_sycl_lang}" PARENT_SCOPE)
endfunction()

set(SYCL_FLAGS "")
@@ -189,7 +191,7 @@ if(${has_werror} EQUAL -1)
# Create the test source file
set(TEST_SRC_FILE "${SYCL_CMPLR_TEST_DIR}/sycl_features.cpp")
set(TEST_EXE "${TEST_SRC_FILE}.exe")
SYCL_CMPLR_TEST_WRITE(${TEST_SRC_FILE})
SYCL_CMPLR_TEST_WRITE(${TEST_SRC_FILE} "SYCL_LANGUAGE_VERSION")

# Build the test and create test executable
SYCL_CMPLR_TEST_BUILD(error ${TEST_SRC_FILE} ${TEST_EXE})
@@ -204,7 +206,7 @@ if(${has_werror} EQUAL -1)
endif()

# Extract test output for information
SYCL_CMPLR_TEST_EXTRACT(${test_output})
SYCL_CMPLR_TEST_EXTRACT(${test_output} "SYCL_LANGUAGE_VERSION")

# As per specification, all the SYCL compatible compilers should
# define macro SYCL_LANGUAGE_VERSION
@@ -221,5 +223,38 @@ if(${has_werror} EQUAL -1)
set(SYCL_LANGUAGE_VERSION "${SYCL_LANGUAGE_VERSION}" CACHE STRING "SYCL Language version")
endif()

# Create a clean working directory.
set(SYCL_CMPLR_TEST_DIR "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/TESTSYCLCMPLR")
file(REMOVE_RECURSE ${SYCL_CMPLR_TEST_DIR})
file(MAKE_DIRECTORY ${SYCL_CMPLR_TEST_DIR})
# Create the test source file
set(TEST_SRC_FILE "${SYCL_CMPLR_TEST_DIR}/llvm_features.cpp")
set(TEST_EXE "${TEST_SRC_FILE}.exe")
SYCL_CMPLR_TEST_WRITE(${TEST_SRC_FILE} "__INTEL_LLVM_COMPILER")
# Build the test and create test executable
SYCL_CMPLR_TEST_BUILD(error ${TEST_SRC_FILE} ${TEST_EXE})
if(error)
message(FATAL_ERROR "Can not build SYCL_CMPLR_TEST")
endif()
# Execute the test to extract information
SYCL_CMPLR_TEST_RUN(error ${TEST_EXE})
if(error)
message(FATAL_ERROR "Can not run SYCL_CMPLR_TEST")
endif()
# Extract test output for information
SYCL_CMPLR_TEST_EXTRACT(${test_output} "__INTEL_LLVM_COMPILER")

# Check whether the value of __INTEL_LLVM_COMPILER macro was successfully extracted
string(COMPARE EQUAL "${__INTEL_LLVM_COMPILER}" "" nosycllang)
if(nosycllang)
set(SYCLTOOLKIT_FOUND False)
set(SYCL_REASON_FAILURE "Can not find __INTEL_LLVM_COMPILER")
set(SYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}")
endif()


# Include in Cache
set(__INTEL_LLVM_COMPILER "${__INTEL_LLVM_COMPILER}" CACHE STRING "Intel llvm compiler")

message(DEBUG "The SYCL compiler is ${SYCL_COMPILER}")
message(DEBUG "The SYCL Flags are ${SYCL_FLAGS}")
11 changes: 11 additions & 0 deletions src/ATen/native/xpu/AiryAi.cpp
@@ -0,0 +1,11 @@
#include <ATen/native/DispatchStub.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/UnaryOps.h>
#include <ATen/native/xpu/sycl/AiryAiKernel.h>

namespace at {
namespace native {
REGISTER_XPU_DISPATCH(special_airy_ai_stub, &xpu::airy_ai_kernel);

} // namespace native
} // namespace at
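With the stub registered, the operator is reachable through the ordinary ATen entry point; a minimal sketch, assuming an XPU-enabled PyTorch build (not part of this commit):

// Minimal sketch: calling the newly wired Airy function on an XPU tensor.
// Assumes an XPU-enabled ATen build; not part of this commit.
#include <ATen/ATen.h>

int main() {
  at::Tensor x = at::linspace(-2.0, 2.0, 8,
      at::TensorOptions().dtype(at::kFloat).device(at::kXPU));
  at::Tensor y = at::special_airy_ai(x);  // dispatches to xpu::airy_ai_kernel
  return 0;
}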
33 changes: 26 additions & 7 deletions src/ATen/native/xpu/Bessel.cpp
@@ -1,7 +1,7 @@
#include <ATen/native/UnaryOps.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/UnaryOps.h>
#include <ATen/native/xpu/sycl/BesselJ0Kernel.h>
#include <ATen/native/xpu/sycl/BesselJ1Kernel.h>
#include <ATen/native/xpu/sycl/BesselY0Kernel.h>
@@ -10,6 +10,8 @@
#include <ATen/native/xpu/sycl/ModifiedBesselI1Kernel.h>
#include <ATen/native/xpu/sycl/ModifiedBesselK0Kernel.h>
#include <ATen/native/xpu/sycl/ModifiedBesselK1Kernel.h>
#include <ATen/native/xpu/sycl/ScaledModifiedBesselK0Kernel.h>
#include <ATen/native/xpu/sycl/ScaledModifiedBesselK1Kernel.h>
#include <ATen/native/xpu/sycl/SphericalBesselJ0Kernel.h>

namespace at {
@@ -18,10 +20,27 @@ REGISTER_XPU_DISPATCH(special_bessel_j0_stub, &xpu::bessel_j0_kernel);
REGISTER_XPU_DISPATCH(special_bessel_j1_stub, &xpu::bessel_j1_kernel);
REGISTER_XPU_DISPATCH(special_bessel_y0_stub, &xpu::bessel_y0_kernel);
REGISTER_XPU_DISPATCH(special_bessel_y1_stub, &xpu::bessel_y1_kernel);
REGISTER_XPU_DISPATCH(special_modified_bessel_i0_stub, &xpu::modified_bessel_i0_kernel);
REGISTER_XPU_DISPATCH(special_modified_bessel_i1_stub, &xpu::modified_bessel_i1_kernel);
REGISTER_XPU_DISPATCH(special_modified_bessel_k0_stub, &xpu::modified_bessel_k0_kernel);
REGISTER_XPU_DISPATCH(special_modified_bessel_k1_stub, &xpu::modified_bessel_k1_kernel);
REGISTER_XPU_DISPATCH(special_spherical_bessel_j0_stub, &xpu::spherical_bessel_j0_kernel);
REGISTER_XPU_DISPATCH(
special_modified_bessel_i0_stub,
&xpu::modified_bessel_i0_kernel);
REGISTER_XPU_DISPATCH(
special_modified_bessel_i1_stub,
&xpu::modified_bessel_i1_kernel);
REGISTER_XPU_DISPATCH(
special_modified_bessel_k0_stub,
&xpu::modified_bessel_k0_kernel);
REGISTER_XPU_DISPATCH(
special_modified_bessel_k1_stub,
&xpu::modified_bessel_k1_kernel);
REGISTER_XPU_DISPATCH(
special_spherical_bessel_j0_stub,
&xpu::spherical_bessel_j0_kernel);
REGISTER_XPU_DISPATCH(
special_scaled_modified_bessel_k0_stub,
&xpu::scaled_modified_bessel_k0_kernel);
REGISTER_XPU_DISPATCH(
special_scaled_modified_bessel_k1_stub,
&xpu::scaled_modified_bessel_k1_kernel);

} // namespace native
} // namespace at
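The two scaled variants become callable through their ATen wrappers once the stubs are registered; a hedged sketch (XPU-enabled build assumed, not part of this commit):

// Sketch: the newly registered scaled modified Bessel ops via ATen.
// Assumes an XPU-enabled build; not part of this commit.
#include <ATen/ATen.h>

int main() {
  auto opts = at::TensorOptions().dtype(at::kFloat).device(at::kXPU);
  at::Tensor x = at::rand({4}, opts) + 0.5;  // keep arguments positive
  at::Tensor k0e = at::special_scaled_modified_bessel_k0(x);
  at::Tensor k1e = at::special_scaled_modified_bessel_k1(x);
  return 0;
}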
11 changes: 9 additions & 2 deletions src/ATen/native/xpu/BinaryOps.cpp
@@ -14,17 +14,19 @@
#include <ATen/native/xpu/sycl/BinaryMiscOpsKernels.h>
#include <ATen/native/xpu/sycl/BinaryRemainderKernel.h>
#include <ATen/native/xpu/sycl/BinaryShiftOpsKernels.h>
#include <ATen/native/xpu/sycl/ChebyshevPolynomialKernels.h>
#include <ATen/native/xpu/sycl/CopysignKernel.h>
#include <ATen/native/xpu/sycl/GcdLcmKernels.h>
#include <ATen/native/xpu/sycl/HermitePolynomialHKernel.h>
#include <ATen/native/xpu/sycl/HermitePolynomialHeKernel.h>
#include <ATen/native/xpu/sycl/IGammaKernel.h>
#include <ATen/native/xpu/sycl/LaguerrePolynomialLKernel.h>
#include <ATen/native/xpu/sycl/LegendrePolynomialPKernel.h>
#include <ATen/native/xpu/sycl/LogAddExpKernels.h>
#include <ATen/native/xpu/sycl/MaxMinElementwiseKernels.h>
#include <ATen/native/xpu/sycl/StepKernels.h>
#include <ATen/native/xpu/sycl/ChebyshevPolynomialKernels.h>
#include <ATen/native/xpu/sycl/ShiftedChebyshevPolynomialKernels.h>
#include <ATen/native/xpu/sycl/StepKernels.h>
#include <ATen/native/xpu/sycl/ZetaKernel.h>

namespace at {
namespace native {
@@ -47,7 +49,10 @@ REGISTER_XPU_DISPATCH(maximum_stub, &xpu::maximum_kernel);
REGISTER_XPU_DISPATCH(minimum_stub, &xpu::minimum_kernel);
REGISTER_XPU_DISPATCH(sigmoid_backward_stub, &xpu::sigmoid_backward_kernel);
REGISTER_XPU_DISPATCH(nextafter_stub, &xpu::nextafter_kernel);
REGISTER_XPU_DISPATCH(heaviside_stub, &xpu::heaviside_kernel);
REGISTER_XPU_DISPATCH(hypot_stub, &xpu::hypot_kernel);
REGISTER_XPU_DISPATCH(igamma_stub, &xpu::igamma_kernel);
REGISTER_XPU_DISPATCH(igammac_stub, &xpu::igammac_kernel);
REGISTER_XPU_DISPATCH(atan2_stub, &xpu::atan2_kernel);
REGISTER_XPU_DISPATCH(copysign_stub, &xpu::copysign_kernel);
REGISTER_XPU_DISPATCH(logical_and_stub, &xpu::logical_and_kernel);
@@ -61,6 +66,8 @@ REGISTER_XPU_DISPATCH(fmin_stub, &xpu::fmin_kernel);
REGISTER_XPU_DISPATCH(lshift_stub, &xpu::lshift_kernel);
REGISTER_XPU_DISPATCH(rshift_stub, &xpu::rshift_kernel);
REGISTER_XPU_DISPATCH(xlogy_stub, &xpu::xlogy_kernel);
REGISTER_XPU_DISPATCH(xlog1py_stub, &xpu::xlog1py_kernel);
REGISTER_XPU_DISPATCH(zeta_stub, &xpu::zeta_kernel);
REGISTER_XPU_DISPATCH(
hermite_polynomial_h_stub,
&xpu::hermite_polynomial_h_kernel);
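Each added registration exposes the corresponding binary op on XPU tensors; for instance, a sketch using two of the new stubs (assumes an XPU-enabled build, not part of this commit):

// Sketch: newly dispatched binary ops (igamma, zeta) on XPU tensors.
// Assumes an XPU-enabled ATen build; not part of this commit.
#include <ATen/ATen.h>

int main() {
  auto opts = at::TensorOptions().dtype(at::kFloat).device(at::kXPU);
  at::Tensor a = at::rand({4}, opts) + 1.0;
  at::Tensor b = at::rand({4}, opts) + 1.0;
  at::Tensor g = at::igamma(a, b);        // igamma_stub -> xpu::igamma_kernel
  at::Tensor z = at::special_zeta(a, b);  // zeta_stub   -> xpu::zeta_kernel
  return 0;
}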
19 changes: 19 additions & 0 deletions src/ATen/native/xpu/Distributions.cpp
@@ -57,6 +57,14 @@ Tensor _s_binomial_xpu(
return ret;
}

Tensor _s_gamma_xpu(const Tensor& alpha, c10::optional<Generator> gen_) {
auto gen = get_generator_or_default<at::XPUGeneratorImpl>(
gen_, at::xpu::detail::getDefaultXPUGenerator());
Tensor ret = at::empty(alpha.sizes(), alpha.options());
xpu::launch_gamma_kernel(ret, alpha, gen);
return ret;
}

Tensor _sample_dirichlet_xpu(
const Tensor& alpha,
std::optional<Generator> generator) {
@@ -74,6 +82,17 @@ Tensor _sample_dirichlet_xpu(
return ret;
}

Tensor _standard_gamma_grad_xpu(const Tensor& self, const Tensor& output) {
Tensor ret = at::empty(self.sizes(), self.options());
TensorIterator iter = TensorIteratorConfig()
.add_output(ret)
.add_input(self)
.add_input(output)
.build();
xpu::launch_standard_gamma_grad_kernel(iter);
return ret;
}

Tensor _dirichlet_grad_xpu(
const Tensor& x,
const Tensor& alpha,
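_s_gamma_xpu and _standard_gamma_grad_xpu together back gamma sampling and its pathwise (implicit reparameterization) gradient. A rough usage sketch through the internal ATen entry points (names assumed from stock PyTorch; XPU-enabled build required, not part of this commit):

// Sketch: gamma sampling and its reparameterization gradient on XPU.
// Uses internal ATen ops; assumes an XPU-enabled build. Not part of this commit.
#include <ATen/ATen.h>

int main() {
  at::Tensor alpha = at::full({4}, 2.5,
      at::TensorOptions().dtype(at::kFloat).device(at::kXPU));
  at::Tensor sample = at::_standard_gamma(alpha);             // -> _s_gamma_xpu
  at::Tensor grad = at::_standard_gamma_grad(alpha, sample);  // -> _standard_gamma_grad_xpu
  return 0;
}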
1 change: 0 additions & 1 deletion src/ATen/native/xpu/Embedding.cpp
@@ -35,6 +35,5 @@ Tensor& embedding_renorm_xpu_(
self, indices, max_norm, norm_type);
}


} // namespace native
} // namespace at
22 changes: 22 additions & 0 deletions src/ATen/native/xpu/ForeachOpList.cpp
@@ -4,12 +4,15 @@
#include <ATen/ops/_foreach_addcmul_native.h>
#include <ATen/ops/_foreach_clamp_max_native.h>
#include <ATen/ops/_foreach_clamp_min_native.h>
#include <ATen/ops/_foreach_copy_native.h>
#include <ATen/ops/_foreach_div_native.h>
#include <ATen/ops/_foreach_lerp_native.h>
#include <ATen/ops/_foreach_mul_native.h>
#include <ATen/ops/_foreach_pow_native.h>
#include <ATen/ops/_foreach_sub_native.h>

#include <ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h>
#include <ATen/native/xpu/sycl/ForeachCopyKernels.h>
#include <ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h>
#include <ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h>

@@ -65,6 +68,7 @@ namespace native {
}

FOREACH_BINARY_OP_LIST_ALPHA(add);
FOREACH_BINARY_OP_LIST_ALPHA(sub);
FOREACH_BINARY_OP_LIST(mul, false);
FOREACH_BINARY_OP_LIST(div, true);
FOREACH_BINARY_OP_LIST(clamp_max, true);
@@ -147,5 +151,23 @@ void foreach_tensor_lerp_ternary_xpu_(
}
}

void foreach_tensor_copy_list_kernel_xpu_(
TensorList self,
TensorList src,
bool non_blocking) {
check_foreach_api_restrictions(self, src);
if (!can_use_fast_route(
self, src, /* does_op_promote_integer_inputs_to_float */ false)) {
return foreach_tensor_copy_list_kernel_slow_(self, src, non_blocking);
}

xpu::foreach_copy_list_kernel_(self, src);

// increment_version
for (const auto& t : self) {
t.unsafeGetTensorImpl()->bump_version();
}
}

} // namespace native
} // namespace at
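The new list-copy entry validates the foreach API restrictions, takes the fused multi-tensor path when the fast-route check passes, and otherwise falls back to the slow per-tensor loop; bumping each destination's version afterwards keeps autograd's mutation tracking consistent. A usage sketch (XPU-enabled build assumed, not part of this commit):

// Sketch: fused multi-tensor copy through the _foreach API.
// Assumes an XPU-enabled build; not part of this commit.
#include <ATen/ATen.h>
#include <vector>

int main() {
  auto opts = at::TensorOptions().dtype(at::kFloat).device(at::kXPU);
  std::vector<at::Tensor> dst = {at::zeros({8}, opts), at::zeros({8}, opts)};
  std::vector<at::Tensor> src = {at::ones({8}, opts), at::ones({8}, opts)};
  at::_foreach_copy_(dst, src, /*non_blocking=*/false);  // fast route if dtypes/shapes match
  return 0;
}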