Sync main into release/2.6 branch #1117

Merged: 5 commits, Nov 22, 2024

2 changes: 2 additions & 0 deletions .github/scripts/apply_torch_pr.py
@@ -13,6 +13,8 @@
"https://github.com/pytorch/pytorch/pull/126516",
# Modify the tolerance level in TIMM benchmark
"https://github.com/pytorch/pytorch/pull/129735",
# [XPU] Update XPU C Shim Header
"https://github.com/pytorch/pytorch/pull/141086",
]
)
parser.add_argument('--extra-pr-list', '-e', nargs='+',default=[])
3 changes: 2 additions & 1 deletion .github/scripts/env.sh
@@ -1,3 +1,4 @@
#!/bin/bash
source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh
source /opt/intel/oneapi/compiler/latest/env/vars.sh
source /opt/intel/oneapi/umf/latest/env/vars.sh
source /opt/intel/oneapi/pti/latest/env/vars.sh
2 changes: 1 addition & 1 deletion .github/workflows/_linux_ut.yml
@@ -97,7 +97,7 @@ jobs:
run: |
source activate xpu_op_${ZE_AFFINITY_MASK}
source .github/scripts/env.sh
pip install mkl-static mkl-include
pip install mkl-static==2025.0.1 mkl-include==2025.0.1
cd ../pytorch
if [[ ${{ inputs.abi }} == '0' ]]; then
export _GLIBCXX_USE_CXX11_ABI=0
2 changes: 1 addition & 1 deletion .github/workflows/nightly_ondemand.yml
@@ -123,7 +123,7 @@ jobs:
conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
conda create -n e2e_ci python=${{ env.python }} cmake ninja -y
source activate e2e_ci
pip install mkl-static mkl-include
pip install mkl-static==2025.0.1 mkl-include==2025.0.1
pip install pandas scipy tqdm
- name: Prepare Stock Pytorch
run: |
2 changes: 1 addition & 1 deletion .github/workflows/nightly_ondemand_rolling.yml
@@ -125,7 +125,7 @@ jobs:
conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
conda create -n e2e_ci python=${{ env.python }} cmake ninja -y
source activate e2e_ci
pip install mkl-static mkl-include
pip install mkl-static==2025.0.1 mkl-include==2025.0.1
pip install pandas scipy tqdm
- name: Prepare Stock Pytorch
run: |
2 changes: 1 addition & 1 deletion .github/workflows/nightly_ondemand_whl.yml
@@ -98,7 +98,7 @@ jobs:
conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
conda create -n e2e_ci python=${{ env.python }} cmake ninja -y
source activate e2e_ci
pip install mkl-static mkl-include
pip install mkl-static==2025.0.1 mkl-include==2025.0.1
pip install pandas scipy tqdm
- name: Prepare Stock Pytorch
run: |
2 changes: 1 addition & 1 deletion cmake/BuildFlags.cmake
@@ -122,7 +122,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "-options '${SYCL_OFFLINE_COMPILER_CG_OPTIONS}'")

if(WIN32)
set(AOT_TARGETS "ats-m150,lnl-m,mtl-u,mtl-h")
set(AOT_TARGETS "ats-m150,mtl-u,mtl-h,xe2-lpg,xe2-hpg")
else()
set(AOT_TARGETS "pvc,xe-lpg,ats-m150")
endif()
10 changes: 7 additions & 3 deletions src/ATen/native/xpu/sycl/MultiTensorApply.h
@@ -68,7 +68,7 @@ static inline int64_t multi_tensor_apply_fused_kernel_get_chunk_size() {
}

template <typename T, typename Y, typename U, typename... ArgTypes>
struct MultiTensorApplyKernelFunctor {
struct MultiTensorApplyKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
void operator()(sycl::nd_item<1> item_id) const {
// Expand the tuple elements manually and call the callable
expandAndCall(item_id, std::index_sequence_for<ArgTypes...>());
@@ -85,6 +85,12 @@ struct MultiTensorApplyKernelFunctor {
callable(callable_),
args(std::make_tuple(args_...)) {}

void sycl_ker_config_convention(sycl::handler& cgh) {
if constexpr (std::is_base_of_v<__SYCL_KER_CONFIG_CONVENTION__, U>) {
callable.sycl_ker_config_convention(cgh);
}
}

private:
template <std::size_t... Indices>
void expandAndCall(sycl::nd_item<1> item_id, std::index_sequence<Indices...>)
@@ -117,7 +123,6 @@ void launch_multi_tensor_apply_kernel(
U callable,
int num_wg,
ArgTypes... args) {

auto& q = getCurrentSYCLQueue();
int64_t simd = syclMaxSubGroupSize();
int64_t max_wg_size = multi_tensor_apply_kernel_get_wg_size(simd);
@@ -226,7 +231,6 @@ void multi_tensor_apply(
std::vector<std::vector<at::Tensor>>& tensor_lists,
T callable,
ArgTypes... args) {

TORCH_CHECK(
tensor_lists.size() == depth,
"Number of tensor lists has to match he depth");
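
The new hook makes the wrapper composable: MultiTensorApplyKernelFunctor now derives from the project's __SYCL_KER_CONFIG_CONVENTION__ marker and, through the if constexpr check above, forwards sycl_ker_config_convention to the wrapped callable when that callable opts in as well. Below is a minimal sketch of such a callable, assuming the torch-xpu-ops headers that define the marker; the functor name and its use of local memory are hypothetical illustrations, not part of this PR.

#include <sycl/sycl.hpp>

// Hypothetical callable that opts into the convention. Only the marker
// base class and the hook signature are taken from the diff above.
struct ExampleReduceFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
  void operator()(sycl::nd_item<1> item) const {
    // The kernel body would use scratch_ as work-group-local storage.
  }

  // Runs on the host while the command group is composed, so the callable
  // can request per-launch resources such as local memory.
  void sycl_ker_config_convention(sycl::handler& cgh) {
    scratch_ = sycl::local_accessor<float, 1>(sycl::range<1>(wg_size_), cgh);
  }

  explicit ExampleReduceFunctor(size_t wg_size) : wg_size_(wg_size) {}

 private:
  size_t wg_size_;
  sycl::local_accessor<float, 1> scratch_;
};

Callables that do not derive from the marker are unaffected: the if constexpr branch compiles away and no hook is called.
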
61 changes: 38 additions & 23 deletions src/ATen/native/xpu/sycl/ScatterGatherKernels.cpp
@@ -143,41 +143,64 @@ struct alignas(N) OpaqueType {
char data[N];
};

template <int work_group_size, int thread_work_size, typename func_t>
template <typename func_t>
struct ScatterGatherElementwiseKernelFunctor {
void operator()(sycl::nd_item<1> item) const {
constexpr int nv = work_group_size * thread_work_size;
int nv = work_group_size_ * thread_work_size_;
auto wg_id = item.get_group_linear_id();
auto local_id = item.get_local_linear_id();
int idx = nv * wg_id + local_id;
#pragma unroll
for (int i = 0; i < thread_work_size; ++i) {
for (int i = 0; i < thread_work_size_; ++i) {
if (idx < N_) {
f_(idx);
idx += work_group_size;
idx += work_group_size_;
}
}
}
ScatterGatherElementwiseKernelFunctor(int N, func_t f) : N_(N), f_(f) {}
ScatterGatherElementwiseKernelFunctor(
int N,
func_t f,
int work_group_size,
int thread_work_size)
: N_(N),
f_(f),
work_group_size_(work_group_size),
thread_work_size_(thread_work_size) {}

private:
int N_;
func_t f_;
int work_group_size_;
int thread_work_size_;
};

template <int nt, int vt, typename func_t>
template <typename func_t>
static void launch_scatter_gather_kernel(int64_t N, const func_t& f) {
TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits<int32_t>::max());
if (N == 0) {
return;
}

sycl::range<1> local_range{(size_t)nt};
int num_workgroups = (N + nt * vt - 1) / (nt * vt);
sycl::range<1> global_range{(size_t)(num_workgroups * nt)};

auto caller =
ScatterGatherElementwiseKernelFunctor<nt, vt, func_t>((int)N, f);
using KernelFn = ScatterGatherElementwiseKernelFunctor<func_t>;
int64_t max_wg_size = syclMaxWorkGroupSize<KernelFn>();
int outputSize = N;
int work_group_size = outputSize > max_wg_size ? max_wg_size : outputSize;
const auto target_global_size = syclMaxWorkItemsPerTile();
// Each work group covers work_group_size items and one full device launch
// covers target_global_size items, so the maximum work group count is:
const int max_work_group_num = target_global_size / work_group_size;
int work_group_num = outputSize / work_group_size < max_work_group_num
? outputSize / work_group_size
: max_work_group_num;
int draft_work_group_num =
(outputSize + work_group_size - 1) / work_group_size;

int thread_work_size = draft_work_group_num / work_group_num + 1;

sycl::range<1> local_range(work_group_size);
sycl::range<1> global_range(work_group_num * work_group_size);

auto caller = KernelFn((int)N, f, work_group_size, thread_work_size);
sycl_kernel_submit(
global_range, local_range, at::xpu::getCurrentSYCLQueue(), caller);
}
@@ -268,11 +291,7 @@ struct ScatterGatherInternalKernel {
numel,
f);

// TODO: optimize it
constexpr int group_work_items = 256;
constexpr int work_size_per_item = 4;
launch_scatter_gather_kernel<group_work_items, work_size_per_item>(
iter.numel(), loop);
launch_scatter_gather_kernel(iter.numel(), loop);
}
};

@@ -521,11 +540,7 @@ struct ScatterFillInternalKernel {
decltype(offset_calc),
func_t>(self_ptr, index_ptr, offset_calc, index_stride, f, src_val);

// TODO: optimize it
constexpr int group_work_items = 256;
constexpr int work_size_per_item = 4;
launch_scatter_gather_kernel<group_work_items, work_size_per_item>(
iter.numel(), loop);
launch_scatter_gather_kernel(iter.numel(), loop);
}
};

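
The old launch path hard-coded 256 work items per group and 4 elements per work item; the new one derives the geometry from device limits at run time. Below is a standalone sketch of that arithmetic, with syclMaxWorkGroupSize and syclMaxWorkItemsPerTile replaced by hypothetical constants (1024 and 65536) so the numbers can be traced by hand.

#include <cstdio>

int main() {
  const int N = 1000000;                 // elements to process
  const int max_wg_size = 1024;          // stand-in for syclMaxWorkGroupSize
  const int max_items_per_tile = 65536;  // stand-in for syclMaxWorkItemsPerTile

  // Mirrors the computation in launch_scatter_gather_kernel above.
  int work_group_size = N > max_wg_size ? max_wg_size : N;             // 1024
  int max_work_group_num = max_items_per_tile / work_group_size;       // 64
  int work_group_num = N / work_group_size < max_work_group_num
      ? N / work_group_size
      : max_work_group_num;                                            // 64
  int draft_work_group_num =
      (N + work_group_size - 1) / work_group_size;                     // 977
  int thread_work_size = draft_work_group_num / work_group_num + 1;    // 16

  // 64 groups x 1024 items x 16 elements per item = 1,048,576 >= N, so the
  // idx < N_ bound in operator() covers every element.
  std::printf("%d groups x %d items, %d elements per work item\n",
              work_group_num, work_group_size, thread_work_size);
  return 0;
}

The +1 when computing thread_work_size rounds up, trading a few predicated-off tail iterations for guaranteed coverage when draft_work_group_num is not a multiple of work_group_num.
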
83 changes: 75 additions & 8 deletions src/BuildOnWindows.cmake
@@ -3,11 +3,6 @@
set(TORCH_XPU_OPS_LIBRARIES)
set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE)

# Walk around cyclic dependence
# libtorch_xpu.so links to libtorch_xpu_ops.a
# Load libtorch_xpu_ops_aten.so explicitly by torch/__init__.py:_load_dll_libraries (Break cycle)
# libtorch_xpu_ops_aten.so links to libtorch_xpu_ops_sycl_unary_binary_kernels.so and libtorch_xpu_ops_sycl_kernels.so
# libtorch_xpu_ops_sycl_unary_binary_kernels.so and libtorch_xpu_ops_sycl_kernels.so links to libtorch_xpu.so
add_library(
torch_xpu_ops
STATIC
@@ -21,7 +16,6 @@ add_library(
${ATen_XPU_NATIVE_CPP_SRCS}
${ATen_XPU_GEN_SRCS})
install(TARGETS torch_xpu_ops_aten DESTINATION "${TORCH_INSTALL_LIB_DIR}")
# target_compile_definitions(torch_xpu_ops_aten PRIVATE CAFFE2_BUILD_MAIN_LIB)
target_compile_definitions(torch_xpu_ops_aten PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
target_link_libraries(torch_xpu_ops_aten PUBLIC torch_xpu)
target_link_libraries(torch_xpu_ops_aten PUBLIC torch_cpu)
@@ -48,8 +42,11 @@ else()
set(ATen_XPU_SYCL_REDUCE_SRCS)
set(ATen_XPU_SYCL_ACTIVATION_SRCS)
set(ATen_XPU_SYCL_FOREACH_SRCS)
set(ATen_XPU_SYCL_TENSOR_SRCS)
set(ATen_XPU_SYCL_NORM_LOSS_SRCS)
set(ATen_XPU_SYCL_POLY_SRCS)
set(ATen_XPU_SYCL_DISTRIBUTION_SRCS)
set(ATen_XPU_SYCL_OTHERS_SRCS)

foreach(sycl_src ${ATen_XPU_SYCL_SRCS})
string(REGEX MATCH "Binary" IS_BINARY ${sycl_src})
string(REGEX MATCH "Unary" IS_UNARY ${sycl_src})
@@ -63,6 +60,13 @@
string(REGEX MATCH "Activation" IS_ACTIVATION ${sycl_src})
string(REGEX MATCH "Foreach" IS_FOREACH ${sycl_src})
string(REGEX MATCH "Reduce" IS_REDUCE ${sycl_src})
string(REGEX MATCH "Tensor" IS_TENSOR ${sycl_src})
string(REGEX MATCH "Norm" IS_NORM ${sycl_src})
string(REGEX MATCH "Loss" IS_LOSS ${sycl_src})
string(REGEX MATCH "Polynomial" IS_POLY ${sycl_src})
# Put the Resize kernel into the Norm and Loss lib to resolve symbol dependencies.
string(REGEX MATCH "Resize" IS_RESIZE ${sycl_src})
string(REGEX MATCH "Distribution" IS_DISTRIBUTION ${sycl_src})

if(NOT IS_FOREACH STREQUAL "")
list(APPEND ATen_XPU_SYCL_FOREACH_SRCS ${sycl_src})
@@ -74,11 +78,18 @@
list(APPEND ATen_XPU_SYCL_REDUCE_SRCS ${sycl_src})
elseif(NOT IS_ACTIVATION STREQUAL "")
list(APPEND ATen_XPU_SYCL_ACTIVATION_SRCS ${sycl_src})
elseif(NOT IS_TENSOR STREQUAL "")
list(APPEND ATen_XPU_SYCL_TENSOR_SRCS ${sycl_src})
elseif(NOT IS_DISTRIBUTION STREQUAL "")
list(APPEND ATen_XPU_SYCL_DISTRIBUTION_SRCS ${sycl_src})
elseif(NOT IS_NORM STREQUAL "" OR NOT IS_LOSS STREQUAL "" OR NOT IS_RESIZE STREQUAL "")
list(APPEND ATen_XPU_SYCL_NORM_LOSS_SRCS ${sycl_src})
elseif(NOT IS_POLY STREQUAL "")
list(APPEND ATen_XPU_SYCL_POLY_SRCS ${sycl_src})
else()
list(APPEND ATen_XPU_SYCL_OTHERS_SRCS ${sycl_src})
endif()
endforeach()

# Binary kernel lib
set(sycl_binary_lib torch_xpu_ops_sycl_binary_kernels)
sycl_add_library(
@@ -148,7 +159,63 @@ else()

# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_foreach_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")

# Tensor kernel lib
set(sycl_tensor_lib torch_xpu_ops_sycl_tensor_kernels)
sycl_add_library(
${sycl_tensor_lib}
SHARED
SYCL_SOURCES ${ATen_XPU_SYCL_TENSOR_SRCS})
target_compile_definitions(${sycl_tensor_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_tensor_lib})
target_link_libraries(${sycl_tensor_lib} PUBLIC torch_xpu)
list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_tensor_lib})

# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_tensor_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")

# Norm and Loss kernel lib
set(sycl_norm_loss_lib torch_xpu_ops_sycl_norm_loss_kernels)
sycl_add_library(
${sycl_norm_loss_lib}
SHARED
SYCL_SOURCES ${ATen_XPU_SYCL_NORM_LOSS_SRCS})
target_compile_definitions(${sycl_norm_loss_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_norm_loss_lib})
target_link_libraries(${sycl_norm_loss_lib} PUBLIC torch_xpu)
list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_norm_loss_lib})

# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_norm_loss_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")

# Polynomial kernel lib
set(sycl_poly_lib torch_xpu_ops_sycl_poly_kernels)
sycl_add_library(
${sycl_poly_lib}
SHARED
SYCL_SOURCES ${ATen_XPU_SYCL_POLY_SRCS})
target_compile_definitions(${sycl_poly_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_poly_lib})
target_link_libraries(${sycl_poly_lib} PUBLIC torch_xpu)
list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_poly_lib})

# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_poly_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")

# Distribution kernel lib
set(sycl_dist_lib torch_xpu_ops_sycl_dist_kernels)
sycl_add_library(
${sycl_dist_lib}
SHARED
SYCL_SOURCES ${ATen_XPU_SYCL_DISTRIBUTION_SRCS})
target_compile_definitions(${sycl_dist_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_dist_lib})
target_link_libraries(${sycl_dist_lib} PUBLIC torch_xpu)
list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_dist_lib})

# Decouple with PyTorch cmake definition.
install(TARGETS ${sycl_dist_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")

# Other kernel lib
set(sycl_lib torch_xpu_ops_sycl_kernels)
sycl_add_library(
2 changes: 1 addition & 1 deletion test/xpu/test_binary_ufuncs_xpu.py
@@ -65,7 +65,7 @@ def to_np(value):
else:
self.assertRaisesRegex(
RuntimeError,
"Found dtype \\w+ but expected \\w+",
r"result type \w+ can't be cast to the desired output type \w+",
lambda: actual.pow_(exponent),
)
