From d294ebd1ebf0ee9ad4d792a7d61962f2ff8a247f Mon Sep 17 00:00:00 2001 From: "Wang, Chuanqi" Date: Thu, 11 Jul 2024 09:21:13 +0800 Subject: [PATCH 1/6] [CI] enable WERROR build for ut test (#560) --- .github/ci_commit_pins/torchbench.txt | 1 + .github/workflows/_linux_ut.yml | 2 +- .github/workflows/nightly_ondemand.yml | 8 +++----- .github/workflows/pull.yml | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) create mode 100644 .github/ci_commit_pins/torchbench.txt diff --git a/.github/ci_commit_pins/torchbench.txt b/.github/ci_commit_pins/torchbench.txt new file mode 100644 index 000000000..ac2a7c011 --- /dev/null +++ b/.github/ci_commit_pins/torchbench.txt @@ -0,0 +1 @@ +bb5294090a397b15fadf10cd2172f9bd9c461f9a diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 4abc9a95b..0440d202b 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -68,7 +68,7 @@ jobs: export USE_XPU=1 source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - python setup.py bdist_wheel + WERROR=1 python setup.py bdist_wheel pip install --force-reinstall dist/*.whl git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. pip install -r .ci/docker/requirements-ci.txt diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 039407bc8..926b099f3 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -17,10 +17,10 @@ on: default: 'false' description: Keep torch-xpu-ops pin. `true` means use pined commit ut: - required: true + required: false type: string default: 'torch_xpu' - description: UT scope. `op_example,op_extended,op_ut,torch_xpu` Delimiter is comma + description: UT scope. `op_example,op_extended,op_ut,torch_xpu`. 
Delimiter is comma triton: required: false type: string @@ -141,7 +141,6 @@ jobs: echo "TORCHBENCH_COMMIT_ID=$(> "${GITHUB_ENV}" echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - # echo "TORCHTEXT_COMMIT_ID=$(<.github/ci_commit_pins/text.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" @@ -257,7 +256,6 @@ jobs: TORCHBENCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHBENCH_COMMIT_ID }}" TORCHVISION_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHVISION_COMMIT_ID }}" TORCHAUDIO_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHAUDIO_COMMIT_ID }}" - # TORCHTEXT_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHTEXT_COMMIT_ID }}" TRANSFORMERS_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TRANSFORMERS_VERSION }}" TIMM_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TIMM_COMMIT_ID }}" TRITON_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TRITON_COMMIT_ID }}" @@ -303,7 +301,7 @@ jobs: fi echo -e "Inputs | $test_scope\n--- | --- \n" >> ${{ github.workspace }}/report.txt fi - echo "$cc_comment\n" >> ${{ github.workspace }}/report.txt + echo "$cc_comment" >> ${{ github.workspace }}/report.txt # Report report_txt=$(cat ${{ github.workspace }}/report.txt) gh --repo $repo issue comment $test_issue_id --body "$report_txt" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3b103e4d9..5ee55fdcf 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -82,7 +82,7 @@ jobs: cd ../pytorch echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}" echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}" - echo "TORCHTEXT_COMMIT_ID=$(<.github/ci_commit_pins/text.txt)" >> "${GITHUB_ENV}" + echo "TORCHBENCH_COMMIT_ID=$(> "${GITHUB_ENV}" echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}" echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}" echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}" From 0253fb9021ada9c8df124b0c12d8f4aded4fb601 Mon Sep 17 00:00:00 2001 From: Feng Yuan Date: Thu, 11 Jul 2024 10:44:20 +0800 Subject: [PATCH 2/6] =?UTF-8?q?Using=20kernel=20specific=20max=20work=20gr?= =?UTF-8?q?oup=20size=20instead=20of=20device=20max=20work=20=E2=80=A6=20(?= =?UTF-8?q?#542)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …group size. Max work group size of kernel is not a static and device related only property. It now in SYCL depends on driver/compiler implementation. Device max work group means the probable max work group size allowd by the device. But actual max work group size depends on driver/compiler implementation, like compilaton optimization. Using kernel specific max work group size could get actual max work group allowed correctly. 
For example, on Xe, if compiler chooses SIMD16 and large GRF (32 HW threads per SS), the actual max work group size will be 512 (16 * 32), not 1024 queried by device::info::max_work_group_size. --------- Signed-off-by: Feng Yuan --- .../native/xpu/sycl/ActivationGluKernels.cpp | 8 +- .../sycl/AdaptiveAveragePooling2dKernels.cpp | 2 +- src/ATen/native/xpu/sycl/BatchKernel.h | 15 +- src/ATen/native/xpu/sycl/BatchNormKernels.cpp | 274 ++++++++++++------ .../native/xpu/sycl/BucketizationKernels.cpp | 36 +-- src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp | 60 ++-- .../native/xpu/sycl/EmbeddingBackwardKernel.h | 80 ++--- src/ATen/native/xpu/sycl/EmbeddingBag.cpp | 23 +- .../native/xpu/sycl/ForeachReduceKernels.cpp | 67 ++++- src/ATen/native/xpu/sycl/GridSampler.cpp | 18 +- src/ATen/native/xpu/sycl/GroupNormKernels.cpp | 36 ++- src/ATen/native/xpu/sycl/GroupReduceUtils.h | 6 +- src/ATen/native/xpu/sycl/Indexing.cpp | 53 +++- src/ATen/native/xpu/sycl/Indexing.h | 33 ++- src/ATen/native/xpu/sycl/LossNLLKernel.cpp | 19 +- src/ATen/native/xpu/sycl/MultiTensorApply.h | 37 ++- src/ATen/native/xpu/sycl/NonzeroKernel.cpp | 6 +- src/ATen/native/xpu/sycl/Norm.h | 28 +- src/ATen/native/xpu/sycl/RandpermKernel.cpp | 3 +- src/ATen/native/xpu/sycl/Reduce.h | 49 ++-- src/ATen/native/xpu/sycl/ScanUtils.h | 55 ++-- src/ATen/native/xpu/sycl/Shape.cpp | 21 +- src/ATen/native/xpu/sycl/SoftMaxKernels.cpp | 246 +++++++++++----- .../xpu/sycl/TensorTransformationsKernels.cpp | 16 +- .../native/xpu/sycl/TriangularOpsKernels.cpp | 12 +- .../xpu/sycl/UpSampleBicubic2dKernels.cpp | 9 +- .../xpu/sycl/UpSampleBilinear2dKernels.cpp | 17 +- src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h | 30 +- src/comm/DeviceProperties.h | 25 +- 29 files changed, 824 insertions(+), 460 deletions(-) diff --git a/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp index b3606ddef..056e8c332 100644 --- a/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp @@ -97,13 +97,13 @@ void launch_glu_backward_kernel( OffsetCalc offset_calculator, int64_t gI_byte_offset, int64_t I_byte_offset) { - const int64_t local_size = syclMaxWorkGroupSize(); - const int64_t num_wg = (numel + local_size - 1) / local_size; - const int64_t global_size = num_wg * local_size; - GluBackwardKernelFunctor kfn( numel, gI, I, gO, offset_calculator, gI_byte_offset, I_byte_offset); + const int64_t local_size = syclMaxWorkGroupSize(kfn); + const int64_t num_wg = (numel + local_size - 1) / local_size; + const int64_t global_size = num_wg * local_size; + sycl_kernel_submit(global_size, local_size, getCurrentSYCLQueue(), kfn); } diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp index aacc66062..4d7ef286d 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp @@ -194,7 +194,7 @@ struct AdaptiveAvgPool2dBwdSLMKernelFunctor numel_ = ib_ * ic_ * ih_ * iw_; int total_item = std::min(numel_, syclMaxWorkItemsPerTile()); - local_range_ = syclMaxWorkGroupSize(); + local_range_ = syclMaxWorkGroupSize(*this); global_range_ = total_item < local_range_ ? 
local_range_ : (total_item / local_range_) * local_range_; diff --git a/src/ATen/native/xpu/sycl/BatchKernel.h b/src/ATen/native/xpu/sycl/BatchKernel.h index fef42b938..cff967a76 100644 --- a/src/ATen/native/xpu/sycl/BatchKernel.h +++ b/src/ATen/native/xpu/sycl/BatchKernel.h @@ -39,6 +39,7 @@ class BatchKernelConfig { problem_batch_(problem_batch), problem_along_x_(problem_along_x), policy_(policy_combine(policies)), + prefer_wg_size_(prefer_wg_size), problem_wg_range_(0), problem_glb_range_(0), problem_range_(0), @@ -47,12 +48,15 @@ class BatchKernelConfig { glb_range_x_(0), glb_range_y_(0), wg_range_x_(0), - wg_range_y_(0) { - size_t wg_size = syclMaxWorkGroupSize(); + wg_range_y_(0) {} + + template + void build() { + size_t wg_size = syclMaxWorkGroupSize(); size_t sg_size = syclMaxSubGroupSize(); - if (prefer_wg_size != 0 && prefer_wg_size % sg_size == 0 && - prefer_wg_size < wg_size) { - wg_size = prefer_wg_size; + if (prefer_wg_size_ != 0 && prefer_wg_size_ % sg_size == 0 && + prefer_wg_size_ < wg_size) { + wg_size = prefer_wg_size_; } wg_range_x_ = sg_size; wg_range_y_ = wg_size / wg_range_x_; @@ -263,6 +267,7 @@ class BatchKernelConfig { /* logical active batch */ int64_t problem_batch_; bool problem_along_x_; Policy policy_; + size_t prefer_wg_size_; int64_t problem_wg_range_; int64_t problem_glb_range_; size_t problem_range_; diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index 5c64816fb..f0ab4df47 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -133,16 +133,39 @@ struct Var { } }; +template int get_max_group_size(int simd = SIMD32) { // The max work group size required by batch_norm needs to ensure that the two // subgroup reduces can obtain correct results. - int max_size = syclMaxWorkGroupSize(); + int max_size = syclMaxWorkGroupSize(); int shfl2_restricted_size = simd * simd; return max_size > shfl2_restricted_size ? shfl2_restricted_size : max_size; } +template int get_num_threads(int nelem, int restricted_simd = SIMD32) { - int max_size = get_max_group_size(restricted_simd); + int max_size = get_max_group_size(restricted_simd); + int thread_sizes[5] = {32, 64, 128, 256, max_size}; + for (int i = 0; i < 5; ++i) { + if (nelem <= thread_sizes[i]) { + return thread_sizes[i]; + } + } + return max_size; +} + +int get_dev_max_group_size(int simd = SIMD32) { + // The max work group size required by batch_norm needs to ensure that the two + // subgroup reduces can obtain correct results. + int max_size = syclDeviceMaxWorkGroupSize(); + int shfl2_restricted_size = simd * simd; + return max_size > shfl2_restricted_size ? 
shfl2_restricted_size : max_size; +} + +int get_num_threads_by_dev_max_group_size( + int nelem, + int restricted_simd = SIMD32) { + int max_size = get_dev_max_group_size(restricted_simd); int thread_sizes[5] = {32, 64, 128, 256, max_size}; for (int i = 0; i < 5; ++i) { if (nelem <= thread_sizes[i]) { @@ -565,39 +588,53 @@ void batch_norm_stats_template( auto& queue = getCurrentSYCLQueue(); int simd = get_prefer_simd(input.size(1), input.size(0) * input.size(2)); - int max_group_size = get_max_group_size(simd); - int tf = get_num_threads(input.size(2), simd); - int64_t work_group_size_x = tf; - int64_t work_group_size_y = std::max(1, max_group_size / tf); - int64_t global_size_x = input.size(1) * work_group_size_x; - int64_t global_size_y = 1 * work_group_size_y; if (simd == SIMD32) { - auto caller = BatchNormCollectStatisticsKernelFunctor< + using KernelClass = BatchNormCollectStatisticsKernelFunctor< SIMD32, VarTransform, scalar_t, scalar_t, accscalar_t, - index_t>(input, epsilon, 0.0, mean, invstd); + index_t>; + + auto kfn = KernelClass(input, epsilon, 0.0, mean, invstd); + + int max_group_size = get_max_group_size(simd); + int tf = get_num_threads(input.size(2), simd); + int64_t work_group_size_x = tf; + int64_t work_group_size_y = std::max(1, max_group_size / tf); + int64_t global_size_x = input.size(1) * work_group_size_x; + int64_t global_size_y = 1 * work_group_size_y; + sycl_kernel_submit( sycl::range<2>(global_size_y, global_size_x), sycl::range<2>(work_group_size_y, work_group_size_x), queue, - caller); + kfn); } else { - auto caller = BatchNormCollectStatisticsKernelFunctor< + using KernelClass = BatchNormCollectStatisticsKernelFunctor< SIMD16, VarTransform, scalar_t, scalar_t, accscalar_t, - index_t>(input, epsilon, 0.0, mean, invstd); + index_t>; + + auto kfn = KernelClass(input, epsilon, 0.0, mean, invstd); + + int max_group_size = get_max_group_size(simd); + int tf = get_num_threads(input.size(2), simd); + int64_t work_group_size_x = tf; + int64_t work_group_size_y = std::max(1, max_group_size / tf); + int64_t global_size_x = input.size(1) * work_group_size_x; + int64_t global_size_y = 1 * work_group_size_y; + sycl_kernel_submit( sycl::range<2>(global_size_y, global_size_x), sycl::range<2>(work_group_size_y, work_group_size_x), queue, - caller); + kfn); } } @@ -960,10 +997,11 @@ struct BatchNormCollectStatisticsChannelsLastKernelFunctor } void sycl_ker_config_convention(sycl::handler& cgh) { - size_t max_wg_sz = syclMaxWorkGroupSize(); - shmem_mean_ = sycl_local_acc_t(sycl::range<1>{max_wg_sz}, cgh); - shmem_m2n_ = sycl_local_acc_t(sycl::range<1>{max_wg_sz}, cgh); - shmem_count_ = sycl_local_acc_t(sycl::range<1>{max_wg_sz}, cgh); + shmem_mean_ = + sycl_local_acc_t(sycl::range<1>{(size_t)wg_size_}, cgh); + shmem_m2n_ = + sycl_local_acc_t(sycl::range<1>{(size_t)wg_size_}, cgh); + shmem_count_ = sycl_local_acc_t(sycl::range<1>{(size_t)wg_size_}, cgh); is_last_group_done_ = sycl_local_acc_t(sycl::range<1>{1}, cgh); } @@ -975,7 +1013,8 @@ struct BatchNormCollectStatisticsChannelsLastKernelFunctor int* semaphores, const int reduction_size, const int stride, - accscalar_t epsilon) + accscalar_t epsilon, + int wg_size) : input_(input), out_mean_(out_mean), out_invstd_(out_invstd), @@ -983,7 +1022,8 @@ struct BatchNormCollectStatisticsChannelsLastKernelFunctor semaphores_(semaphores), reduction_size_(reduction_size), stride_(stride), - epsilon_(epsilon) {} + epsilon_(epsilon), + wg_size_(wg_size) {} private: const scalar_t* __restrict__ input_; @@ -994,6 +1034,7 @@ struct 
BatchNormCollectStatisticsChannelsLastKernelFunctor const int reduction_size_; const int stride_; accscalar_t epsilon_; + int wg_size_; sycl_local_acc_t shmem_mean_; sycl_local_acc_t shmem_m2n_; sycl_local_acc_t shmem_count_; @@ -1039,7 +1080,7 @@ void batch_norm_stats_channels_last_template( int* semaphores_ptr = nwg_y > 1 ? semaphores.mutable_data_ptr() : nullptr; - auto caller = BatchNormCollectStatisticsChannelsLastKernelFunctor< + auto kfn = BatchNormCollectStatisticsChannelsLastKernelFunctor< VarTransform, scalar_t, accscalar_t, @@ -1051,8 +1092,10 @@ void batch_norm_stats_channels_last_template( semaphores_ptr, reduction_size, stride, - epsilon); - sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), caller); + epsilon, + wg_size_y * wg_size_x); + + sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), kfn); } std::tuple batch_norm_stats_kernel( @@ -1254,8 +1297,8 @@ void batch_norm_elemt_template( const double dummy_epsilon = 1e-5; int tf = std::max( - get_num_threads(input.size(2) / 4), - std::min(get_num_threads(input.size(2)), 64)); + get_num_threads_by_dev_max_group_size(input.size(2) / 4), + std::min(get_num_threads_by_dev_max_group_size(input.size(2)), 64)); int tb = std::max(64 / tf, 1); sycl::range<2> local_range(tb, tf); int nwg_x = input.size(1); @@ -1266,14 +1309,14 @@ void batch_norm_elemt_template( nwg_y = std::min(nwg_y, syclMaxWorkItemsPerTile() / (tf * tb)); sycl::range<2> global_range(nwg_y * tb, nwg_x * tf); - auto caller = BatchNormTransformInputKernelFunctor< + auto kfn = BatchNormTransformInputKernelFunctor< input_scalar_t, stat_scalar_t, stat_accscalar_t, true, index_t>(input, output, mean, invstd, weight, bias, dummy_epsilon); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); } template < @@ -1388,7 +1431,7 @@ void batch_norm_elemt_channels_last_template( AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, kBFloat16, input.scalar_type(), "batchnorm_forward_xpu", [&] { using accscalar_t = at::acc_type; - auto caller = BatchNormTransformInputChannelsLastKernelFunctor< + auto kfn = BatchNormTransformInputChannelsLastKernelFunctor< scalar_t, accscalar_t, accscalar_t, @@ -1403,7 +1446,7 @@ void batch_norm_elemt_channels_last_template( reduction_size, stride, fuse_relu); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); }); } else { if (weight.defined()) { @@ -1417,7 +1460,7 @@ void batch_norm_elemt_channels_last_template( AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, kBFloat16, input.scalar_type(), "batchnorm_forward_xpu", [&] { using accscalar_t = at::acc_type; - auto caller = BatchNormTransformInputChannelsLastKernelFunctor< + auto kfn = BatchNormTransformInputChannelsLastKernelFunctor< scalar_t, accscalar_t, scalar_t, @@ -1432,7 +1475,7 @@ void batch_norm_elemt_channels_last_template( reduction_size, stride, fuse_relu); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); }); } } @@ -1599,7 +1642,7 @@ struct BatchNormBackwardReduceKernelFunctor void sycl_ker_config_convention(sycl::handler& cgh) { local_sum_ = sycl_local_acc_t>( - sycl::range<1>{(size_t)get_max_group_size(SIMD)}, cgh); + sycl::range<1>{(size_t)wg_size_}, cgh); } BatchNormBackwardReduceKernelFunctor( @@ -1636,7 +1679,8 @@ struct BatchNormBackwardReduceKernelFunctor GenericPackedTensorAccessor grad_weight, GenericPackedTensorAccessor - grad_bias) + grad_bias, + int 
wg_size) : input_(input), grad_output_(grad_output), mean_(mean), @@ -1644,7 +1688,8 @@ struct BatchNormBackwardReduceKernelFunctor sum_dy_(sum_dy), sum_dy_xmu_(sum_dy_xmu), grad_weight_(grad_weight), - grad_bias_(grad_bias) {} + grad_bias_(grad_bias), + wg_size_(wg_size) {} private: const GenericPackedTensorAccessor< @@ -1671,6 +1716,7 @@ struct BatchNormBackwardReduceKernelFunctor grad_weight_; GenericPackedTensorAccessor grad_bias_; + int wg_size_; sycl_local_acc_t> local_sum_; }; @@ -1739,21 +1785,24 @@ std::tuple batch_norm_backward_reduce_template( auto& queue = getCurrentSYCLQueue(); int simd = get_prefer_simd( input_reshaped.size(1), input_reshaped.size(0) * input_reshaped.size(1)); - int max_wg_size = get_max_group_size(simd); - int wg_size_y = std::min(last_pow2(batch_size), max_wg_size / simd); - int wg_size_x = std::min( - std::max(get_num_threads(feature_size, simd), simd), - max_wg_size / wg_size_y); - sycl::range<2> local_range(wg_size_y, wg_size_x); - sycl::range<2> global_range(1 * wg_size_y, n_input * wg_size_x); if (simd == SIMD32) { - auto caller = BatchNormBackwardReduceKernelFunctor< + using KernelClass = BatchNormBackwardReduceKernelFunctor< SIMD32, input_scalar_t, stat_scalar_t, stat_accscalar_t, - index_t>( + index_t>; + + int max_wg_size = get_max_group_size(simd); + int wg_size_y = std::min(last_pow2(batch_size), max_wg_size / simd); + int wg_size_x = std::min( + std::max(get_num_threads(feature_size, simd), simd), + max_wg_size / wg_size_y); + sycl::range<2> local_range(wg_size_y, wg_size_x); + sycl::range<2> global_range(1 * wg_size_y, n_input * wg_size_x); + + auto kfn = KernelClass( input, grad_output, mean, @@ -1761,15 +1810,27 @@ std::tuple batch_norm_backward_reduce_template( sum_dy, sum_dy_xmu, grad_weight, - grad_bias); - sycl_kernel_submit(global_range, local_range, queue, caller); + grad_bias, + wg_size_y * wg_size_x); + + sycl_kernel_submit(global_range, local_range, queue, kfn); } else { - auto caller = BatchNormBackwardReduceKernelFunctor< + using KernelClass = BatchNormBackwardReduceKernelFunctor< SIMD16, input_scalar_t, stat_scalar_t, stat_accscalar_t, - index_t>( + index_t>; + + int max_wg_size = get_max_group_size(simd); + int wg_size_y = std::min(last_pow2(batch_size), max_wg_size / simd); + int wg_size_x = std::min( + std::max(get_num_threads(feature_size, simd), simd), + max_wg_size / wg_size_y); + sycl::range<2> local_range(wg_size_y, wg_size_x); + sycl::range<2> global_range(1 * wg_size_y, n_input * wg_size_x); + + auto kfn = KernelClass( input, grad_output, mean, @@ -1777,8 +1838,10 @@ std::tuple batch_norm_backward_reduce_template( sum_dy, sum_dy_xmu, grad_weight, - grad_bias); - sycl_kernel_submit(global_range, local_range, queue, caller); + grad_bias, + wg_size_y * wg_size_x); + + sycl_kernel_submit(global_range, local_range, queue, kfn); } return std::make_tuple(sum_dy_, sum_dy_xmu_, grad_weight_, grad_bias_); } @@ -1961,10 +2024,10 @@ struct BatchNormBackwardReduceChannelsLastKernelFunctor } void sycl_ker_config_convention(sycl::handler& cgh) { - shmem_sum_dy_ = sycl_local_acc_t( - sycl::range<1>{(size_t)get_max_group_size()}, cgh); - shmem_sum_dy_xmu_ = sycl_local_acc_t( - sycl::range<1>{(size_t)get_max_group_size()}, cgh); + shmem_sum_dy_ = + sycl_local_acc_t(sycl::range<1>{(size_t)wg_size_}, cgh); + shmem_sum_dy_xmu_ = + sycl_local_acc_t(sycl::range<1>{(size_t)wg_size_}, cgh); is_last_group_done_ = sycl_local_acc_t(sycl::range<1>{1}, cgh); } @@ -1980,7 +2043,8 @@ struct BatchNormBackwardReduceChannelsLastKernelFunctor volatile 
accscalar_t* staging_data, int* semaphores, const int reduction_size, - const int stride) + const int stride, + const int wg_size) : input_(input), grad_output_(grad_output), mean_(mean), @@ -1992,7 +2056,8 @@ struct BatchNormBackwardReduceChannelsLastKernelFunctor staging_data_(staging_data), semaphores_(semaphores), reduction_size_(reduction_size), - stride_(stride) {} + stride_(stride), + wg_size_(wg_size) {} private: const scalar_t* __restrict__ input_; @@ -2007,6 +2072,7 @@ struct BatchNormBackwardReduceChannelsLastKernelFunctor int* semaphores_; const int reduction_size_; const int stride_; + const int wg_size_; sycl_local_acc_t shmem_sum_dy_; sycl_local_acc_t shmem_sum_dy_xmu_; sycl_local_acc_t is_last_group_done_; @@ -2069,7 +2135,7 @@ batch_norm_backward_reduce_channels_last_template( : nullptr; int* semaphores_ptr = nwg_y > 1 ? semaphores.mutable_data_ptr() : nullptr; - auto caller = BatchNormBackwardReduceChannelsLastKernelFunctor< + auto kfn = BatchNormBackwardReduceChannelsLastKernelFunctor< ELEMENTS_PER_ITER, scalar_t, accscalar_t, @@ -2085,8 +2151,9 @@ batch_norm_backward_reduce_channels_last_template( staging_data_ptr, semaphores_ptr, reduction_size, - stride); - sycl_kernel_submit(global_range, local_range, queue, caller); + stride, + wg_size_y * wg_size_x); + sycl_kernel_submit(global_range, local_range, queue, kfn); }); } else { if (weight.defined()) { @@ -2109,7 +2176,8 @@ batch_norm_backward_reduce_channels_last_template( : nullptr; int* semaphores_ptr = nwg_y > 1 ? semaphores.mutable_data_ptr() : nullptr; - auto caller = BatchNormBackwardReduceChannelsLastKernelFunctor< + + auto kfn = BatchNormBackwardReduceChannelsLastKernelFunctor< ELEMENTS_PER_ITER, scalar_t, accscalar_t, @@ -2127,8 +2195,10 @@ batch_norm_backward_reduce_channels_last_template( staging_data_ptr, semaphores_ptr, reduction_size, - stride); - sycl_kernel_submit(global_range, local_range, queue, caller); + stride, + wg_size_y * wg_size_x); + + sycl_kernel_submit(global_range, local_range, queue, kfn); }); } @@ -2429,8 +2499,8 @@ Tensor batch_norm_backward_elemt_template( auto& queue = getCurrentSYCLQueue(); int tf = std::max( - get_num_threads(input.size(2) / 4), - std::min(get_num_threads(input.size(2)), 64)); + get_num_threads_by_dev_max_group_size(input.size(2) / 4), + std::min(get_num_threads_by_dev_max_group_size(input.size(2)), 64)); int tb = std::max(64 / tf, 1); int nwg_x = input.size(1); int nwg_y = std::max( @@ -2444,7 +2514,7 @@ Tensor batch_norm_backward_elemt_template( sycl::range<2> local_range(tb, tf); sycl::range<2> global_range(nwg_y * tb, nwg_x * tf); - auto caller = BatchNormBackwardElemtKernelFunctor< + auto kfn = BatchNormBackwardElemtKernelFunctor< input_scalar_t, stat_scalar_t, stat_accscalar_t, @@ -2458,7 +2528,7 @@ Tensor batch_norm_backward_elemt_template( sum_dy_xmu, grad_input, norm_fct); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); return grad_input_reshaped.view(input_.sizes()); } @@ -2510,8 +2580,8 @@ Tensor batch_norm_backward_elemt_template( auto& queue = getCurrentSYCLQueue(); int tf = std::max( - get_num_threads(input.size(2) / 4), - std::min(get_num_threads(input.size(2)), 64)); + get_num_threads_by_dev_max_group_size(input.size(2) / 4), + std::min(get_num_threads_by_dev_max_group_size(input.size(2)), 64)); int tb = std::max(64 / tf, 1); int nwg_x = input.size(1); int nwg_y = std::max( @@ -2523,7 +2593,7 @@ Tensor batch_norm_backward_elemt_template( sycl::range<2> local_range(tb, tf); 
sycl::range<2> global_range(nwg_y * tb, nwg_x * tf); - auto caller = BatchNormBackwardElemtKernelFunctor< + auto kfn = BatchNormBackwardElemtKernelFunctor< input_scalar_t, stat_scalar_t, stat_accscalar_t, @@ -2540,7 +2610,7 @@ Tensor batch_norm_backward_elemt_template( 0, count.const_data_ptr(), count.numel()); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); return grad_input_reshaped.view(input_.sizes()); } @@ -2681,7 +2751,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_template( using accscalar_t = at::acc_type; if (weight.defined() && weight.scalar_type() != input.scalar_type()) { - auto caller = BatchNormBackwardElemtChannelsLastKernelFunctor< + auto kfn = BatchNormBackwardElemtChannelsLastKernelFunctor< ELEMENTS_PER_ITER, scalar_t, accscalar_t, @@ -2697,9 +2767,9 @@ at::Tensor batch_norm_backward_elemt_channels_last_template( static_cast(norm_fct), reduction_size, stride); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); } else { - auto caller = BatchNormBackwardElemtChannelsLastKernelFunctor< + auto kfn = BatchNormBackwardElemtChannelsLastKernelFunctor< ELEMENTS_PER_ITER, scalar_t, accscalar_t, @@ -2715,7 +2785,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_template( static_cast(norm_fct), reduction_size, stride); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); } }); @@ -2751,7 +2821,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_template( "batchnorm_backward_element_xpu", [&] { using accscalar_t = acc_type; - auto caller = BatchNormBackwardElemtChannelsLastKernelFunctor< + auto kfn = BatchNormBackwardElemtChannelsLastKernelFunctor< ELEMENTS_PER_ITER, scalar_t, accscalar_t, @@ -2770,7 +2840,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_template( stride, count.const_data_ptr(), count.numel()); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); }); } else { if (weight.defined()) { @@ -2788,7 +2858,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_template( "batchnorm_backward_element_xpu", [&] { using accscalar_t = acc_type; - auto caller = BatchNormBackwardElemtChannelsLastKernelFunctor< + auto kfn = BatchNormBackwardElemtChannelsLastKernelFunctor< ELEMENTS_PER_ITER, scalar_t, accscalar_t, @@ -2807,7 +2877,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_template( stride, count.const_data_ptr(), count.numel()); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); }); } @@ -3373,7 +3443,7 @@ struct BatchNormBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { void sycl_ker_config_convention(sycl::handler& cgh) { local_sum_ = sycl_local_acc_t>( - sycl::range<1>{(size_t)get_max_group_size(SIMD)}, cgh); + sycl::range<1>{(size_t)wg_size_}, cgh); } BatchNormBackwardKernelFunctor( @@ -3419,7 +3489,8 @@ struct BatchNormBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { DefaultPtrTraits, index_t> save_invstd, bool train, - stat_accscalar_t epsilon) + stat_accscalar_t epsilon, + int wg_size) : input_(input), grad_output_(grad_output), grad_input_(grad_input), @@ -3431,7 +3502,8 @@ struct BatchNormBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { save_mean_(save_mean), save_invstd_(save_invstd), train_(train), - epsilon_(epsilon) {} + 
epsilon_(epsilon), + wg_size_(wg_size) {} private: const GenericPackedTensorAccessor< @@ -3484,6 +3556,7 @@ struct BatchNormBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { save_invstd_; bool train_; stat_accscalar_t epsilon_; + int wg_size_; sycl_local_acc_t> local_sum_; }; @@ -3559,19 +3632,22 @@ std::tuple batch_norm_backward_template( input_reshaped.size(1), input_reshaped.size(0) * input_reshaped.size(1)); auto& queue = getCurrentSYCLQueue(); - int max_group_size = get_max_group_size(simd); - int tf = get_num_threads(input.size(2), simd); - int wg_sz_y = std::max(1, max_group_size / tf); - sycl::range<2> local_range(wg_sz_y, tf); - sycl::range<2> global_range(1 * wg_sz_y, input.size(1) * tf); if (simd == SIMD32) { - auto caller = BatchNormBackwardKernelFunctor< + using KernelClass = BatchNormBackwardKernelFunctor< SIMD32, input_scalar_t, stat_scalar_t, accscalar_t, - index_t>( + index_t>; + + int max_group_size = get_max_group_size(simd); + int tf = get_num_threads(input.size(2), simd); + int wg_sz_y = std::max(1, max_group_size / tf); + sycl::range<2> local_range(wg_sz_y, tf); + sycl::range<2> global_range(1 * wg_sz_y, input.size(1) * tf); + + auto kfn = KernelClass( input, grad_output, grad_input, @@ -3583,15 +3659,25 @@ std::tuple batch_norm_backward_template( save_mean, save_invstd, train, - epsilon); - sycl_kernel_submit(global_range, local_range, queue, caller); + epsilon, + wg_sz_y * tf); + + sycl_kernel_submit(global_range, local_range, queue, kfn); } else { - auto caller = BatchNormBackwardKernelFunctor< + using KernelClass = BatchNormBackwardKernelFunctor< SIMD16, input_scalar_t, stat_scalar_t, accscalar_t, - index_t>( + index_t>; + + int max_group_size = get_max_group_size(simd); + int tf = get_num_threads(input.size(2), simd); + int wg_sz_y = std::max(1, max_group_size / tf); + sycl::range<2> local_range(wg_sz_y, tf); + sycl::range<2> global_range(1 * wg_sz_y, input.size(1) * tf); + + auto kfn = KernelClass( input, grad_output, grad_input, @@ -3603,8 +3689,10 @@ std::tuple batch_norm_backward_template( save_mean, save_invstd, train, - epsilon); - sycl_kernel_submit(global_range, local_range, queue, caller); + epsilon, + wg_sz_y * tf); + + sycl_kernel_submit(global_range, local_range, queue, kfn); } return std::make_tuple(grad_input_, grad_weight_, grad_bias_); } diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp index 213283de0..d56eff222 100644 --- a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp @@ -125,23 +125,6 @@ void searchsorted_template( const bool& right, const Tensor& sorter) { int64_t numel_in = input.numel(); - int64_t rng, grng, tile_size; - tile_size = syclMaxWorkGroupSize(); - rng = numel_in; - if (rng == 0) { - rng = static_cast(1); - } - - grng = rng; - if (tile_size > grng) { - tile_size = grng; - } else if (grng > tile_size) { - int64_t xMode = static_cast(grng % tile_size); - if (xMode != 0) { - grng += static_cast(tile_size - xMode); - } - } - bool is_scalar_input = input.dim() == 0 && numel_in == 1; // inner most dim size of input and boundaries int64_t idim_in = is_scalar_input ? 
1 : input.sizes().back(); @@ -167,6 +150,23 @@ void searchsorted_template( data_bd_data, data_out_data); + int64_t rng, grng, tile_size; + tile_size = syclMaxWorkGroupSize(kfn); + rng = numel_in; + if (rng == 0) { + rng = static_cast(1); + } + + grng = rng; + if (tile_size > grng) { + tile_size = grng; + } else if (grng > tile_size) { + int64_t xMode = static_cast(grng % tile_size); + if (xMode != 0) { + grng += static_cast(tile_size - xMode); + } + } + sycl_kernel_submit(grng, tile_size, getCurrentSYCLQueue(), kfn); } @@ -243,4 +243,4 @@ void searchsorted_kernel( result.copy_(out); } } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index 009d86859..358b3ed84 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -366,12 +366,15 @@ void launch_max_pool2d_kernel( int padW, int dilationH, int dilationW) { + using KernelClass = MaxPool2dKernelFunctor; + auto& queue = at::xpu::getCurrentSYCLQueue(); int outputSize = numBatch * numPlane * outputSizeH * outputSizeW; int stride = numPlane * outputSizeH * outputSizeW; BatchKernelConfig cfg = { 1, outputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive}; - auto kfn = MaxPool2dKernelFunctor( + cfg.template build(); + auto kfn = KernelClass( output, indices, input, @@ -423,37 +426,42 @@ void launch_max_pool2d_backward_kernel( if (globalContext().deterministicAlgorithms() || std::is_same_v || std::is_same_v) { + using KernelClass = + MaxPool2dBackwardDeterministicKernelFunctor; BatchKernelConfig cfg = { 1, gradInputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive}; - auto kfn = - MaxPool2dBackwardDeterministicKernelFunctor( - gradInput, - gradOutput, - indices, - numPlane, - gradInputSizeH, - gradInputSizeW, - gradOutputSizeH, - gradOutputSizeW, - gradInputSize, - out_cf_c_stride, - in_cf_c_stride, - out_n_stride, - in_n_stride, - kernel_h, - kernel_w, - stride_h, - stride_w, - pad_h, - pad_w, - dilation_h, - dilation_w, - cfg); + cfg.template build(); + auto kfn = KernelClass( + gradInput, + gradOutput, + indices, + numPlane, + gradInputSizeH, + gradInputSizeW, + gradOutputSizeH, + gradOutputSizeW, + gradInputSize, + out_cf_c_stride, + in_cf_c_stride, + out_n_stride, + in_n_stride, + kernel_h, + kernel_w, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + cfg); sycl_kernel_submit(cfg.global_size(), cfg.group_size(), queue, kfn); } else { + using KernelClass = + MaxPool2dBackwardKernelFunctor; BatchKernelConfig cfg = { 1, gradOutputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive}; - auto kfn = MaxPool2dBackwardKernelFunctor( + cfg.template build(); + auto kfn = KernelClass( gradInput, gradOutput, indices, diff --git a/src/ATen/native/xpu/sycl/EmbeddingBackwardKernel.h b/src/ATen/native/xpu/sycl/EmbeddingBackwardKernel.h index a68c7a05b..b7d65796f 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBackwardKernel.h +++ b/src/ATen/native/xpu/sycl/EmbeddingBackwardKernel.h @@ -228,15 +228,6 @@ void compute_grad_weight_bags( const Tensor& segment_offsets, int64_t num_of_segments, const Tensor& grad_weight_per_segment) { - constexpr int SYCL_MAX_SUB_GROUP_SIZE = 32; - int64_t work_group_size = syclMaxWorkGroupSize(); - - int64_t stride_warped = - CeilDiv(stride, SYCL_MAX_SUB_GROUP_SIZE) * SYCL_MAX_SUB_GROUP_SIZE; - int64_t group_size = std::min(stride_warped, work_group_size); - auto num_groups = 
CeilDiv(num_of_segments * stride_warped, group_size); - auto total_items = num_groups * group_size; - bool per_sample_weight_defined = per_sample_weights.defined(); bool count_defined = count.defined(); int64_t per_sample_weights_stride = @@ -258,10 +249,11 @@ void compute_grad_weight_bags( // buffer. auto segment_offsets_data = segment_offsets.data_ptr(); - auto global_range = sycl::range<1>((size_t)total_items); - auto local_range = sycl::range<1>((size_t)group_size); + int64_t max_sub_group_size = syclMaxSubGroupSize(); + int64_t stride_warped = + CeilDiv(stride, max_sub_group_size) * max_sub_group_size; - auto caller = ComputeGradWeightBagsKernelFunctor( + auto kfn = ComputeGradWeightBagsKernelFunctor( numel, stride, mode_mean, @@ -278,7 +270,15 @@ void compute_grad_weight_bags( bag_size_data, per_sample_weights_data, segment_offsets_data); - sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), caller); + + int64_t work_group_size = syclMaxWorkGroupSize(kfn); + int64_t group_size = std::min(stride_warped, work_group_size); + auto num_groups = CeilDiv(num_of_segments * stride_warped, group_size); + auto total_items = num_groups * group_size; + auto global_range = sycl::range<1>((size_t)total_items); + auto local_range = sycl::range<1>((size_t)group_size); + + sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), kfn); } template @@ -358,14 +358,6 @@ void compute_grad_weight( const Tensor& segment_offsets, int64_t num_of_segments, const Tensor& grad_weight_per_segment) { - constexpr int SYCL_MAX_SUB_GROUP_SIZE = 32; - int64_t work_group_size = syclMaxWorkGroupSize(); - int64_t stride_warped = - CeilDiv(stride, SYCL_MAX_SUB_GROUP_SIZE) * SYCL_MAX_SUB_GROUP_SIZE; - int64_t group_size = std::min(stride_warped, work_group_size); - auto num_groups = CeilDiv(num_of_segments * stride_warped, group_size); - auto total_items = num_groups * group_size; - bool count_defined = count.defined(); auto grad_weight_per_segment_data = @@ -377,10 +369,11 @@ void compute_grad_weight( : indices_data; // use the indices_data handler as the dummy buffer. 
auto segment_offsets_data = segment_offsets.data_ptr(); - auto global_range = sycl::range<1>((size_t)total_items); - auto local_range = sycl::range<1>((size_t)group_size); + int64_t max_sub_group_size = syclMaxSubGroupSize(); + int64_t stride_warped = + CeilDiv(stride, max_sub_group_size) * max_sub_group_size; - auto caller = ComputeGradWeightKernelFunctor( + auto kfn = ComputeGradWeightKernelFunctor( numel, stride, num_of_segments, @@ -391,7 +384,15 @@ void compute_grad_weight( grad_output_data, count_data, segment_offsets_data); - sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), caller); + + int64_t work_group_size = syclMaxWorkGroupSize(kfn); + int64_t group_size = std::min(stride_warped, work_group_size); + auto num_groups = CeilDiv(num_of_segments * stride_warped, group_size); + auto total_items = num_groups * group_size; + auto global_range = sycl::range<1>((size_t)total_items); + auto local_range = sycl::range<1>((size_t)group_size); + + sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), kfn); } template @@ -449,6 +450,10 @@ struct SumAndScatterKernelFunctor { grad_weight_per_segment_data_(grad_weight_per_segment_data), segment_sizes_offsets_data_(segment_sizes_offsets_data) {} + void set_stride_warped(int64_t stride_warped) { + stride_warped_ = stride_warped; + } + private: int64_t stride_; int64_t num_of_segments_; @@ -473,12 +478,6 @@ void sum_and_scatter( const Tensor& segment_sizes_offsets, int64_t num_of_partial_segments, const int64_t padding_idx) { - int64_t work_group_size = syclMaxWorkGroupSize(); - int64_t stride_warped = CeilDiv(stride, work_group_size) * work_group_size; - int64_t group_size = std::min(stride_warped, syclMaxWorkGroupSize()); - auto num_groups = CeilDiv(num_of_segments * stride_warped, group_size); - auto total_items = num_groups * group_size; - auto grad_weight_data = grad_weight.data_ptr(); auto input_data = input.data_ptr(); auto segment_offsets_data = segment_offsets.data_ptr(); @@ -486,20 +485,29 @@ void sum_and_scatter( grad_weight_per_segment.data_ptr>(); auto segment_sizes_offsets_data = segment_sizes_offsets.data_ptr(); - auto global_range = sycl::range<1>((size_t)total_items); - auto local_range = sycl::range<1>((size_t)group_size); - auto caller = SumAndScatterKernelFunctor( + auto kfn = SumAndScatterKernelFunctor( stride, num_of_segments, num_of_partial_segments, padding_idx, - stride_warped, + /* stride_warped */ 0, grad_weight_data, input_data, segment_offsets_data, grad_weight_per_segment_data, segment_sizes_offsets_data); - sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), caller); + + int64_t work_group_size = syclMaxWorkGroupSize(kfn); + int64_t stride_warped = CeilDiv(stride, work_group_size) * work_group_size; + kfn.set_stride_warped(stride_warped); + + int64_t group_size = std::min(stride_warped, work_group_size); + auto num_groups = CeilDiv(num_of_segments * stride_warped, group_size); + auto total_items = num_groups * group_size; + auto global_range = sycl::range<1>((size_t)total_items); + auto local_range = sycl::range<1>((size_t)group_size); + + sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), kfn); } struct EmbeddingBackwardDeterministicKernelCopyIfFunctor { diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp index 751dea41c..9b7366781 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp +++ b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp @@ -42,6 +42,15 @@ void embedding_bag( using vec_t = at::detail::Array; 
using vec_acc_t = at::detail::Array; using vec_idx_t = at::detail::Array; + using KernelClass = EmbeddingBagKernelFunctor< + scalar_t, + accscalar_t, + index_t, + mode, + vec_size, + vec_t, + vec_acc_t, + vec_idx_t>; vec_t* o_vec = reinterpret_cast(output); vec_t* w_vec = reinterpret_cast(weights); @@ -50,16 +59,10 @@ void embedding_bag( vec_len = vec_len / vec_size; BatchKernelConfig cfg = { bag_num, vec_len, 1, bag_num, true, BatchKernelConfig::Policy::pAdaptive}; + cfg.template build(); + index_t fixing_bag_size = ignore_offsets ? index_size / bag_num : 0; - auto caller = EmbeddingBagKernelFunctor< - scalar_t, - accscalar_t, - index_t, - mode, - vec_size, - vec_t, - vec_acc_t, - vec_idx_t>( + auto kfn = KernelClass( index, offset, offset2bag, @@ -77,7 +80,7 @@ void embedding_bag( cfg, fixing_bag_size); sycl_kernel_submit( - cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), caller); + cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), kfn); } #define EMBBAG_KERNEL_ACC( \ diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp index 2ad35e5d9..104147f71 100644 --- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp @@ -157,16 +157,18 @@ void launch_lpnorm_chunk_reduce_kernel( AT_PRIVATE_CASE_TYPE_USING_HINT( \ at::ScalarType::BFloat16, out_t, __VA_ARGS__)) -std::vector foreach_norm_kernel( +template +void foreach_norn_kernel_config( TensorList tensors, - const Scalar& ord, - double p, - c10::optional dtype) { + TensorOptions output_per_tensor_option, + int64_t& wg_size, + int& max_chunks_per_tensor, + Tensor& output_per_tensor) { const int ntensors = tensors.size(); - int max_chunks_per_tensor = -1; - int64_t wg_size = multi_tensor_apply_kernel_get_wg_size(); - int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(); + max_chunks_per_tensor = -1; + wg_size = multi_tensor_apply_kernel_get_wg_size(); + int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(); for (int t = 0; t < ntensors; t++) { int max_chunks_this_tensor = @@ -176,18 +178,29 @@ std::vector foreach_norm_kernel( } } - const auto options = tensors[0].options(); - const ScalarType output_dtype = // tensors[0].scalar_type(); - dtype.has_value() ? dtype.value() : tensors[0].scalar_type(); - const ScalarType output_per_tensor_dtype = toOpMathType(output_dtype); - auto output_per_tensor = at::zeros( + output_per_tensor = at::zeros( {static_cast(ntensors) * max_chunks_per_tensor}, - options.dtype(output_per_tensor_dtype)); + output_per_tensor_option); +} - const auto res_option = options.dtype(output_dtype); - auto ret_per_tensor = at::empty({ntensors}, res_option); +std::vector foreach_norm_kernel( + TensorList tensors, + const Scalar& ord, + double p, + c10::optional dtype) { + const int ntensors = tensors.size(); + const ScalarType output_dtype = // tensors[0].scalar_type(); + dtype.has_value() ? 
dtype.value() : tensors[0].scalar_type(); + const auto options = tensors[0].options(); + auto output_per_tensor_option = options.dtype(toOpMathType(output_dtype)); + auto res_option = options.dtype(output_dtype); + auto ret_per_tensor = at::empty({ntensors}, res_option); auto tensor_lists = std::vector>{tensors.vec()}; + + int64_t wg_size; + int max_chunks_per_tensor; + Tensor output_per_tensor; if (p == static_cast(1)) { AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -198,12 +211,24 @@ std::vector foreach_norm_kernel( AT_DISPATCH_OUT_DTYPES( output_dtype, "foreach_norm_out_dtype_xpu", [&]() { using out_opmath_t = typename at::opmath_type; + using KernelClass = lpnormChunkReduceKernelFunctor< + out_t, + NormType::L1, + out_opmath_t>; + foreach_norn_kernel_config( + tensors, + output_per_tensor_option, + wg_size, + max_chunks_per_tensor, + output_per_tensor); + // sum temp val for each chunk multi_tensor_apply<1>( tensor_lists, LpNormFunctor(), output_per_tensor.mutable_data_ptr(), max_chunks_per_tensor); + // sum final val for all chunks launch_lpnorm_chunk_reduce_kernel< out_t, @@ -226,11 +251,23 @@ std::vector foreach_norm_kernel( AT_DISPATCH_OUT_DTYPES( output_dtype, "foreach_norm_out_dtype_xpu", [&]() { using out_opmath_t = typename at::opmath_type; + using KernelClass = lpnormChunkReduceKernelFunctor< + out_t, + NormType::L2, + out_opmath_t>; + foreach_norn_kernel_config( + tensors, + output_per_tensor_option, + wg_size, + max_chunks_per_tensor, + output_per_tensor); + multi_tensor_apply<1>( tensor_lists, LpNormFunctor(), output_per_tensor.mutable_data_ptr(), max_chunks_per_tensor); + launch_lpnorm_chunk_reduce_kernel< out_t, NormType::L2, diff --git a/src/ATen/native/xpu/sycl/GridSampler.cpp b/src/ATen/native/xpu/sycl/GridSampler.cpp index 746e2b035..9427fd4ce 100644 --- a/src/ATen/native/xpu/sycl/GridSampler.cpp +++ b/src/ATen/native/xpu/sycl/GridSampler.cpp @@ -252,10 +252,6 @@ void grid_sampler_2d_forward_template( const GridSamplerInterpolation interpolation_mode, const GridSamplerPadding padding_mode, bool align_corners) { - auto& queue = getCurrentSYCLQueue(); - const auto wgroup_size = syclMaxWorkGroupSize(); - const auto ngroups = (nthreads + wgroup_size - 1) / wgroup_size; - index_t C = input.sizes[1]; index_t inp_H = input.sizes[2]; index_t inp_W = input.sizes[3]; @@ -299,6 +295,11 @@ void grid_sampler_2d_forward_template( out_sC, out_sH, out_sW); + + const auto wgroup_size = syclMaxWorkGroupSize(kfn); + const auto ngroups = (nthreads + wgroup_size - 1) / wgroup_size; + auto& queue = getCurrentSYCLQueue(); + sycl_kernel_submit( sycl::range<1>(ngroups * wgroup_size), sycl::range<1>(wgroup_size), @@ -700,10 +701,6 @@ void grid_sampler_2d_backward_template( const GridSamplerPadding padding_mode, bool align_corners, const bool input_requires_grad) { - auto& queue = getCurrentSYCLQueue(); - const auto wgroup_size = syclMaxWorkGroupSize(); - const auto ngroups = (nthreads + wgroup_size - 1) / wgroup_size; - index_t C = input.sizes[1]; index_t inp_H = input.sizes[2]; index_t inp_W = input.sizes[3]; @@ -768,6 +765,11 @@ void grid_sampler_2d_backward_template( gInp_sH, gInp_sW, gGrid_sW); + + const auto wgroup_size = syclMaxWorkGroupSize(kfn); + const auto ngroups = (nthreads + wgroup_size - 1) / wgroup_size; + auto& queue = getCurrentSYCLQueue(); + sycl_kernel_submit( sycl::range<1>(ngroups * wgroup_size), sycl::range<1>(wgroup_size), diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index 08adcd608..ae9bf0005 100644 --- 
a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -257,9 +257,9 @@ void group_norm_kernel_impl( auto& queue = getCurrentSYCLQueue(); int64_t simd = syclMaxSubGroupSize(); - const int64_t wg_size = D * HxW < get_group_reduce_group_size() + const int64_t wg_size = D * HxW < get_group_reduce_group_size(simd) ? simd - : get_group_reduce_group_size(); + : get_group_reduce_group_size(simd); int64_t nwg = N * G; auto global_range = sycl::range<1>(nwg * wg_size); auto local_range = sycl::range<1>(wg_size); @@ -384,8 +384,10 @@ struct Compute1dBackwardFusedParamsFunctor } void sycl_ker_config_convention(sycl::handler& cgh) { - ds_shared_ = sycl_local_acc_t(get_group_reduce_group_size(), cgh); - db_shared_ = sycl_local_acc_t(get_group_reduce_group_size(), cgh); + ds_shared_ = + sycl_local_acc_t(get_group_reduce_group_size(SIMD), cgh); + db_shared_ = + sycl_local_acc_t(get_group_reduce_group_size(SIMD), cgh); } Compute1dBackwardFusedParamsFunctor( @@ -664,9 +666,9 @@ void group_norm_1d_backward( T_ACC* c2_data = c2.mutable_data_ptr(); T_ACC* c3_data = c3.mutable_data_ptr(); - const int64_t wg_size = (C / G) < get_group_reduce_group_size() + const int64_t wg_size = (C / G) < get_group_reduce_group_size(simd) ? simd - : get_group_reduce_group_size(); + : get_group_reduce_group_size(simd); auto global_range = sycl::range<2>(G, N * wg_size); auto local_range = sycl::range<2>(1, wg_size); group_norm_kernel_simd_choice_and_launch< @@ -717,7 +719,7 @@ void group_norm_1d_backward( T* dgamma_data = dgamma.defined() ? dgamma.mutable_data_ptr() : nullptr; T* dbeta_data = dbeta.defined() ? dbeta.mutable_data_ptr() : nullptr; if (N <= 128) { - const int64_t wg_size = get_group_reduce_group_size(); + const int64_t wg_size = get_group_reduce_group_size(simd); const int64_t B = (C + wg_size - 1) / wg_size; auto caller = GammaBeta1dBackwardSmallKernel( N, @@ -784,8 +786,10 @@ struct ComputeInternalGradientsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { } void sycl_ker_config_convention(sycl::handler& cgh) { - ds_shared_ = sycl_local_acc_t(get_group_reduce_group_size(), cgh); - db_shared_ = sycl_local_acc_t(get_group_reduce_group_size(), cgh); + ds_shared_ = + sycl_local_acc_t(get_group_reduce_group_size(SIMD), cgh); + db_shared_ = + sycl_local_acc_t(get_group_reduce_group_size(SIMD), cgh); } ComputeInternalGradientsFunctor( @@ -857,8 +861,10 @@ struct ComputeBackwardFusedParamsFunctor } void sycl_ker_config_convention(sycl::handler& cgh) { - ds_shared_ = sycl_local_acc_t(get_group_reduce_group_size(), cgh); - db_shared_ = sycl_local_acc_t(get_group_reduce_group_size(), cgh); + ds_shared_ = + sycl_local_acc_t(get_group_reduce_group_size(SIMD), cgh); + db_shared_ = + sycl_local_acc_t(get_group_reduce_group_size(SIMD), cgh); } ComputeBackwardFusedParamsFunctor( @@ -1144,9 +1150,9 @@ void group_norm_backward_kernel_impl( auto& queue = getCurrentSYCLQueue(); int64_t simd = syclMaxSubGroupSize(); - int64_t wg_size = HxW < get_group_reduce_group_size() + int64_t wg_size = HxW < get_group_reduce_group_size(simd) ? simd - : get_group_reduce_group_size(); + : get_group_reduce_group_size(simd); group_norm_kernel_simd_choice_and_launch< ComputeInternalGradientsFunctor, ComputeInternalGradientsFunctor>( @@ -1177,9 +1183,9 @@ void group_norm_backward_kernel_impl( gpu_kernel(iter, GroupNormBackwardC1Functor()); } - wg_size = (C / G) < get_group_reduce_group_size() + wg_size = (C / G) < get_group_reduce_group_size(simd) ? 
simd - : get_group_reduce_group_size(); + : get_group_reduce_group_size(simd); group_norm_kernel_simd_choice_and_launch< ComputeBackwardFusedParamsFunctor, ComputeBackwardFusedParamsFunctor>( diff --git a/src/ATen/native/xpu/sycl/GroupReduceUtils.h b/src/ATen/native/xpu/sycl/GroupReduceUtils.h index 95ef90a69..c85877da9 100644 --- a/src/ATen/native/xpu/sycl/GroupReduceUtils.h +++ b/src/ATen/native/xpu/sycl/GroupReduceUtils.h @@ -12,8 +12,10 @@ namespace at { namespace native { namespace xpu { -inline size_t get_group_reduce_group_size() { - return syclMaxWorkGroupSize() / 2; +inline int get_group_reduce_group_size(int simd) { + // Limited by group reduce implementation. We use two sub group shuffles, + // The second sub group shuffle only could handle simd size elements. + return std::min(512, simd * simd); } template diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp index 5b2981f12..1149a0cfc 100644 --- a/src/ATen/native/xpu/sycl/Indexing.cpp +++ b/src/ATen/native/xpu/sycl/Indexing.cpp @@ -82,12 +82,28 @@ static inline void _index_select_kernel( IdxInfo& index_info, int64_t dim) { using scalar_t = typename SrcInfo::scalar_t; - auto cfg = IndexKernelConfig< + using IdxConfig = IndexKernelConfig< SrcInfo, DstInfo, IdxInfo, - IndexSelectScalarFunctor>:: - make_config( + IndexSelectScalarFunctor>; + + using IndexKnownProblemInnerKernel = + IndexKernel; + auto IndexKnownProblemInnerKernel_cfg = + IdxConfig::template make_config( + src_info, + dst_info, + index_info, + static_cast(0), + dim, + false, + IndexSelectScalarFunctor()); + + using IndexUnknownProblemInnerKernel = + IndexKernel; + auto IndexUnknownProblemInnerKernel_cfg = + IdxConfig::template make_config( src_info, dst_info, index_info, @@ -95,10 +111,13 @@ static inline void _index_select_kernel( dim, false, IndexSelectScalarFunctor()); - if (cfg.problem_inner_) { - launch_index_kernel(cfg); + + if (IndexKnownProblemInnerKernel_cfg.problem_inner_) { + launch_index_kernel( + IndexKnownProblemInnerKernel_cfg); } else { - launch_index_kernel(cfg); + launch_index_kernel( + IndexUnknownProblemInnerKernel_cfg); } } @@ -390,19 +409,21 @@ void index_add_kernel( getTensorInfo(self_); int new_indexing_dim = dst_info.collapseDims(dim); - auto cfg = IndexKernelConfig< + using IdxConfig = IndexKernelConfig< decltype(src_info), decltype(dst_info), decltype(index_info), - IndexAddScalarFunctor>:: - make_config( - src_info, - dst_info, - index_info, - alpha.to(), - new_indexing_dim, - true, - IndexAddScalarFunctor()); + IndexAddScalarFunctor>; + using KernelClass = IndexKernel; + + auto cfg = IdxConfig::template make_config( + src_info, + dst_info, + index_info, + alpha.to(), + new_indexing_dim, + true, + IndexAddScalarFunctor()); launch_index_kernel(cfg); }); }); diff --git a/src/ATen/native/xpu/sycl/Indexing.h b/src/ATen/native/xpu/sycl/Indexing.h index 5ed09de0e..7ce476df5 100644 --- a/src/ATen/native/xpu/sycl/Indexing.h +++ b/src/ATen/native/xpu/sycl/Indexing.h @@ -106,6 +106,7 @@ class IndexKernelConfig : public BatchKernelConfig { return; } + template static IndexKernelConfig make_config( SrcInfo& src_info, DstInfo& dst_info, @@ -154,7 +155,7 @@ class IndexKernelConfig : public BatchKernelConfig { problem_inner); } - return { + IndexKernelConfig cfg = { src_info, dst_info, index_info, @@ -169,6 +170,9 @@ class IndexKernelConfig : public BatchKernelConfig { stride, problem_batch, problem_along_x}; + + cfg.template build(); + return cfg; } public: @@ -512,11 +516,13 @@ void small_index_kernel( IntArrayRef 
non_index_size, IntArrayRef non_index_stride, const func_t f) { + using index_buf_type = char*; + using KernelClass = SmallIndexKernelFunctor; + auto numel = iter.numel(); auto indices_size = iter.tensor(2).size(-1); auto& queue = getCurrentSYCLQueue(); - auto dev_id = getDeviceIndexOfCurrentQueue(); - int64_t max_group_num = syclMaxDSSNum(dev_id) * OVER_SUBSCRIBE_DSS_FACTOR; + int64_t max_group_num = syclMaxDSSNum() * OVER_SUBSCRIBE_DSS_FACTOR; auto total_index_iter = numel / indices_size; max_group_num = std::min(int64_t(total_index_iter / 2), max_group_num); @@ -529,7 +535,7 @@ void small_index_kernel( auto group_numel = group_index_iter * indices_size; auto group_numel_tail = (group_index_iter - 1) * indices_size; - auto wgroup_size = syclMaxWorkGroupSize(dev_id); + auto wgroup_size = syclMaxWorkGroupSize(); wgroup_size = std::min(decltype(wgroup_size)(group_numel), wgroup_size); auto global_size = max_group_num * wgroup_size; @@ -555,13 +561,12 @@ void small_index_kernel( auto out_data = (char*)iter.data_ptr(0); auto in_data = (char*)iter.data_ptr(1); - using index_buf_type = decltype((char*)iter.data_ptr(0)); at::detail::Array index_ptrs; for (size_t i = 0; i < num_indices; i++) { index_ptrs[i] = (char*)iter.data_ptr(i + 2); } - SmallIndexKernelFunctor kfn( + KernelClass kfn( f, indices_size, group_num_tail, @@ -741,16 +746,18 @@ void _index_kernel( } } if (small_index) { - auto dev_id = getDeviceIndexOfCurrentQueue(); - int64_t max_group_num = syclMaxDSSNum(dev_id); - auto wgroup_size = syclMaxWorkGroupSize(dev_id); + using index_buf_type = char*; + using KernelClass = SmallIndexKernelFunctor; + + int64_t max_group_num = syclMaxDSSNum(); + auto wgroup_size = syclMaxWorkGroupSize(); auto indices_size = iter.tensor(2).size(-1); auto total_index_iter = numel / indices_size; auto local_index = numel / max_group_num; // the max_local_mem_size = 65536B (64KB) // TODO: Is this right? - auto max_local_mem_size = syclLocalMemSize(dev_id); + auto max_local_mem_size = syclLocalMemSize(); auto indice_table_size = indices_size * sizeof(int64_t); // check whether the current case satisfying conditions 2,3,4 @@ -870,7 +877,11 @@ void launch_index_put_deterministic_kernel( // align with precision of CPU backend. using accscalar_t = scalar_t; /* acc_type; */ - IndexPutDeterministicKernelFunctor kfn( + using KernelClass = IndexPutDeterministicKernelFunctor; + + cfg.template build(); + + KernelClass kfn( sorted_indices, indices, value, diff --git a/src/ATen/native/xpu/sycl/LossNLLKernel.cpp b/src/ATen/native/xpu/sycl/LossNLLKernel.cpp index e487a426f..f3d6c1f94 100644 --- a/src/ATen/native/xpu/sycl/LossNLLKernel.cpp +++ b/src/ATen/native/xpu/sycl/LossNLLKernel.cpp @@ -226,6 +226,9 @@ void nll_loss_forward_template( int64_t batch_size = input.size(0); if (reduction == at::Reduction::None && n_dims == 2) { + using NllLossForwardNoReduceKernel = + NllLossForwardNoReduceKernelFunctor; + output.resize_({batch_size}); total_weight.zero_(); int64_t target_stride = target.stride(0); @@ -233,7 +236,7 @@ void nll_loss_forward_template( auto weight_cont = weight.defined() ? weight.contiguous() : weight; auto& queue = getCurrentSYCLQueue(); - int64_t local_size = syclMaxWorkGroupSize(); + int64_t local_size = syclMaxWorkGroupSize(); bool has_weight = weight.defined() ? true : false; // sycl kernel can not accept host pointer @@ -248,7 +251,7 @@ void nll_loss_forward_template( ? weight_cont.data_ptr() : input_data; // use the input as the dummy data. 
auto output_data = output.data_ptr(); - NllLossForwardNoReduceKernelFunctor kfn( + NllLossForwardNoReduceKernel kfn( input_data, target_data, weight_data, @@ -304,9 +307,12 @@ void nll_loss_forward_template( sycl_kernel_submit(sycl::range<1>(local_size), queue, kfn); } else if (input_cont.dim() == 2) { + using NllLossForwardReduce2DKernel = + NllLossForwardReduce2DKernelFunctor; + int64_t batch_size = input.size(0); int n_target = input.size(1); - int64_t local_size = syclMaxWorkGroupSize(); + int64_t local_size = syclMaxWorkGroupSize(); auto input_data = _input_data; auto weight_data = has_weight ? _weight_data @@ -527,12 +533,15 @@ static inline void nll_loss_backward_template( int64_t batch_size = input.size(0); if (reduction == at::Reduction::None && n_dims == 2) { + using NllLossBackwardNoReduceKernel = + NllLossBackwardNoReduceKernelFunctor; + int64_t target_stride = target.stride(0); check_dim_size(gradOutput, 1, 0, batch_size); auto weight_cont = weight.defined() ? weight.contiguous() : weight; auto& queue = getCurrentSYCLQueue(); - int64_t local_size = syclMaxWorkGroupSize(); + int64_t local_size = syclMaxWorkGroupSize(); int64_t global_size = ((batch_size + local_size - 1) / local_size) * local_size; bool has_weight = weight.defined() ? true : false; @@ -547,7 +556,7 @@ static inline void nll_loss_backward_template( ? weight_cont.data_ptr() : gradOutput_data; // Use gradOutput handler as dummy weight auto gradInput_data = gradInput.data_ptr(); - NllLossBackwardNoReduceKernelFunctor kfn( + NllLossBackwardNoReduceKernel kfn( target_data, gradOutput_data, weight_data, diff --git a/src/ATen/native/xpu/sycl/MultiTensorApply.h b/src/ATen/native/xpu/sycl/MultiTensorApply.h index 8f9792a87..0817e40be 100644 --- a/src/ATen/native/xpu/sycl/MultiTensorApply.h +++ b/src/ATen/native/xpu/sycl/MultiTensorApply.h @@ -48,12 +48,14 @@ struct TLMetaForWG { uint32_t wg_to_chunk; }; -static inline int64_t multi_tensor_apply_kernel_get_wg_size() { - return syclMaxWorkGroupSize(); +template +static int64_t multi_tensor_apply_kernel_get_wg_size() { + return syclMaxWorkGroupSize(); } -static inline int64_t multi_tensor_apply_kernel_get_chunk_size() { - int64_t max_wg_size = multi_tensor_apply_kernel_get_wg_size(); +template +static int64_t multi_tensor_apply_kernel_get_chunk_size() { + int64_t max_wg_size = multi_tensor_apply_kernel_get_wg_size(); return max_wg_size * kElementPerThread; } @@ -116,17 +118,18 @@ void launch_multi_tensor_apply_kernel( U callable, int num_wg, ArgTypes... args) { + using KernelClass = MultiTensorApplyKernelFunctor; + auto& q = getCurrentSYCLQueue(); - int64_t max_wg_size = multi_tensor_apply_kernel_get_wg_size(); - int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(); + int64_t max_wg_size = multi_tensor_apply_kernel_get_wg_size(); + int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(); if constexpr (fused_kernel) { max_wg_size = multi_tensor_apply_fused_kernel_get_wg_size(); kChunkSize = multi_tensor_apply_fused_kernel_get_chunk_size(); } - MultiTensorApplyKernelFunctor kfn( - kChunkSize, tlAddressMeta, tlWGMeta, callable, args...); + KernelClass kfn(kChunkSize, tlAddressMeta, tlWGMeta, callable, args...); sycl_kernel_submit( sycl::range<1>(num_wg * max_wg_size), @@ -141,14 +144,20 @@ void multi_tensor_apply( at::ArrayRef scalars, T callable, ArgTypes... 
args) { + using scalar_vals_t = typename T::opmath_t; + using KernelClass = MultiTensorApplyKernelFunctor< + TLMetaForAddressScalar*, + TLMetaForWG*, + T, + ArgTypes...>; + TORCH_CHECK( tensor_lists.size() == depth, "Number of tensor lists has to match he depth"); size_t n_tensors = tensor_lists[0].size(); - using scalar_vals_t = typename T::opmath_t; auto& q = getCurrentSYCLQueue(); - int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(); + int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(); auto addressStorage = at::empty( {(int)(sizeof(TLMetaForAddressScalar) * n_tensors)}, @@ -221,13 +230,19 @@ void multi_tensor_apply( std::vector>& tensor_lists, T callable, ArgTypes... args) { + using KernelClass = MultiTensorApplyKernelFunctor< + TLMetaForAddress*, + TLMetaForWG*, + T, + ArgTypes...>; + TORCH_CHECK( tensor_lists.size() == depth, "Number of tensor lists has to match he depth"); size_t n_tensors = tensor_lists[0].size(); auto& q = getCurrentSYCLQueue(); - int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(); + int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(); auto addressStorage = at::empty( {(int)(sizeof(TLMetaForAddress) * n_tensors)}, diff --git a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp index a8dc4e97c..e1c50a263 100644 --- a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp +++ b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp @@ -112,13 +112,13 @@ void nonzero_template(const Tensor& self_, Tensor& tensor) { } const int64_t N = num_nonzeros * num_dim; - const auto wg_sz = std::min(syclMaxWorkGroupSize(), N); - const auto num_wg = (N + wg_sz - 1) / wg_sz; - // restore flatten idx to indices FlattenIdxtoRealIdxKernelFunctor kfn( N, num_dim, tensor_begin, idx_flat_begin, divisor, sizes); + const auto wg_sz = std::min(syclMaxWorkGroupSize(kfn), N); + const auto num_wg = (N + wg_sz - 1) / wg_sz; + sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), kfn); // Support non-contiguous/outplace cases diff --git a/src/ATen/native/xpu/sycl/Norm.h b/src/ATen/native/xpu/sycl/Norm.h index d607667fe..da9bdf12b 100644 --- a/src/ATen/native/xpu/sycl/Norm.h +++ b/src/ATen/native/xpu/sycl/Norm.h @@ -15,6 +15,9 @@ namespace xpu { using namespace at::native::memory; using namespace at::xpu; +// syclDeviceMaxWorkGroup is allowed for launching Norm kernels, only if SIMD +// is 32. Related kernels include FusedNormKernelFunctor and +// RowwiseMomentsKernelFunctor. Don't change SIMD, unless refactor the kernels. 
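+//
+// A minimal usage sketch of the two queries (KernelFunctor is a placeholder
+// name, not a kernel defined in this file):
+//   int64_t kernel_wg = syclMaxWorkGroupSize<KernelFunctor>(); // kernel cap
+//   int64_t device_wg = syclDeviceMaxWorkGroupSize();          // device cap
+// kernel_wg can never exceed device_wg; with SIMD fixed to 32 here, the
+// device value is assumed to remain a legal work group size for these Norm
+// kernels.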
constexpr int SIMD = 32; template < @@ -277,9 +280,8 @@ class NormConfig { // get resource size for Reduce problem [batch_size, problem_size] // the reduce is performed on problem_size dimension void get_workgroup_size() { - auto dev_id = getDeviceIndexOfCurrentQueue(); - int max_workgroup_size = syclMaxWorkGroupSize(dev_id); - int total_resource = syclMaxWorkItemsPerTile(dev_id); + int max_workgroup_size = syclDeviceMaxWorkGroupSize(); + int total_resource = syclMaxWorkItemsPerTile(); workgroup_num = total_resource / max_workgroup_size; int max_workgroup_num_foreach = 1; workgroup_size = max_workgroup_size; @@ -311,9 +313,8 @@ class NormConfig { void get_workgroup_size_row() { // enlarge the occupancy, compute the least workgroup_num - auto dev_id = getDeviceIndexOfCurrentQueue(); - int max_workgroup_size = syclMaxWorkGroupSize(dev_id); - int total_resource = syclMaxWorkItemsPerTile(dev_id); + int max_workgroup_size = syclDeviceMaxWorkGroupSize(); + int total_resource = syclMaxWorkItemsPerTile(); workgroup_num = total_resource / max_workgroup_size; int max_block_row = max_workgroup_size / SIMD; @@ -1048,18 +1049,21 @@ template < bool one_moment = false> void launch_norm_eltwise_update_kernel(Norm& norm) { using vec_t = aligned_vector; - int total_threads = syclMaxWorkItemsPerTile(); - auto workgroup_size = syclMaxWorkGroupSize(); - index_t loops_end = (norm.numel() + vec_size - 1) / vec_size; - - auto kfn = NormEltwiseUpdateKernelFunctor< + using KernelClass = NormEltwiseUpdateKernelFunctor< scalar_t, mean_t, weight_t, index_t, vec_size, Norm, - vec_t>(norm, loops_end, total_threads); + vec_t>; + + int total_threads = syclMaxWorkItemsPerTile(); + auto workgroup_size = syclMaxWorkGroupSize(); + index_t loops_end = (norm.numel() + vec_size - 1) / vec_size; + + auto kfn = KernelClass(norm, loops_end, total_threads); + sycl_kernel_submit(total_threads, workgroup_size, getCurrentSYCLQueue(), kfn); } diff --git a/src/ATen/native/xpu/sycl/RandpermKernel.cpp b/src/ATen/native/xpu/sycl/RandpermKernel.cpp index 233f4d18b..d151de28c 100644 --- a/src/ATen/native/xpu/sycl/RandpermKernel.cpp +++ b/src/ATen/native/xpu/sycl/RandpermKernel.cpp @@ -88,7 +88,8 @@ void randperm_handle_duplicate_keys( T mask = static_cast((1UL << bits) - 1); HandleDuplicateKeysKernelFunctor kfn(keys, data, mask, n, rng_engine_inputs); - auto local_range = syclMaxWorkGroupSize() / 2; + + auto local_range = syclMaxWorkGroupSize(kfn) / 2; auto num_wg = (n + local_range - 1) / local_range; auto global_range = num_wg * local_range; diff --git a/src/ATen/native/xpu/sycl/Reduce.h b/src/ATen/native/xpu/sycl/Reduce.h index ba0820e7c..1be3a5e93 100644 --- a/src/ATen/native/xpu/sycl/Reduce.h +++ b/src/ATen/native/xpu/sycl/Reduce.h @@ -237,9 +237,9 @@ struct ReduceConfig { int input_vec_size = 1; int output_vec_size = 1; - template + template void set_group_dimension(int64_t dim0, int64_t dim1) { - auto max_wg_sz = syclMaxWorkGroupSize(); + auto max_wg_sz = syclMaxWorkGroupSize(); auto max_sg_sz = syclMaxSubGroupSize(); const int max_num_items = max_wg_sz / output_vec_size; int dim0_pow2 = dim0 < max_num_items ? 
static_cast(last_pow2(dim0)) @@ -1429,7 +1429,22 @@ inline void gpu_reduce_kernel( } // Adjust group_width and group_height - config.set_group_dimension(dim0, dim1); + // Mapping to launch_reduce_kernel + using R = ReduceOp; + switch (config.output_vec_size) { + case 4: { + config.set_group_dimension>(dim0, dim1); + break; + } + case 2: { + config.set_group_dimension>(dim0, dim1); + break; + } + default: { + config.set_group_dimension>(dim0, dim1); + break; + } + } int group_width = config.group_width; int group_height = config.group_height; @@ -1511,20 +1526,20 @@ inline void gpu_reduce_kernel( AT_ASSERT(can_use_32bit_indexing); auto output_calc = make_output_calculator(iter); auto input_calc = make_input_calculator(iter); - auto reduce = ReduceOp( - ops, - config, - input_calc, - output_calc, - in_data, - out_data, - out_data_extra, - acc_data, - buffer.defined() ? (void*)buffer.data_ptr() : nullptr, - buffer.defined() ? (int*)semaphores.data_ptr() : nullptr, - ident, - noutputs, - base_idx); + auto reduce = + R(ops, + config, + input_calc, + output_calc, + in_data, + out_data, + out_data_extra, + acc_data, + buffer.defined() ? (void*)buffer.data_ptr() : nullptr, + buffer.defined() ? (int*)semaphores.data_ptr() : nullptr, + ident, + noutputs, + base_idx); reduce.accumulate = iter.should_accumulate(); reduce.final_output = iter.is_final_output(); diff --git a/src/ATen/native/xpu/sycl/ScanUtils.h b/src/ATen/native/xpu/sycl/ScanUtils.h index 46938522c..6bd6c4dc6 100644 --- a/src/ATen/native/xpu/sycl/ScanUtils.h +++ b/src/ATen/native/xpu/sycl/ScanUtils.h @@ -186,14 +186,13 @@ class LoopScanConfig { glb_range_y_(0), wg_range_x_(0), wg_range_y_(0) { - auto dev_id = getDeviceIndexOfCurrentQueue(); - size_t wg_size = syclMaxWorkItemsPerEU(dev_id); + size_t wg_size = syclMaxWorkItemsPerEU(); wg_range_x_ = 32; while (problem_ <= wg_range_x_ >> 1) { wg_range_x_ = wg_range_x_ >> 1; } wg_range_y_ = wg_size / wg_range_x_; - const auto target_global_size = syclMaxWorkItemsPerTile(dev_id); + const auto target_global_size = syclMaxWorkItemsPerTile(); ; const size_t max_work_group_num = target_global_size / wg_size; const size_t wg_number = @@ -392,18 +391,6 @@ class SegmentScanConfig : public BatchKernelConfig { using OutputInfoType = OutputInfo; using IndicesInfoType = IndicesInfo; - // // Manually enable copy constructor - // SegmentScanConfig(const SegmentScanConfig& cfg_) { - // this->iinfo_ = cfg_.iinfo_; - // this->oinfo_ = cfg_.oinfo_; - // this->idxinfo_ = cfg_.idxinfo_; - // this->init_ = cfg_.init_; - // this->type_ = cfg_.type_; - // this->func_ = cfg_.func_; - // this->carrier_ = cfg_.carrier_; - // this->carrier_idx_ = cfg_.carrier_idx_; - // } - SegmentScanConfig() {} SegmentScanConfig( @@ -432,6 +419,7 @@ class SegmentScanConfig : public BatchKernelConfig { carrier_(nullptr), carrier_idx_(nullptr) {} + template static SegmentScanConfig< InputInfo, OutputInfo, @@ -450,17 +438,22 @@ class SegmentScanConfig : public BatchKernelConfig { int64_t stride = input_info.innerSize(scan_dim); int64_t problem = input_info.sizes[scan_dim]; bool problem_along_x = input_info.strides[scan_dim] == 1 ? 
true : false; - return { - input_info, - output_info, - indices_info, - batch, - problem, - stride, - problem_along_x, - init, - type, - func}; + + SegmentScanConfig + cfg = { + input_info, + output_info, + indices_info, + batch, + problem, + stride, + problem_along_x, + init, + type, + func}; + + cfg.template build(); + return cfg; } int64_t carrier_size() { @@ -706,13 +699,21 @@ static inline void _segment_scan_kernel( int dim_after_collapse, T init, BinaryFunction func) { + using SSConfig = SegmentScanConfig< + InputInfo, + OutputInfo, + OutputInfo /*not used*/, + T, + BinaryFunction>; + using KernelClass = SegmentScanKernel; + auto cfg = SegmentScanConfig< InputInfo, OutputInfo, OutputInfo /*not used*/, T, BinaryFunction>:: - make_config( + template make_config( input_info, output_info, output_info /*not used*/, diff --git a/src/ATen/native/xpu/sycl/Shape.cpp b/src/ATen/native/xpu/sycl/Shape.cpp index d86b32a8c..345d5078a 100644 --- a/src/ATen/native/xpu/sycl/Shape.cpp +++ b/src/ATen/native/xpu/sycl/Shape.cpp @@ -141,12 +141,19 @@ void CatArrayBatchedCopy( const int concatDim, IndexType dimStride, int batchCounter) { - auto& q = getCurrentSYCLQueue(); + CatArrayBatchedCopyKernelFunctor< + Tout, + underlying_out_t, + Tin, + underlying_in_t, + IndexType, + Dims> + kfn(output, inputs, os, concatDim, dimStride); // Get grid where x dim fills half gpu and y dim is number of tensors. // This will have cating two tensors fill the entire grid, but prevent // many threads from needlessly load meta data if their sizes is small. - int64_t numWI = syclMaxWorkGroupSize(); + int64_t numWI = syclMaxWorkGroupSize(kfn); // We set limited numWG to prevent over schedule. // numWG = 512 EUs * 8 threads * SIMD lanes 32 / max_compute_units @@ -162,15 +169,7 @@ void CatArrayBatchedCopy( numWG = 128; sycl::range<2> global_range(batchCounter, numWG * numWI); sycl::range<2> local_range(1, numWI); - - CatArrayBatchedCopyKernelFunctor< - Tout, - underlying_out_t, - Tin, - underlying_in_t, - IndexType, - Dims> - kfn(output, inputs, os, concatDim, dimStride); + auto& q = getCurrentSYCLQueue(); sycl_kernel_submit(global_range, local_range, q, kfn); } diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index b8e380bc0..f0f114cde 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -118,7 +118,7 @@ static inline void softmax_group_reduce_spatial( } } -template +template static inline void get_wgroup_size( uint64_t dim_size, int outer_size, @@ -127,8 +127,7 @@ static inline void get_wgroup_size( int& global_size_row, int& local_size_row, int& local_size_col) { - auto dev_id = getDeviceIndexOfCurrentQueue(); - int maxWGSize = syclMaxWorkGroupSize(dev_id); + int maxWGSize = syclMaxWorkGroupSize(); int local_size = (dim_size + NUM * vec_size - 1) / (NUM * vec_size); local_size = std::min(local_size, maxWGSize); @@ -163,16 +162,15 @@ static inline void get_wgroup_size( } // this method help to divide the computation resource for spatial_softmax -template +template static inline void get_wgroup_size_spatial( int bs, int dim_size, int inner_size, int& GroupSize, int& GroupRow) { - auto dev_id = getDeviceIndexOfCurrentQueue(); - int maxWGSize = syclMaxWorkGroupSize(dev_id); - int total_resource = syclMaxWorkItemsPerTile(dev_id); + int maxWGSize = syclMaxWorkGroupSize(); + int total_resource = syclMaxWorkItemsPerTile(); // set the GroupSize smaller to ensure larger group number // smaller GroupSize is friendly to the 
tail case @@ -389,22 +387,11 @@ void dispatch_softmax_forward_kernel( using vec_t = at::native::memory::aligned_vector; auto& queue = getCurrentSYCLQueue(); - int sub_group_num, global_size_row, local_size_row, range, local_size; - get_wgroup_size( - dim_size, - outer_size, - sub_group_num, - range, - global_size_row, - local_size_row, - local_size); - int64_t local_range{local_size_row * local_size}; - int64_t global_range{global_size_row * local_size_row * local_size}; scalar_t neginf = -std::numeric_limits::infinity(); scalar_t nan = std::numeric_limits::quiet_NaN(); if constexpr (is_masked) { - auto caller = DispatchSoftmaxForwardKernelFunctor< + using KernelClass = DispatchSoftmaxForwardKernelFunctor< INNER_LOOP, vec_size, SIMD, @@ -415,7 +402,21 @@ void dispatch_softmax_forward_kernel( outer_loop, is_masked, calc_t, - vec_t>( + vec_t>; + + int sub_group_num, global_size_row, local_size_row, range, local_size; + get_wgroup_size( + dim_size, + outer_size, + sub_group_num, + range, + global_size_row, + local_size_row, + local_size); + int64_t local_range{local_size_row * local_size}; + int64_t global_range{global_size_row * local_size_row * local_size}; + + auto kfn = KernelClass( in_data, out_data, dim_size, @@ -429,10 +430,10 @@ void dispatch_softmax_forward_kernel( local_size, neginf, nan); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); } else { DummyFunctor dummy; - auto caller = DispatchSoftmaxForwardKernelFunctor< + using KernelClass = DispatchSoftmaxForwardKernelFunctor< INNER_LOOP, vec_size, SIMD, @@ -443,7 +444,21 @@ void dispatch_softmax_forward_kernel( outer_loop, is_masked, DummyFunctor, - vec_t>( + vec_t>; + + int sub_group_num, global_size_row, local_size_row, range, local_size; + get_wgroup_size( + dim_size, + outer_size, + sub_group_num, + range, + global_size_row, + local_size_row, + local_size); + int64_t local_range{local_size_row * local_size}; + int64_t global_range{global_size_row * local_size_row * local_size}; + + auto kfn = KernelClass( in_data, out_data, dim_size, @@ -457,7 +472,7 @@ void dispatch_softmax_forward_kernel( local_size, neginf, nan); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); } } @@ -580,22 +595,25 @@ void softmax_forward_kernel( int outer_size) { using vec_t = at::native::memory::aligned_vector; constexpr int align_bytes = alignof(vec_t); - auto& queue = getCurrentSYCLQueue(); - auto dev_id = getDeviceIndexOfCurrentQueue(); - int local_size = std::min( - (dim_size + vec_size - 1) / vec_size, int(syclMaxWorkGroupSize(dev_id))); - - int64_t local_range{local_size}; - int64_t global_range{local_size * outer_size}; - auto ker = SoftmaxForwardKernelFunctor< + using KernelClass = SoftmaxForwardKernelFunctor< vec_size, scalar_t, accscalar_t, IndexType, LogSoftMax, vec_t, - align_bytes>(in_data, out_data, dim_size, outer_size, local_size); - sycl_kernel_submit(global_range, local_range, queue, ker); + align_bytes>; + + int local_size = std::min( + (dim_size + vec_size - 1) / vec_size, + int(syclMaxWorkGroupSize())); + int64_t local_range{local_size}; + int64_t global_range{local_size * outer_size}; + + auto kfn = KernelClass(in_data, out_data, dim_size, outer_size, local_size); + + auto& queue = getCurrentSYCLQueue(); + sycl_kernel_submit(global_range, local_range, queue, kfn); } template < @@ -754,10 +772,16 @@ void spatial_softmax_forward( int inner_size, int outer_size) { using vec_t = 
at::native::memory::aligned_vector; - auto& queue = getCurrentSYCLQueue(); + using KernelClass = SpatialSoftmaxForwardKernelFunctor< + vec_size, + scalar_t, + accscalar_t, + IndexType, + LogSoftMax, + vec_t>; int local_size, block_row; - get_wgroup_size_spatial( + get_wgroup_size_spatial( outer_size, dim_size, inner_size, local_size, block_row); int group_num = (inner_size + local_size * vec_size - 1) / (local_size * vec_size); @@ -765,7 +789,7 @@ void spatial_softmax_forward( (size_t)outer_size, (size_t)block_row, (size_t)(group_num * local_size)}; sycl::range<3> local_range{(size_t)1, (size_t)block_row, (size_t)local_size}; - auto caller = SpatialSoftmaxForwardKernelFunctor< + auto kfn = SpatialSoftmaxForwardKernelFunctor< vec_size, scalar_t, accscalar_t, @@ -780,7 +804,9 @@ void spatial_softmax_forward( local_size, block_row, group_num); - sycl_kernel_submit(global_range, local_range, queue, caller); + + auto& queue = getCurrentSYCLQueue(); + sycl_kernel_submit(global_range, local_range, queue, kfn); } template < @@ -941,20 +967,10 @@ void dispatch_softmax_backward_kernel( using vec_t = at::native::memory::aligned_vector; auto& queue = getCurrentSYCLQueue(); constexpr int NUM = INNER_LOOP / vec_size * (SIMD32 / SIMD); - int sub_group_num, global_size_row, local_size_row, range, local_size; - get_wgroup_size( - dim_size, - outer_size, - sub_group_num, - range, - global_size_row, - local_size_row, - local_size); - int64_t local_range{local_size_row * local_size}; - int64_t global_range{global_size_row * local_size_row * local_size}; + int sub_group_num, global_size_row, local_size_row, range, local_size; if constexpr (is_masked) { - auto caller = DispatchSoftmaxBackwardKernelFunctor< + using KernelClass = DispatchSoftmaxBackwardKernelFunctor< INNER_LOOP, vec_size, SIMD, @@ -965,7 +981,18 @@ void dispatch_softmax_backward_kernel( is_masked, calc_t, vec_t, - NUM>( + NUM>; + + get_wgroup_size( + dim_size, + outer_size, + sub_group_num, + range, + global_size_row, + local_size_row, + local_size); + + auto kfn = KernelClass( gradInput, output, gradOutput, @@ -978,10 +1005,14 @@ void dispatch_softmax_backward_kernel( local_size_row, range, local_size); - sycl_kernel_submit(global_range, local_range, queue, caller); + + int64_t local_range{local_size_row * local_size}; + int64_t global_range{global_size_row * local_size_row * local_size}; + + sycl_kernel_submit(global_range, local_range, queue, kfn); } else { DummyFunctor dummy; - auto caller = DispatchSoftmaxBackwardKernelFunctor< + using KernelClass = DispatchSoftmaxBackwardKernelFunctor< INNER_LOOP, vec_size, SIMD, @@ -992,7 +1023,18 @@ void dispatch_softmax_backward_kernel( is_masked, DummyFunctor, vec_t, - NUM>( + NUM>; + + get_wgroup_size( + dim_size, + outer_size, + sub_group_num, + range, + global_size_row, + local_size_row, + local_size); + + auto kfn = KernelClass( gradInput, output, gradOutput, @@ -1005,7 +1047,11 @@ void dispatch_softmax_backward_kernel( local_size_row, range, local_size); - sycl_kernel_submit(global_range, local_range, queue, caller); + + int64_t local_range{local_size_row * local_size}; + int64_t global_range{global_size_row * local_size_row * local_size}; + + sycl_kernel_submit(global_range, local_range, queue, kfn); } } @@ -1127,24 +1173,25 @@ void softmax_backward_kernel( int outer_size) { using vec_t = at::native::memory::aligned_vector; constexpr int align_bytes = alignof(vec_t); - auto& queue = getCurrentSYCLQueue(); - - auto dev_id = getDeviceIndexOfCurrentQueue(); - int local_size = std::min( - (dim_size + 
vec_size - 1) / vec_size, int(syclMaxWorkGroupSize(dev_id))); - int64_t local_range{local_size}; - int64_t global_range{local_size * outer_size}; - - auto caller = SoftmaxBackwardKernelFunctor< + using KernelClass = SoftmaxBackwardKernelFunctor< vec_size, scalar_t, accscalar_t, LogSoftMax, vec_t, - align_bytes>( + align_bytes>; + + int local_size = std::min( + (dim_size + vec_size - 1) / vec_size, + int(syclMaxWorkGroupSize())); + int64_t local_range{local_size}; + int64_t global_range{local_size * outer_size}; + + auto kfn = KernelClass( gradInput, output, gradOutput, dim_size, outer_size, local_size); - sycl_kernel_submit(global_range, local_range, queue, caller); + auto& queue = getCurrentSYCLQueue(); + sycl_kernel_submit(global_range, local_range, queue, kfn); } template < @@ -1271,10 +1318,15 @@ void spatial_softmax_backward_kernel( int inner_size, int outer_size) { using vec_t = at::native::memory::aligned_vector; - auto& queue = getCurrentSYCLQueue(); + using KernelClass = SpatialSoftmaxBackwardKernelFunctor< + vec_size, + scalar_t, + accscalar_t, + LogSoftMax, + vec_t>; int local_size, block_row; - get_wgroup_size_spatial( + get_wgroup_size_spatial( outer_size, dim_size, inner_size, local_size, block_row); int group_num = (inner_size + local_size * vec_size - 1) / (local_size * vec_size); @@ -1282,7 +1334,7 @@ void spatial_softmax_backward_kernel( (size_t)outer_size, (size_t)block_row, (size_t)(group_num * local_size)}; sycl::range<3> local_range{(size_t)1, (size_t)block_row, (size_t)local_size}; - auto caller = SpatialSoftmaxBackwardKernelFunctor< + auto kfn = SpatialSoftmaxBackwardKernelFunctor< vec_size, scalar_t, accscalar_t, @@ -1296,7 +1348,9 @@ void spatial_softmax_backward_kernel( outer_size, local_size, block_row); - sycl_kernel_submit(global_range, local_range, queue, caller); + + auto& queue = getCurrentSYCLQueue(); + sycl_kernel_submit(global_range, local_range, queue, kfn); } template @@ -1322,8 +1376,8 @@ void spatial_softmax_forward(Tensor& output, Tensor& input, int dim) { canUse32BitIndexMath(input) && canUse32BitIndexMath(output); // decide SIMD: SIMD32 or SIMD16 - auto* dev_prop = - at::xpu::getDeviceProperties(at::xpu::getDeviceIndexOfCurrentQueue()); + auto dev_id = at::xpu::getDeviceIndexOfCurrentQueue(); + auto* dev_prop = at::xpu::getDeviceProperties(dev_id); auto sub_group_size = dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { @@ -1381,8 +1435,27 @@ void spatial_softmax_forward(Tensor& output, Tensor& input, int dim) { // if the element number is smaller than max_work_group_size * INNER_LOOP, // the fast path (dispatch_softmax_forward) will be selected. // otherwise, the general path (softmax_forward_kernel) will be selected. - auto dev_id = getDeviceIndexOfCurrentQueue(); - int max_group_size = syclMaxWorkGroupSize(dev_id); + + // Query the smallest max work group size of the kernel template. The kernel + // instance with the largest register pressure will have the smallest max + // work group size. Memory spill probably occurs more severely than + // any other instances, then compiler probably chooses less SIMD width to + // mitgate register pressure. Actual max work group size of these kernel + // template allowed by the compiler is less than device allowed max work + // group size. 
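+  //
+  // Sketch of the assumed contract: for any lighter instantiation K of this
+  // functor template,
+  //   syclMaxWorkGroupSize<DispatchSoftmaxForwardKernel>()
+  //       <= syclMaxWorkGroupSize<K>(),
+  // so a group size derived from the heaviest instance is assumed to be a
+  // valid launch configuration for every instance dispatched below.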
+ using DispatchSoftmaxForwardKernel = DispatchSoftmaxForwardKernelFunctor< + INNER_LOOP, + max_vec_size, + SIMD32, + scalar_t, + accscalar_t, + uint32_t, + LogSoftMax, + INNER_LOOP / max_vec_size, + false, + DummyFunctor, + vec_t>; + int max_group_size = syclMaxWorkGroupSize(); if (can_use_32bit_index && max_group_size * INNER_LOOP >= dim_size) { // it assumes vec_size * outer_loop * work_group_size >= dim_size @@ -1544,8 +1617,29 @@ void spatial_softmax_backward( outer_size); if (inner_size == 1) { - auto dev_id = getDeviceIndexOfCurrentQueue(); - int max_group_size = syclMaxWorkGroupSize(dev_id); + // Query the smallest max work group size of the kernel template. The kernel + // instance with the largest register pressure will have the smallest max + // work group size. Memory spill probably occurs more severely than + // any other instances, then compiler probably chooses less SIMD width to + // mitgate register pressure. Actual max work group size of these kernel + // template allowed by the compiler is less than device allowed max work + // group size. + constexpr int NUM = INNER_LOOP / max_vec_size /* * (SIMD32 / SIMD32) */; + using DispatchSoftmaxBackwardKernel = DispatchSoftmaxBackwardKernelFunctor< + INNER_LOOP, + max_vec_size, + SIMD32, + scalar_t, + accscalar_t, + uint32_t, + LogSoftMax, + false, /* No instance for true */ + DummyFunctor, + vec_t, + NUM>; + + int max_group_size = syclMaxWorkGroupSize(); + // if the element number is smaller than max_work_group_size * INNER_LOOP // / 2, (2 indicates reading two tensors: output and gradOutput) the fast // path (dispatch_softmax_backward) will be selected. otherwise, the diff --git a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp index 99b4beeb3..ecf832de3 100644 --- a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp +++ b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp @@ -50,10 +50,11 @@ struct ElementwiseKernelFunctor { template void elementwise_kernel(int total_n_elems, func_t f) { + using KernelClass = ElementwiseKernelFunctor; + auto& queue = getCurrentSYCLQueue(); - auto dev_id = getDeviceIndexOfCurrentQueue(); - int64_t max_wg_size = syclMaxWorkGroupSize(dev_id); - const auto target_global_size = syclMaxWorkItemsPerTile(dev_id); + int64_t max_wg_size = syclMaxWorkGroupSize(); + const auto target_global_size = syclMaxWorkItemsPerTile(); int work_group_size = total_n_elems > max_wg_size ? 
max_wg_size : total_n_elems; const int max_work_group_num = target_global_size / work_group_size; @@ -66,8 +67,7 @@ void elementwise_kernel(int total_n_elems, func_t f) { int total_work_items = work_group_size * work_group_num; - ElementwiseKernelFunctor kfn( - loops, total_n_elems, f, total_work_items); + KernelClass kfn(loops, total_n_elems, f, total_work_items); sycl_kernel_submit( sycl::range<1>(total_work_items), @@ -205,12 +205,14 @@ void roll_template( int64_t size, int64_t stride, int64_t total_dims) { + using KernelClass = RollKernelFunctor; + auto shift = size - start; auto offset = shift * stride; auto start_offset = start * stride; auto total_offset = size * stride; - auto local_range = syclMaxWorkGroupSize(); + auto local_range = syclMaxWorkGroupSize(); const auto target_global_range = syclMaxWorkItemsPerTile() / local_range * local_range; int global_range = (N + local_range - 1) / local_range * local_range; @@ -221,7 +223,7 @@ void roll_template( auto in_data = in_tensor.data_ptr(); auto out_data = out_tensor.data_ptr(); - RollKernelFunctor kfn( + KernelClass kfn( in_data, out_data, val_of_work_item, diff --git a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp index fb63f19b6..07fd0be0a 100644 --- a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp @@ -72,12 +72,7 @@ struct ApplyTriuTrilKernelFunctor { template void apply_triu_tril(Tensor& result, const Tensor& self, const int64_t k) { - auto& queue = getCurrentSYCLQueue(); - auto dev_id = getDeviceIndexOfCurrentQueue(); auto N = self.numel(); - int64_t group_size = syclMaxWorkGroupSize(dev_id); - auto num_groups = ceil_div(N, group_size); - auto total_items = num_groups * group_size; IndexType self_size_0 = (IndexType)self.size(-2); IndexType self_size_1 = (IndexType)self.size(-1); IndexType self_stride = (IndexType)(self.dim() > 2 ? 
self.stride(-3) : 1); @@ -105,6 +100,11 @@ void apply_triu_tril(Tensor& result, const Tensor& self, const int64_t k) { result_ptr, self_ptr); + int64_t group_size = syclMaxWorkGroupSize(kfn); + auto num_groups = ceil_div(N, group_size); + auto total_items = num_groups * group_size; + auto& queue = getCurrentSYCLQueue(); + sycl_kernel_submit( sycl::range<1>(total_items), sycl::range<1>(group_size), queue, kfn); } @@ -157,4 +157,4 @@ Tensor& triu_kernel(Tensor& result, const Tensor& self, int64_t k) { return result; } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp index 504f28d7b..275c4f2dc 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp @@ -125,12 +125,13 @@ static void upsample_bicubic2d_out_template( bool align_corners, const accscalar_t height_scale, const accscalar_t width_scale) { - auto queue = getCurrentSYCLQueue(); - int64_t wg_size = syclMaxWorkGroupSize(); - int64_t num_wg = at::ceil_div(onum, wg_size); - UpsampleBicubic2dKernelFunctor kfn( odata, idata, onum, align_corners, height_scale, width_scale); + + int64_t wg_size = syclMaxWorkGroupSize(kfn); + int64_t num_wg = at::ceil_div(onum, wg_size); + auto queue = getCurrentSYCLQueue(); + sycl_kernel_submit(num_wg * wg_size, wg_size, queue, kfn); } diff --git a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp index 1ab02435a..5ad95e0c2 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp @@ -107,10 +107,6 @@ void launch_upsample_bilinear2d_kernel( int64_t output_width, int64_t nbatch, int64_t channels) { - auto queue = getCurrentSYCLQueue(); - int64_t wg_size = syclMaxWorkGroupSize(); - int num_group = at::ceil_div(n, (int)wg_size); - UpsampleBilinear2dKernelFunctor kfn( n, rheight, @@ -125,6 +121,10 @@ void launch_upsample_bilinear2d_kernel( nbatch, channels); + int64_t wg_size = syclMaxWorkGroupSize(kfn); + int num_group = at::ceil_div(n, (int)wg_size); + auto queue = getCurrentSYCLQueue(); + sycl_kernel_submit( sycl::range<1>(num_group * wg_size), sycl::range<1>(wg_size), queue, kfn); } @@ -249,14 +249,10 @@ void launch_upsample_bilinear2d_backward_kernel( const bool align_corners, scalar_t* idata, const scalar_t* odata) { - auto queue = getCurrentSYCLQueue(); - int64_t wg_size = syclMaxWorkGroupSize(); - const size_t o_numel = nc * output_width * output_height; const size_t i_numel = nc * input_width * input_height; const size_t num_kernels = nc * output_width * output_height; - int num_group = at::ceil_div((int64_t)num_kernels, (int64_t)wg_size); UpsampleBilinear2dBackwardKernelFunctor kfn( nc, @@ -273,6 +269,11 @@ void launch_upsample_bilinear2d_backward_kernel( odata, o_numel, i_numel); + + int64_t wg_size = syclMaxWorkGroupSize(kfn); + int num_group = at::ceil_div((int64_t)num_kernels, (int64_t)wg_size); + auto queue = getCurrentSYCLQueue(); + sycl_kernel_submit( sycl::range<1>(num_group * wg_size), sycl::range<1>(wg_size), queue, kfn); } diff --git a/src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h b/src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h index 68f42ec88..96a2452b2 100644 --- a/src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h +++ b/src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h @@ -174,31 +174,35 @@ static inline OutputIt _scan_kernel( InputIt 
last, OutputIt d_first, T init) { + using KSScanKernel = KSScanKernelFunctor; + using KSScanWithCarrierKernel = + KSScanWithCarrierKernelFunctor; + const auto N = std::distance(first, last); auto& q = getCurrentSYCLQueue(); - const auto wgroup_size = syclMaxWorkGroupSize(); - const auto ngroups = (N + wgroup_size - 1) / wgroup_size; + const auto kss_wgroup_size = syclMaxWorkGroupSize(); auto options = map_options(); - if (N <= wgroup_size) { + if (N <= kss_wgroup_size) { // Kogge-Stone addr algorithm; - KSScanKernelFunctor kfn1( - first, init, N, d_first); + KSScanKernel kfn1(first, init, N, d_first); sycl_kernel_submit(sycl::range<1>(N), sycl::range<1>(N), q, kfn1); return d_first + N; } + const auto kssc_wgroup_size = syclMaxWorkGroupSize(); + auto ngroups = (N + kssc_wgroup_size - 1) / kssc_wgroup_size; Tensor carry = at::empty({ngroups}, options); T* carry_ptr = carry.data_ptr(); // 1. do exclusive_scan on each workgroups - KSScanWithCarrierKernelFunctor kfn2( - first, init, N, carry_ptr, wgroup_size, d_first); + KSScanWithCarrierKernel kfn2( + first, init, N, carry_ptr, kssc_wgroup_size, d_first); sycl_kernel_submit( - sycl::range<1>(ngroups * wgroup_size), - sycl::range<1>(wgroup_size), + sycl::range<1>(ngroups * kssc_wgroup_size), + sycl::range<1>(kssc_wgroup_size), q, kfn2); @@ -207,9 +211,13 @@ static inline OutputIt _scan_kernel( // 3. reduce among all work groups and flush data to dst ScanAccumulateKernelFunctor kfn3(d_first, carry_ptr, N); + + const auto sa_wgroup_size = syclMaxWorkGroupSize(kfn3); + ngroups = (N + sa_wgroup_size - 1) / sa_wgroup_size; + sycl_kernel_submit( - sycl::range<1>(ngroups * wgroup_size), - sycl::range<1>(wgroup_size), + sycl::range<1>(ngroups * sa_wgroup_size), + sycl::range<1>(sa_wgroup_size), q, kfn3); diff --git a/src/comm/DeviceProperties.h b/src/comm/DeviceProperties.h index 5597248fa..0f4c084c8 100644 --- a/src/comm/DeviceProperties.h +++ b/src/comm/DeviceProperties.h @@ -3,11 +3,34 @@ #include #include +#include namespace xpu { namespace sycl { -static inline int64_t syclMaxWorkGroupSize( +template +static int64_t syclMaxWorkGroupSize( + at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + auto q = c10::xpu::getCurrentXPUStream(dev_id).queue(); + auto ctx = q.get_context(); + auto dev = q.get_device(); + + auto kid = ::sycl::get_kernel_id(); + auto kbundle = + ::sycl::get_kernel_bundle<::sycl::bundle_state::executable>(ctx, {kid}); + + ::sycl::kernel k = kbundle.get_kernel(kid); + return k.get_info<::sycl::info::kernel_device_specific::work_group_size>(dev); +} + +template +static int64_t syclMaxWorkGroupSize( + KernelClass /*kfn*/, + at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + return syclMaxWorkGroupSize(dev_id); +} + +static inline int64_t syclDeviceMaxWorkGroupSize( at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->max_work_group_size; From 9c45ee237528ed5b632fb2a352abd948199a21f5 Mon Sep 17 00:00:00 2001 From: majing Date: Thu, 11 Jul 2024 20:09:43 +0800 Subject: [PATCH 3/6] Add reflection_pad1d/3d and backwards (#482) Signed-off-by: majing Co-authored-by: Feng Yuan --- src/ATen/native/xpu/ReflectionPad.cpp | 352 +++++++++++++ src/ATen/native/xpu/XPUFallback.template | 4 - .../native/xpu/sycl/ReflectionPadKernels.cpp | 491 +++++++++++++++++- .../native/xpu/sycl/ReflectionPadKernels.h | 22 + yaml/xpu_functions.yaml | 8 + 5 files changed, 865 insertions(+), 12 deletions(-) diff --git 
a/src/ATen/native/xpu/ReflectionPad.cpp b/src/ATen/native/xpu/ReflectionPad.cpp index 57ea9bbc6..2488ed229 100644 --- a/src/ATen/native/xpu/ReflectionPad.cpp +++ b/src/ATen/native/xpu/ReflectionPad.cpp @@ -1,10 +1,322 @@ #include #include +#include +#include #include #include +#include namespace at { +void reflection_pad1d_meta( + Tensor& output, + const Tensor& input, + IntArrayRef padding) { + int64_t dim_plane = 0; + int64_t dim_w = 1; + int64_t nbatch = 1; + + if (input.ndimension() == 3) { + nbatch = input.size(0); + dim_w++; + dim_plane++; + } + + at::native::padding::check_valid_input<1>(input, padding); + + /* sizes */ + auto pad_l = padding[0]; + auto pad_r = padding[1]; + + int64_t nplane = input.size(dim_plane); + int64_t input_w = input.size(dim_w); + int64_t output_w = input_w + pad_l + pad_r; + + TORCH_CHECK( + pad_l < input_w && pad_r < input_w, + "Argument #4: Padding size " + "should be less than the corresponding input dimension, but got: padding (", + pad_l, + ", ", + pad_r, + ") at dimension ", + dim_w, + " of input ", + input.sizes()); + + TORCH_CHECK( + output_w >= 1, + "input (W: ", + input_w, + ") is too small. Calculated output W: ", + output_w); + + if (output.defined()) { + if (input.ndimension() == 2) { + xpu::resize_out(output, {nplane, output_w}, {}, input.options()); + } else { + xpu::resize_out(output, {nbatch, nplane, output_w}, {}, input.options()); + } + } else { + if (input.ndimension() == 2) { + output = xpu::create_out({nplane, output_w}, {}, input.options()); + } else { + output = xpu::create_out({nbatch, nplane, output_w}, {}, input.options()); + } + } +} + +void reflection_pad1d_backward_meta( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding) { + int64_t dim_w = 1; + if (input.ndimension() == 3) { + dim_w++; + } + + /* sizes */ + auto pad_l = padding[0]; + auto pad_r = padding[1]; + int64_t input_w = input.size(dim_w); + int64_t output_w = input_w + pad_l + pad_r; + + TORCH_CHECK( + pad_l < input_w && pad_r < input_w, + "Argument #4: Padding size " + "should be less than the corresponding input dimension, but got: padding (", + pad_l, + ", ", + pad_r, + ") at dimension ", + dim_w, + " of input ", + input.sizes()); + + TORCH_CHECK( + output_w == grad_output.size(dim_w), + "grad_output width unexpected." 
+ " Expected: ", + output_w, + ", Got: ", + grad_output.size(dim_w)); + + if (grad_input.defined()) { + xpu::resize_out(grad_input, input.sizes(), {}, input.options()); + } else { + grad_input = xpu::create_out(input.sizes(), {}, input.options()); + } +} + +void reflection_pad3d_meta( + Tensor& output, + const Tensor& input, + IntArrayRef padding) { + int64_t pad_left = padding[0]; + int64_t pad_right = padding[1]; + int64_t pad_top = padding[2]; + int64_t pad_bottom = padding[3]; + int64_t pad_front = padding[4]; + int64_t pad_back = padding[5]; + int64_t dim_w = 3; + int64_t dim_h = 2; + int64_t dim_d = 1; + int64_t dim_plane = 0; + + at::native::padding::check_valid_input<3>(input, padding); + + bool batch_mode = (input.dim() == 5); + if (batch_mode) { + dim_w++; + dim_h++; + dim_d++; + dim_plane++; + } + + int64_t nplane = input.size(dim_plane); + int64_t input_d = input.size(dim_d); + int64_t input_h = input.size(dim_h); + int64_t input_w = input.size(dim_w); + int64_t output_d = input_d + pad_front + pad_back; + int64_t output_h = input_h + pad_top + pad_bottom; + int64_t output_w = input_w + pad_left + pad_right; + + TORCH_CHECK( + pad_left < input_w && pad_right < input_w, + "Argument #4: Padding size " + "should be less than the corresponding input dimension, but got: padding (", + pad_left, + ", ", + pad_right, + ") at dimension ", + dim_w, + " of input ", + input.sizes()); + TORCH_CHECK( + pad_top < input_h && pad_bottom < input_h, + "Argument #6: Padding size " + "should be less than the corresponding input dimension, but got: padding (", + pad_top, + ", ", + pad_bottom, + ") at dimension ", + dim_h, + " of input ", + input.sizes()); + TORCH_CHECK( + pad_front < input_d && pad_back < input_d, + "Argument #8: Padding size " + "should be less than the corresponding input dimension, but got: padding (", + pad_front, + ", ", + pad_back, + ") at dimension ", + dim_d, + " of input ", + input.sizes()); + + TORCH_CHECK( + output_w >= 1 || output_h >= 1 || output_d >= 1, + "input (D: ", + input_d, + " H: ", + input_h, + ", W: ", + input_w, + ") is too small." 
+ " Calculated output D: ", + output_d, + " H: ", + output_h, + " W: ", + output_w); + + if (output.defined()) { + if (batch_mode) { + xpu::resize_out( + output, + {input.size(0), nplane, output_d, output_h, output_w}, + {}, + input.options()); + } else { + xpu::resize_out( + output, {nplane, output_d, output_h, output_w}, {}, input.options()); + } + } else { + if (batch_mode) { + output = xpu::create_out( + {input.size(0), nplane, output_d, output_h, output_w}, + {}, + input.options()); + } else { + output = xpu::create_out( + {nplane, output_d, output_h, output_w}, {}, input.options()); + } + } +} + +void reflection_pad3d_backward_meta( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding) { + TORCH_CHECK(padding.size() == 6, "padding size is expected to be 6"); + TORCH_CHECK(input.dim() > 3); + TORCH_CHECK(grad_output.dim() == input.dim()); + + int64_t pad_left = padding[0]; + int64_t pad_right = padding[1]; + int64_t pad_top = padding[2]; + int64_t pad_bottom = padding[3]; + int64_t pad_front = padding[4]; + int64_t pad_back = padding[5]; + int64_t dim_w = 3; + int64_t dim_h = 2; + int64_t dim_d = 1; + + if (input.dim() == 5) { + // batch mode + dim_w++; + dim_h++; + dim_d++; + } + + int64_t input_d = input.size(dim_d); + int64_t input_h = input.size(dim_h); + int64_t input_w = input.size(dim_w); + int64_t output_d = input_d + pad_front + pad_back; + int64_t output_h = input_h + pad_top + pad_bottom; + int64_t output_w = input_w + pad_left + pad_right; + + TORCH_CHECK( + output_w == grad_output.size(dim_w), + "grad_output width unexpected." + " Expected: ", + output_w, + ", Got: ", + grad_output.size(dim_w)); + TORCH_CHECK( + output_h == grad_output.size(dim_h), + "grad_output height unexpected." + " Expected: ", + output_h, + ", Got: ", + grad_output.size(dim_h)); + TORCH_CHECK( + output_d == grad_output.size(dim_d), + "grad_output depth unexpected." 
+ " Expected: ", + output_d, + ", Got: ", + grad_output.size(dim_d)); + + if (grad_input.defined()) { + xpu::resize_out(grad_input, input.sizes(), {}, input.options()); + } else { + grad_input = xpu::create_out(input.sizes(), {}, input.options()); + } +} + +Tensor XPUNativeFunctions::reflection_pad1d( + const Tensor& input, + IntArrayRef padding) { + Tensor output; + reflection_pad1d_meta(output, input, padding); + native::xpu::reflection_pad1d_kernel(output, input, padding); + return output; +} + +Tensor& XPUNativeFunctions::reflection_pad1d_out( + const Tensor& input, + IntArrayRef padding, + Tensor& output) { + reflection_pad1d_meta(output, input, padding); + native::xpu::reflection_pad1d_kernel(output, input, padding); + return output; +} + +Tensor XPUNativeFunctions::reflection_pad1d_backward( + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding) { + Tensor grad_input; + reflection_pad1d_backward_meta(grad_input, grad_output, input, padding); + native::xpu::reflection_pad1d_backward_kernel( + grad_input, grad_output, input, padding); + return grad_input; +} + +Tensor& XPUNativeFunctions::reflection_pad1d_backward_out( + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + Tensor& grad_input) { + native::xpu::reflection_pad1d_backward_kernel( + grad_input, grad_output, input, padding); + return grad_input; +} + Tensor& XPUNativeFunctions::reflection_pad2d_out( const Tensor& input, IntArrayRef padding, @@ -49,4 +361,44 @@ Tensor XPUNativeFunctions::reflection_pad2d_backward( return grad_input; } +Tensor XPUNativeFunctions::reflection_pad3d( + const Tensor& input, + IntArrayRef padding) { + Tensor output; + reflection_pad3d_meta(output, input, padding); + native::xpu::reflection_pad3d_kernel(output, input, padding); + return output; +} + +Tensor& XPUNativeFunctions::reflection_pad3d_out( + const Tensor& input, + IntArrayRef padding, + Tensor& output) { + reflection_pad3d_meta(output, input, padding); + native::xpu::reflection_pad3d_kernel(output, input, padding); + return output; +} + +Tensor XPUNativeFunctions::reflection_pad3d_backward( + const Tensor& grad_output, + const Tensor& input, + at::IntArrayRef padding) { + Tensor grad_input; + reflection_pad3d_backward_meta(grad_input, grad_output, input, padding); + native::xpu::reflection_pad3d_backward_kernel( + grad_input, grad_output, input, padding); + return grad_input; +} + +Tensor& XPUNativeFunctions::reflection_pad3d_backward_out( + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + Tensor& grad_input) { + reflection_pad3d_backward_meta(grad_input, grad_output, input, padding); + native::xpu::reflection_pad3d_backward_kernel( + grad_input, grad_output, input, padding); + return grad_input; +} + } // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 7bfdd6abd..471081ccd 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -299,8 +299,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "prod", "prod.int_out", "put_", - "reflection_pad1d_backward.grad_input", - "reflection_pad1d.out", "renorm.out", "repeat_interleave.Tensor", "replication_pad1d_backward.grad_input", @@ -380,8 +378,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "vdot", "xlogy.OutTensor", "_upsample_bicubic2d_aa.out", - "reflection_pad3d.out", - "reflection_pad3d_backward.grad_input", "replication_pad3d.out", "replication_pad3d_backward", }; diff --git 
a/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp b/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp index b96bb2d7a..1627a5def 100644 --- a/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReflectionPadKernels.cpp @@ -5,6 +5,7 @@ #pragma GCC diagnostic ignored "-Wreturn-type" #include +#include #include #include #include @@ -15,6 +16,30 @@ namespace at::native::xpu { +inline std::pair get_index_mapping1d( + int64_t input_w, + int64_t output_w, + int64_t output_x, + int64_t pad_l, + const sycl::nd_item<3> item) { + auto input_offset = + (item.get_group(1) + item.get_group(0) * item.get_group_range(1)) * + input_w; + auto output_offset = + (item.get_group(1) + item.get_group(0) * item.get_group_range(1)) * + output_w; + + auto i_start_x = std::max(int64_t(0), -pad_l); + auto o_start_x = std::max(int64_t(0), pad_l); + + int64_t input_x = std::abs(output_x - pad_l) - + std::abs(output_x - (input_w + pad_l - 1)) - output_x + 2 * pad_l + + input_w - 1 - o_start_x + i_start_x; + + return std::make_pair( + input_offset + input_x, output_offset + output_x); +} + inline std::pair get_index_mapping2d( int64_t input_dim_x, int64_t input_dim_y, @@ -26,10 +51,10 @@ inline std::pair get_index_mapping2d( const sycl::nd_item<3> item) { // 3D grid of 1D blocks auto input_offset = - (item.get_group(1) + item.get_group(2) * item.get_group_range(1)) * + (item.get_group(1) + item.get_group(0) * item.get_group_range(1)) * input_dim_x * input_dim_y; auto output_offset = - (item.get_group(1) + item.get_group(2) * item.get_group_range(1)) * + (item.get_group(1) + item.get_group(0) * item.get_group_range(1)) * output_dim_x * output_dim_y; auto output_x = output_xy % output_dim_x; @@ -53,10 +78,122 @@ inline std::pair get_index_mapping2d( output_offset + output_y * output_dim_x + output_x); } +template +struct ReflectionPad1dKernelFunctor { + void operator()(sycl::nd_item<3> item) const { + auto output_x = item.get_global_id(2); + + if (output_x < output_w_) { + // input index and output index mapping + auto index_pair = + get_index_mapping1d(input_w_, output_w_, output_x, pad_l_, item); + output_data_[index_pair.second] = input_data_[index_pair.first]; + } + } + ReflectionPad1dKernelFunctor( + scalar_t* input_data, + scalar_t* output_data, + int64_t input_w, + int64_t pad_l, + int64_t output_w) + : input_data_(input_data), + output_data_(output_data), + input_w_(input_w), + pad_l_(pad_l), + output_w_(output_w) {} + + private: + scalar_t* input_data_; + scalar_t* output_data_; + int64_t input_w_; + int64_t pad_l_; + int64_t output_w_; +}; + +template +void reflection_pad1d_template( + scalar_t* input, + scalar_t* output, + int64_t input_w, + int64_t pad_l, + int64_t pad_r, + int64_t nbatch, + int64_t nplane, + int64_t output_w) { + auto queue = getCurrentSYCLQueue(); + int64_t work_group_size = syclMaxWorkItemsPerEU(); + int64_t work_group_num = at::ceil_div(output_w, work_group_size); + + ReflectionPad1dKernelFunctor kfn( + input, output, input_w, pad_l, output_w); + sycl_kernel_submit( + sycl::range<3>(nbatch, nplane, work_group_size * work_group_num), + sycl::range<3>(1, 1, work_group_size), + queue, + kfn); +} + +template +struct ReflectionPad1dBackwardKernelFunctor { + void operator()(sycl::nd_item<3> item) const { + auto output_x = item.get_global_id(2); + + if (output_x < output_w_) { + // grad input index and grad output index mapping + auto index_pair = + get_index_mapping1d(input_w_, output_w_, output_x, pad_l_, item); + atomicAdd( + 
(sycl_global_ptr)&grad_input_data_[index_pair.first], + grad_output_data_[index_pair.second]); + } + } + ReflectionPad1dBackwardKernelFunctor( + scalar_t* grad_input_data, + scalar_t* grad_output_data, + int64_t input_w, + int64_t pad_l, + int64_t output_w) + : grad_input_data_(grad_input_data), + grad_output_data_(grad_output_data), + input_w_(input_w), + pad_l_(pad_l), + output_w_(output_w) {} + + private: + scalar_t* grad_input_data_; + scalar_t* grad_output_data_; + int64_t input_w_; + int64_t pad_l_; + int64_t output_w_; +}; + +template +void reflection_pad1d_backward_template( + scalar_t* grad_input, + scalar_t* grad_output, + int64_t input_w, + int64_t pad_l, + int64_t pad_r, + int64_t nbatch, + int64_t nplane, + int64_t output_w) { + auto queue = getCurrentSYCLQueue(); + int64_t work_group_size = syclMaxWorkItemsPerEU(); + int64_t work_group_num = at::ceil_div(output_w, work_group_size); + + ReflectionPad1dBackwardKernelFunctor kfn( + grad_input, grad_output, input_w, pad_l, output_w); + sycl_kernel_submit( + sycl::range<3>(nbatch, nplane, work_group_size * work_group_num), + sycl::range<3>(1, 1, work_group_size), + queue, + kfn); +} + template struct ReflectionPad2dKernellFunctor { void operator()(sycl::nd_item<3> item) const { - auto output_xy = item.get_global_id(0); + auto output_xy = item.get_global_id(2); if (output_xy < output_dim_x_ * output_dim_y_) { // input index and output index mapping @@ -131,8 +268,8 @@ void reflection_pad2d_template( output_dim_x, output_dim_y); sycl_kernel_submit( - sycl::range<3>(work_group_size * work_group_num, nplane, nbatch), - sycl::range<3>(work_group_size, 1, 1), + sycl::range<3>(nbatch, nplane, work_group_size * work_group_num), + sycl::range<3>(1, 1, work_group_size), queue, kfn); } @@ -140,7 +277,7 @@ void reflection_pad2d_template( template struct ReflectionPad2dBackwardKernelFunctor { void operator()(sycl::nd_item<3> item) const { - auto output_xy = item.get_global_id(0); + auto output_xy = item.get_global_id(2); if (output_xy < output_dim_x_ * output_dim_y_) { // grad input index and grad output index mapping @@ -216,12 +353,266 @@ void reflection_pad2d_backward_template( output_dim_x, output_dim_y); sycl_kernel_submit( - sycl::range<3>(work_group_size * work_group_num, nplane, nbatch), - sycl::range<3>(work_group_size, 1, 1), + sycl::range<3>(nbatch, nplane, work_group_size * work_group_num), + sycl::range<3>(1, 1, work_group_size), queue, kfn); } +template +struct ParallelReflectionPad3dKernelFunctor { + void operator()(sycl::nd_item<3> item) const { + auto output_id = item.get_global_id(2); + if (output_id >= output_plane_size_) { + return; + } + + int64_t output_x = output_id % output_.size(4); + int64_t output_y = (output_id / output_.size(4)) % output_.size(3); + int64_t output_z = output_id / (output_.size(3) * output_.size(4)); + + int64_t i_start_x = std::max(int64_t(0), -pad_left_); + int64_t o_start_x = std::max(int64_t(0), pad_left_); + int64_t i_start_y = std::max(int64_t(0), -pad_top_); + int64_t o_start_y = std::max(int64_t(0), pad_top_); + int64_t i_start_z = std::max(int64_t(0), -pad_front_); + int64_t o_start_z = std::max(int64_t(0), pad_front_); + + int64_t input_x = std::abs(output_x - pad_left_) - + std::abs(output_x - (input_.size(4) + pad_left_ - 1)) - output_x + + 2 * pad_left_ + input_.size(4) - 1 - o_start_x + i_start_x; + int64_t input_y = std::abs(output_y - pad_top_) - + std::abs(output_y - (input_.size(3) + pad_top_ - 1)) - output_y + + 2 * pad_top_ + input_.size(3) - 1 - o_start_y + i_start_y; + + 
int64_t input_z = std::abs(output_z - pad_front_) - + std::abs(output_z - (input_.size(2) + pad_front_ - 1)) - output_z + + 2 * pad_front_ + input_.size(2) - 1 - o_start_z + i_start_z; + + f_(input_, + output_, + item.get_group(1), + item.get_group(0), + output_z, + output_y, + output_x, + input_z, + input_y, + input_x); + } + ParallelReflectionPad3dKernelFunctor( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t pad_left, + int64_t pad_top, + int64_t pad_front, + const F f, + int64_t output_plane_size) + : input_(input), + output_(output), + pad_left_(pad_left), + pad_top_(pad_top), + pad_front_(pad_front), + f_(f), + output_plane_size_(output_plane_size) {} + + private: + PackedTensorAccessor64 input_; + PackedTensorAccessor64 output_; + int64_t pad_left_; + int64_t pad_top_; + int64_t pad_front_; + const F f_; + int64_t output_plane_size_; +}; + +template +inline void parallel_reflection_pad3d( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t pad_left, + int64_t pad_top, + int64_t pad_front, + const F& f) { + auto queue = getCurrentSYCLQueue(); + int64_t output_plane_size = output.size(2) * output.size(3) * output.size(4); + int64_t work_group_size = syclMaxWorkItemsPerEU(); + int64_t work_group_num = at::ceil_div(output_plane_size, work_group_size); + int64_t nplane = input.size(1); + int64_t nbatch = input.size(0); + + ParallelReflectionPad3dKernelFunctor kfn( + input, output, pad_left, pad_top, pad_front, f, output_plane_size); + sycl_kernel_submit( + sycl::range<3>(nbatch, nplane, work_group_size * work_group_num), + sycl::range<3>(1, 1, work_group_size), + queue, + kfn); +} + +template +struct reflection_pad3d_kernel_functor { + void operator()( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t plane, + int64_t batch, + int64_t output_z, + int64_t output_y, + int64_t output_x, + int64_t input_z, + int64_t input_y, + int64_t input_x) const { + auto value_to_copy = input[batch][plane][input_z][input_y][input_x]; + output[batch][plane][output_z][output_y][output_x] = value_to_copy; + } +}; + +template +void reflection_pad3d_template( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t pad_left, + int64_t pad_top, + int64_t pad_front) { + reflection_pad3d_kernel_functor f; + parallel_reflection_pad3d(input, output, pad_left, pad_top, pad_front, f); +} + +template +struct reflection_pad3d_backward_kernel_functor { + void operator()( + PackedTensorAccessor64 grad_input, + PackedTensorAccessor64 grad_output, + int64_t plane, + int64_t batch, + int64_t output_z, + int64_t output_y, + int64_t output_x, + int64_t input_z, + int64_t input_y, + int64_t input_x) const { + auto value_to_add = grad_output[batch][plane][output_z][output_y][output_x]; + auto target = (sycl_global_ptr)&grad_input[batch][plane][input_z] + [input_y][input_x]; + atomicAdd(target, value_to_add); + } +}; + +template +void reflection_pad3d_backward_template( + PackedTensorAccessor64 grad_input, + PackedTensorAccessor64 grad_output, + int64_t pad_left, + int64_t pad_top, + int64_t pad_front) { + reflection_pad3d_backward_kernel_functor f; + parallel_reflection_pad3d( + grad_input, grad_output, pad_left, pad_top, pad_front, f); +} + +void reflection_pad1d_kernel( + Tensor& output, + const Tensor& input_, + IntArrayRef padding) { + TORCH_CHECK( + canUse32BitIndexMath(input_), + "input tensor must fit into 32-bit index math"); + + if (output.numel() == 0) { + return; + } + + int64_t dim_plane = 0; + int64_t dim_w = 1; + int64_t 
nbatch = 1; + + if (input_.ndimension() == 3) { + nbatch = input_.size(0); + dim_plane++; + dim_w++; + } + + int64_t pad_l = padding[0]; + int64_t pad_r = padding[1]; + + int64_t nplane = input_.size(dim_plane); + int64_t input_w = input_.size(dim_w); + int64_t output_w = input_w + pad_l + pad_r; + + Tensor input = input_.contiguous(); + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kHalf, kBFloat16, input.scalar_type(), "reflection_pad1d_xpu", [&] { + reflection_pad1d_template( + input.data_ptr(), + output.data_ptr(), + input_w, + pad_l, + pad_r, + nbatch, + nplane, + output_w); + }); +} + +void reflection_pad1d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output_, + const Tensor& input, + IntArrayRef padding) { + globalContext().alertNotDeterministic("reflection_pad1d_backward_out_xpu"); + grad_input.zero_(); + + if (grad_input.numel() == 0) { + return; + } + + TORCH_CHECK( + canUse32BitIndexMath(input), + "input tensor must fit into 32-bit index math"); + + TORCH_CHECK( + canUse32BitIndexMath(grad_output_), + "input tensor must fit into 32-bit index math"); + + int64_t dim_plane = 0; + int64_t dim_w = 1; + int64_t nbatch = 1; + + if (input.ndimension() == 3) { + nbatch = input.size(0); + dim_plane++; + dim_w++; + } + + int64_t pad_l = padding[0]; + int64_t pad_r = padding[1]; + + int64_t nplane = input.size(dim_plane); + int64_t input_w = input.size(dim_w); + int64_t output_w = input_w + pad_l + pad_r; + + Tensor grad_output = grad_output_.contiguous(); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + kHalf, + kBFloat16, + grad_input.scalar_type(), + "reflection_pad1d_backward_xpu", + [&] { + reflection_pad1d_backward_template( + grad_input.data_ptr(), + grad_output.data_ptr(), + input_w, + pad_l, + pad_r, + nbatch, + nplane, + output_w); + }); +} + void reflection_pad2d_kernel( Tensor& output, const Tensor& input_, @@ -396,6 +787,90 @@ void reflection_pad2d_backward_kernel( }); } +void reflection_pad3d_kernel( + Tensor& output, + const Tensor& input_, + IntArrayRef padding) { + TORCH_CHECK( + canUse32BitIndexMath(input_), + "input tensor must fit into 32-bit index math"); + + if (output.numel() == 0) { + return; + } + + int64_t pad_left = padding[0]; + int64_t pad_top = padding[2]; + int64_t pad_front = padding[4]; + + auto input = input_.contiguous(); + bool batch_mode = (input.dim() == 5); + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kHalf, kBFloat16, input.scalar_type(), "reflection_pad3d_xpu", [&] { + auto input_inner = input; + auto output_inner = output; + if (!batch_mode) { + input_inner = input.unsqueeze(0); + output_inner = output.unsqueeze(0); + } + + auto input_packed = input_inner.packed_accessor64(); + auto output_packed = output_inner.packed_accessor64(); + + reflection_pad3d_template( + input_packed, output_packed, pad_left, pad_top, pad_front); + }); +} + +void reflection_pad3d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding) { + globalContext().alertNotDeterministic("reflection_pad3d_backward_out_xpu"); + TORCH_CHECK( + canUse32BitIndexMath(input), + "input tensor must fit into 32-bit index math"); + TORCH_CHECK( + canUse32BitIndexMath(grad_output), + "input tensor must fit into 32-bit index math"); + + if (grad_input.numel() == 0) { + return; + } + grad_input.zero_(); + + int64_t pad_left = padding[0]; + int64_t pad_top = padding[2]; + int64_t pad_front = padding[4]; + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + kHalf, + kBFloat16, + input.scalar_type(), + 
"reflection_pad3d_backward_xpu", + [&] { + auto grad_input_ = grad_input; + auto grad_output_ = grad_output; + if (input.dim() == 4) { + // non-batch mode + grad_input_ = grad_input.unsqueeze(0); + grad_output_ = grad_output.unsqueeze(0); + } + + auto grad_input_packed = grad_input_.packed_accessor64(); + auto grad_output_packed = grad_output_.packed_accessor64(); + + reflection_pad3d_backward_template( + grad_input_packed, + grad_output_packed, + pad_left, + pad_top, + pad_front); + }); +} + } // namespace at::native::xpu #pragma GCC diagnostic pop diff --git a/src/ATen/native/xpu/sycl/ReflectionPadKernels.h b/src/ATen/native/xpu/sycl/ReflectionPadKernels.h index 8f103b73b..a21f6c8ee 100644 --- a/src/ATen/native/xpu/sycl/ReflectionPadKernels.h +++ b/src/ATen/native/xpu/sycl/ReflectionPadKernels.h @@ -4,6 +4,17 @@ namespace at::native::xpu { +void reflection_pad1d_kernel( + Tensor& output, + const Tensor& input_, + IntArrayRef padding); + +void reflection_pad1d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output_, + const Tensor& input, + IntArrayRef padding); + void reflection_pad2d_kernel( Tensor& output, const Tensor& input_, @@ -15,4 +26,15 @@ void reflection_pad2d_backward_kernel( const Tensor& input, IntArrayRef padding); +void reflection_pad3d_kernel( + Tensor& output, + const Tensor& input_, + IntArrayRef padding); + +void reflection_pad3d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding); + } // namespace at::native::xpu \ No newline at end of file diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 2ecc6790b..f103a7795 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -464,10 +464,18 @@ supported: - searchsorted.Scalar - searchsorted.Scalar_out - trace + - reflection_pad1d + - reflection_pad1d.out + - reflection_pad1d_backward + - reflection_pad1d_backward.grad_input - reflection_pad2d - reflection_pad2d.out - reflection_pad2d_backward - reflection_pad2d_backward.grad_input + - reflection_pad3d + - reflection_pad3d.out + - reflection_pad3d_backward + - reflection_pad3d_backward.grad_input - native_group_norm - native_group_norm_backward - elu From f76c85ba31ce5c1426c3f8c25815d13e05a3243e Mon Sep 17 00:00:00 2001 From: hjhee Date: Fri, 12 Jul 2024 09:45:13 +0800 Subject: [PATCH 4/6] Add atan::asinh, atan::atan, atan::atan2, atan::atanh, atan::asin, atan::cosh (#457) - asinh - asinh.out - asinh_ - atan - atan.out - atan_ - atan2 - atan2.out - atan2_ - atanh - atanh.out - atanh_ - asin - asin.out - asin_ - cosh - cosh.out - cosh_ --------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/BinaryOps.cpp | 25 ++++ src/ATen/native/xpu/UnaryOps.cpp | 115 ++++++++++++++++++ src/ATen/native/xpu/XPUFallback.template | 6 - .../xpu/sycl/BinaryGeometricKernels.cpp | 17 ++- .../native/xpu/sycl/BinaryGeometricKernels.h | 2 + .../xpu/sycl/UnaryGeometricAsinKernel.cpp | 40 ++++++ .../xpu/sycl/UnaryGeometricAsinKernel.h | 9 ++ .../xpu/sycl/UnaryGeometricAsinhKernel.cpp | 40 ++++++ .../xpu/sycl/UnaryGeometricAsinhKernel.h | 9 ++ .../xpu/sycl/UnaryGeometricAtanKernel.cpp | 40 ++++++ .../xpu/sycl/UnaryGeometricAtanKernel.h | 9 ++ .../xpu/sycl/UnaryGeometricAtanhKernel.cpp | 40 ++++++ .../xpu/sycl/UnaryGeometricAtanhKernel.h | 9 ++ .../xpu/sycl/UnaryGeometricCoshKernel.cpp | 40 ++++++ .../xpu/sycl/UnaryGeometricCoshKernel.h | 9 ++ test/xpu/extended/run_test_with_skip.py | 18 +++ test/xpu/run_test_with_skip.py | 41 ++++--- test/xpu/xpu_test_utils.py | 14 ++- yaml/xpu_functions.yaml | 18 
+++ 19 files changed, 471 insertions(+), 30 deletions(-) create mode 100644 src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp create mode 100644 src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h create mode 100644 src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp create mode 100644 src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h create mode 100644 src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp create mode 100644 src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h create mode 100644 src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp create mode 100644 src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h create mode 100644 src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp create mode 100644 src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h diff --git a/src/ATen/native/xpu/BinaryOps.cpp b/src/ATen/native/xpu/BinaryOps.cpp index c50f8305d..02a22b11a 100644 --- a/src/ATen/native/xpu/BinaryOps.cpp +++ b/src/ATen/native/xpu/BinaryOps.cpp @@ -477,4 +477,29 @@ Tensor XPUNativeFunctions::sigmoid_backward( return iter.output(); } +Tensor XPUNativeFunctions::atan2(const Tensor& self, const Tensor& other) { + Tensor out; + TensorIterator iter; + iter.build_borrowing_binary_float_op(out, self, other); + native::xpu::atan2_kernel(iter); + return iter.output(); +} + +Tensor& XPUNativeFunctions::atan2_(Tensor& self, const Tensor& other) { + TensorIterator iter; + iter.build_borrowing_binary_float_op(self, self, other); + native::xpu::atan2_kernel(iter); + return self; +} + +Tensor& XPUNativeFunctions::atan2_out( + const Tensor& self, + const Tensor& other, + Tensor& out) { + TensorIterator iter; + iter.build_borrowing_binary_float_op(out, self, other); + native::xpu::atan2_kernel(iter); + return out; +} + } // namespace at diff --git a/src/ATen/native/xpu/UnaryOps.cpp b/src/ATen/native/xpu/UnaryOps.cpp index ffc528fab..1d3137f79 100644 --- a/src/ATen/native/xpu/UnaryOps.cpp +++ b/src/ATen/native/xpu/UnaryOps.cpp @@ -9,7 +9,12 @@ #include #include #include +#include +#include +#include +#include #include +#include #include #include #include @@ -516,6 +521,116 @@ Tensor& XPUNativeFunctions::erfc_out(const Tensor& self, Tensor& out) { return out; } +Tensor XPUNativeFunctions::asinh(const Tensor& self) { + Tensor out; + TensorIterator iter; + iter.build_borrowing_unary_float_op(out, self); + native::xpu::asinh_kernel(iter); + return iter.output(); +} + +Tensor& XPUNativeFunctions::asinh_(Tensor& self) { + TensorIterator iter; + iter.build_borrowing_unary_float_op(self, self); + native::xpu::asinh_kernel(iter); + return self; +} + +Tensor& XPUNativeFunctions::asinh_out(const Tensor& self, Tensor& out) { + TensorIterator iter; + iter.build_borrowing_unary_float_op(out, self); + native::xpu::asinh_kernel(iter); + return out; +} + +Tensor XPUNativeFunctions::asin(const Tensor& self) { + Tensor out; + TensorIterator iter; + iter.build_borrowing_unary_float_op(out, self); + native::xpu::asin_kernel(iter); + return iter.output(); +} + +Tensor& XPUNativeFunctions::asin_(Tensor& self) { + TensorIterator iter; + iter.build_borrowing_unary_float_op(self, self); + native::xpu::asin_kernel(iter); + return self; +} + +Tensor& XPUNativeFunctions::asin_out(const Tensor& self, Tensor& out) { + TensorIterator iter; + iter.build_borrowing_unary_float_op(out, self); + native::xpu::asin_kernel(iter); + return out; +} + +Tensor XPUNativeFunctions::atan(const Tensor& self) { + Tensor out; + TensorIterator iter; + iter.build_borrowing_unary_float_op(out, self); + native::xpu::atan_kernel(iter); + 
return iter.output(); +} + +Tensor& XPUNativeFunctions::atan_(Tensor& self) { + TensorIterator iter; + iter.build_borrowing_unary_float_op(self, self); + native::xpu::atan_kernel(iter); + return self; +} + +Tensor& XPUNativeFunctions::atan_out(const Tensor& self, Tensor& out) { + TensorIterator iter; + iter.build_borrowing_unary_float_op(out, self); + native::xpu::atan_kernel(iter); + return out; +} + +Tensor XPUNativeFunctions::atanh(const Tensor& self) { + Tensor out; + TensorIterator iter; + iter.build_borrowing_unary_float_op(out, self); + native::xpu::atanh_kernel(iter); + return iter.output(); +} + +Tensor& XPUNativeFunctions::atanh_(Tensor& self) { + TensorIterator iter; + iter.build_borrowing_unary_float_op(self, self); + native::xpu::atanh_kernel(iter); + return self; +} + +Tensor& XPUNativeFunctions::atanh_out(const Tensor& self, Tensor& out) { + TensorIterator iter; + iter.build_borrowing_unary_float_op(out, self); + native::xpu::atanh_kernel(iter); + return out; +} + +Tensor XPUNativeFunctions::cosh(const Tensor& self) { + Tensor out; + TensorIterator iter; + iter.build_borrowing_unary_float_op(out, self); + native::xpu::cosh_kernel(iter); + return iter.output(); +} + +Tensor& XPUNativeFunctions::cosh_(Tensor& self) { + TensorIterator iter; + iter.build_borrowing_unary_float_op(self, self); + native::xpu::cosh_kernel(iter); + return self; +} + +Tensor& XPUNativeFunctions::cosh_out(const Tensor& self, Tensor& out) { + TensorIterator iter; + iter.build_borrowing_unary_float_op(out, self); + native::xpu::cosh_kernel(iter); + return out; +} + Tensor& XPUNativeFunctions::conj_physical_out(const Tensor& self, Tensor& out) { auto iter = TensorIterator::unary_op(out, self); native::xpu::conj_physical_kernel(iter); diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 471081ccd..2d109b32a 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -162,11 +162,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "aminmax.out", "angle", "argmin.out", - "asinh.out", - "asin.out", - "atan2.out", - "atanh.out", - "atan.out", "avg_pool3d_backward.grad_input", "avg_pool3d.out", "binary_cross_entropy", @@ -180,7 +175,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "cholesky_inverse", "_cholesky_solve_helper", "copysign.out", - "cosh.out", "count_nonzero.dim_IntList", "_ctc_loss", "_ctc_loss_backward", diff --git a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp index c93afe4bf..e170760e8 100644 --- a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.cpp @@ -1,13 +1,28 @@ #include #include #include - #include namespace at { namespace native { namespace xpu { +template +struct Atan2Functor { + scalar_t operator()(scalar_t a, scalar_t b) const { + return std::atan2(a, b); + } +}; + +void atan2_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::BFloat16, + at::ScalarType::Half, + iter.common_dtype(), + "atan2_xpu", + [&]() { gpu_kernel(iter, Atan2Functor()); }); +} + template struct HypotFunctor { scalar_t operator()(scalar_t a, scalar_t b) const { diff --git a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h index e37dd6dbf..588d52c4f 100644 --- a/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryGeometricKernels.h @@ -4,6 +4,8 @@ namespace at::native::xpu { +void 
atan2_kernel(TensorIteratorBase& iter); + void hypot_kernel(TensorIteratorBase& iter); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp new file mode 100644 index 000000000..c8b1aaca6 --- /dev/null +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.cpp @@ -0,0 +1,40 @@ +#include +#include + +#include + +namespace at::native::xpu { + +template +struct AsinComplexFunctor { + using opmath_t = at::opmath_type; + scalar_t operator()(const scalar_t a) const { + return std::asin(static_cast(a)); + } +}; + +template +struct AsinFunctor { + scalar_t operator()(const scalar_t a) const { + return std::asin(a); + } +}; + +void asin_kernel(TensorIteratorBase& iter) { + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { + AT_DISPATCH_COMPLEX_TYPES_AND( + kComplexHalf, common_dtype, "asin_xpu", [&]() { + gpu_kernel(iter, AsinComplexFunctor()); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + common_dtype, + "asin_xpu", + [&]() { gpu_kernel(iter, AsinFunctor()); }); + } +} + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h new file mode 100644 index 000000000..194ce6479 --- /dev/null +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h @@ -0,0 +1,9 @@ +#pragma once + +#include + +namespace at::native::xpu { + +void asin_kernel(TensorIteratorBase& iter); + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp new file mode 100644 index 000000000..009a68b47 --- /dev/null +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.cpp @@ -0,0 +1,40 @@ +#include +#include + +#include + +namespace at::native::xpu { + +template +struct AsinhComplexFunctor { + using opmath_t = at::opmath_type; + scalar_t operator()(const scalar_t a) const { + return std::asinh(static_cast(a)); + } +}; + +template +struct AsinhFunctor { + scalar_t operator()(const scalar_t a) const { + return std::asinh(a); + } +}; + +void asinh_kernel(TensorIteratorBase& iter) { + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { + AT_DISPATCH_COMPLEX_TYPES_AND( + kComplexHalf, common_dtype, "asinh_xpu", [&]() { + gpu_kernel(iter, AsinhComplexFunctor()); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + common_dtype, + "asinh_xpu", + [&]() { gpu_kernel(iter, AsinhFunctor()); }); + } +} + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h new file mode 100644 index 000000000..4d37288de --- /dev/null +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h @@ -0,0 +1,9 @@ +#pragma once + +#include + +namespace at::native::xpu { + +void asinh_kernel(TensorIteratorBase& iter); + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp new file mode 100644 index 000000000..f241e9da3 --- /dev/null +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.cpp @@ -0,0 +1,40 @@ +#include +#include + +#include + +namespace at::native::xpu { + +template +struct AtanComplexFunctor { + using opmath_t = at::opmath_type; + scalar_t operator()(const scalar_t a) const { + return 
std::atan(static_cast(a)); + } +}; + +template +struct AtanFunctor { + scalar_t operator()(const scalar_t a) const { + return std::atan(a); + } +}; + +void atan_kernel(TensorIteratorBase& iter) { + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { + AT_DISPATCH_COMPLEX_TYPES_AND( + kComplexHalf, common_dtype, "atan_xpu", [&]() { + gpu_kernel(iter, AtanComplexFunctor()); + }); + } else { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + common_dtype, + "atan_xpu", + [&]() { gpu_kernel(iter, AtanFunctor()); }); + } +} + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h new file mode 100644 index 000000000..022720223 --- /dev/null +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h @@ -0,0 +1,9 @@ +#pragma once + +#include + +namespace at::native::xpu { + +void atan_kernel(TensorIteratorBase& iter); + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp new file mode 100644 index 000000000..630a64d39 --- /dev/null +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.cpp @@ -0,0 +1,40 @@ +#include +#include + +#include + +namespace at::native::xpu { + +template +struct AtanhComplexFunctor { + using opmath_t = at::opmath_type; + scalar_t operator()(const scalar_t a) const { + return std::atanh(static_cast(a)); + } +}; + +template +struct AtanhFunctor { + scalar_t operator()(const scalar_t a) const { + return std::atanh(a); + } +}; + +void atanh_kernel(TensorIteratorBase& iter) { + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { + AT_DISPATCH_COMPLEX_TYPES_AND( + kComplexHalf, common_dtype, "atanh_xpu", [&]() { + gpu_kernel(iter, AtanhComplexFunctor()); + }); + } else { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + common_dtype, + "atanh_xpu", + [&]() { gpu_kernel(iter, AtanhFunctor()); }); + } +} + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h new file mode 100644 index 000000000..5536641e7 --- /dev/null +++ b/src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h @@ -0,0 +1,9 @@ +#pragma once + +#include + +namespace at::native::xpu { + +void atanh_kernel(TensorIteratorBase& iter); + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp new file mode 100644 index 000000000..11440b3d3 --- /dev/null +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.cpp @@ -0,0 +1,40 @@ +#include +#include + +#include + +namespace at::native::xpu { + +template +struct CoshComplexFunctor { + using opmath_t = at::opmath_type; + scalar_t operator()(scalar_t a) const { + return std::cosh(static_cast(a)); + } +}; + +template +struct CoshFunctor { + scalar_t operator()(scalar_t a) const { + return std::cosh(a); + } +}; + +void cosh_kernel(TensorIteratorBase& iter) { + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { + AT_DISPATCH_COMPLEX_TYPES_AND( + kComplexHalf, common_dtype, "cosh_xpu", [&]() { + gpu_kernel(iter, CoshComplexFunctor()); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + common_dtype, + "cosh_xpu", + [&]() { gpu_kernel(iter, CoshFunctor()); }); + } +} 
+ +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h new file mode 100644 index 000000000..7f031e3ff --- /dev/null +++ b/src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h @@ -0,0 +1,9 @@ +#pragma once + +#include + +namespace at::native::xpu { + +void cosh_kernel(TensorIteratorBase& iter); + +} // namespace at::native::xpu diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 943d46465..108b3073d 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -26,6 +26,21 @@ "test_compare_cpu_acos_xpu_complex64", "test_compare_cpu_acosh_xpu_complex64", + # got inconsistent values between CPU / XPU + # AssertionError: Tensor-likes are not close! + # compute results contain nan / inf + "test_compare_cpu_acosh_xpu_complex64", + "test_compare_cpu_asin_xpu_complex128", + "test_compare_cpu_asin_xpu_complex64", + "test_compare_cpu_asinh_xpu_complex128", + "test_compare_cpu_asinh_xpu_complex64", + "test_compare_cpu_atan_xpu_complex128", + "test_compare_cpu_atan_xpu_complex64", + + # skip random failure due to accuracy + # AssertionError: Tensor-likes are not close! + "test_compare_cpu_atan2_xpu_bfloat16", + # CPU result is not golden reference "test_compare_cpu_div_floor_rounding_xpu_bfloat16", "test_compare_cpu_div_trunc_rounding_xpu_float16", @@ -59,6 +74,9 @@ # https://github.com/intel/torch-xpu-ops/issues/281 "test_cow_input", + # The operator 'aten::sinh.out on the XPU backend is falling back to run on the CPU. + "test_cow_input_cosh_xpu_float32", + # XPU implementation is correct. # std::exp{-inf, nan}, the result is (±0,±0) (signs are unspecified) # std::exp{-inf, inf}, the result is (±0,±0) (signs are unspecified) diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index 28a073bd4..56fc93a48 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -802,7 +802,10 @@ def launch_test(test_case, skip_list=None, exe_list=None): # Retrieve the case, once avg_pool1d is supported. Test infra will change claimed dtypes in test case once the op is listed # in XPU supported operators. Then the case will work. "test_noncontiguous_samples_nn_functional_avg_pool1d_xpu_int64", - "test_noncontiguous_samples_nn_functional_local_response_norm_xpu_int64" + "test_noncontiguous_samples_nn_functional_local_response_norm_xpu_int64", + + # torch.complex32 - "sinh_cpu" not implemented for 'ComplexHalf' + "test_dtypes_cosh_xpu", ) res += launch_test("test_ops_xpu.py", skip_list) @@ -1514,35 +1517,17 @@ def launch_test(test_case, skip_list=None, exe_list=None): "_jiterator_", # CPU Fallback fails: Tensor-likes are not close! 
"test_reference_numerics_extremal__refs_acos_xpu_complex128", - "test_reference_numerics_extremal__refs_asin_xpu_complex128", - "test_reference_numerics_extremal__refs_asin_xpu_complex64", - "test_reference_numerics_extremal__refs_atan_xpu_complex128", - "test_reference_numerics_extremal__refs_atan_xpu_complex64", "test_reference_numerics_extremal__refs_exp2_xpu_complex128", "test_reference_numerics_extremal__refs_exp2_xpu_complex64", "test_reference_numerics_extremal__refs_nn_functional_tanhshrink_xpu_complex64", "test_reference_numerics_extremal_acos_xpu_complex128", - "test_reference_numerics_extremal_asin_xpu_complex128", - "test_reference_numerics_extremal_asin_xpu_complex64", - "test_reference_numerics_extremal_atan_xpu_complex128", - "test_reference_numerics_extremal_atan_xpu_complex64", "test_reference_numerics_extremal_exp2_xpu_complex128", "test_reference_numerics_extremal_exp2_xpu_complex64", "test_reference_numerics_extremal_nn_functional_tanhshrink_xpu_complex64", - "test_reference_numerics_large__refs_atan_xpu_complex128", - "test_reference_numerics_large__refs_atan_xpu_complex64", - "test_reference_numerics_large_atan_xpu_complex128", - "test_reference_numerics_large_atan_xpu_complex64", "test_reference_numerics_normal__refs_nn_functional_tanhshrink_xpu_complex64", "test_reference_numerics_normal_nn_functional_tanhshrink_xpu_complex64", - "test_reference_numerics_small__refs_atan_xpu_complex128", - "test_reference_numerics_small__refs_atan_xpu_complex64", - "test_reference_numerics_small_atan_xpu_complex128", - "test_reference_numerics_small_atan_xpu_complex64", - "test_reference_numerics_large__refs_atan_xpu_complex32", "test_reference_numerics_large__refs_tanh_xpu_complex32", "test_reference_numerics_large_tanh_xpu_complex32", - "test_reference_numerics_small__refs_atan_xpu_complex32", # For extreme value processing, Numpy and XPU results are inconsistent "test_reference_numerics_extremal__refs_log_xpu_complex64", "test_reference_numerics_extremal_log_xpu_complex64", @@ -1554,8 +1539,26 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_reference_numerics_extremal__refs_acosh_xpu_complex64", "test_reference_numerics_extremal_acos_xpu_complex64", "test_reference_numerics_extremal_acosh_xpu_complex64", + "test_reference_numerics_extremal__refs_asinh_xpu_complex64", + "test_reference_numerics_extremal_asinh_xpu_complex64", + "test_reference_numerics_extremal__refs_asin_xpu_complex64", + "test_reference_numerics_extremal_asin_xpu_complex64", "test_reference_numerics_large__refs_acosh_xpu_complex64", "test_reference_numerics_large_acosh_xpu_complex64", + "test_reference_numerics_large__refs_asinh_xpu_complex128", + "test_reference_numerics_large__refs_asinh_xpu_complex64", + "test_reference_numerics_large__refs_asinh_xpu_complex32", + "test_reference_numerics_large_asinh_xpu_complex128", + "test_reference_numerics_large_asinh_xpu_complex64", + "test_reference_numerics_large_asinh_xpu_complex32", + + # AssertionError: Tensor-likes are not close! + # exceeded maximum allowed difference + # Greatest absolute difference: 6.266784475883469e-05 at index (463, 204) (up to 1e-05 allowed) + # Greatest relative difference: 1.9145216356264427e-05 at index (463, 204) (up to 1.3e-06 allowed) + "test_reference_numerics_normal__refs_asinh_xpu_complex64", + "test_reference_numerics_normal_asinh_xpu_complex64", + # CPU Fallback fails # New ATen operators fails on CPU Fallback. # E.g. aten::special_spherical_bessel_j0, aten::special_airy_ai. 
diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 35c29d96b..e68df6e1e 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -50,7 +50,6 @@ "clamp_min", "clone", "copy", - "cos", "cumsum", "eq", "fill", @@ -95,7 +94,17 @@ "remainder", "reshape", "rsqrt", + "cos", + "cosh", + "acos", + "acosh", "sin", + "asin", + "asinh", + "tanh", + "atan", + "atan2", + "atanh", "sqrt", "sum", "amin", @@ -104,7 +113,6 @@ "std_mean", "var", "var_mean", - "tanh", "hypot", "unfold", "uniform", @@ -144,8 +152,6 @@ "searchsorted", "grid_sampler_2d", # "nn.functional.grid_sample", # Lack of XPU implementation of aten::grid_sampler_3d. - "acos", - "acosh", "addr", "cdist", "nn.functional.group_norm", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index f103a7795..a216619cb 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -511,6 +511,24 @@ supported: - addcmul.out - addcmul - addcmul_ + - asinh + - asinh.out + - asinh_ + - asin + - asin.out + - asin_ + - atan + - atan.out + - atan_ + - atan2 + - atan2.out + - atan2_ + - atanh + - atanh.out + - atanh_ + - cosh + - cosh.out + - cosh_ - randperm.generator_out - _amp_foreach_non_finite_check_and_unscale_ - _amp_update_scale_ From 78575b61a770ad3ee153a7ac440f21bceec24fc5 Mon Sep 17 00:00:00 2001 From: hjhee Date: Fri, 12 Jul 2024 13:03:52 +0800 Subject: [PATCH 5/6] Add aten::copysign, aten::count_nonzero (#481) - copysign.out - copysign.Tensor - copysign_.Tensor - copysign.Scalar - copysign_.Scalar - copysign.Scalar_out - count_nonzero.dim_IntList - count_nonzero --------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/BinaryOps.cpp | 23 ++++++++++++++++++ src/ATen/native/xpu/Indexing.cpp | 1 + .../native/xpu/TensorAdvancedIndexing.cpp | 4 ++++ src/ATen/native/xpu/XPUFallback.template | 2 -- src/ATen/native/xpu/sycl/CopysignKernel.cpp | 24 +++++++++++++++++++ src/ATen/native/xpu/sycl/CopysignKernel.h | 9 +++++++ test/xpu/xpu_test_utils.py | 2 ++ yaml/xpu_functions.yaml | 4 ++++ 8 files changed, 67 insertions(+), 2 deletions(-) create mode 100644 src/ATen/native/xpu/sycl/CopysignKernel.cpp create mode 100644 src/ATen/native/xpu/sycl/CopysignKernel.h diff --git a/src/ATen/native/xpu/BinaryOps.cpp b/src/ATen/native/xpu/BinaryOps.cpp index 02a22b11a..2ec722722 100644 --- a/src/ATen/native/xpu/BinaryOps.cpp +++ b/src/ATen/native/xpu/BinaryOps.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -502,4 +503,26 @@ Tensor& XPUNativeFunctions::atan2_out( return out; } +Tensor& XPUNativeFunctions::copysign_out( + const Tensor& self, + const Tensor& other, + Tensor& out) { + TensorIterator iter; + iter.build_borrowing_binary_float_op(out, self, other); + native::xpu::copysign_kernel(iter); + return out; +} + +Tensor& XPUNativeFunctions::copysign_(Tensor& self, const Tensor& other) { + return XPUNativeFunctions::copysign_out(self, other, self); +} + +Tensor XPUNativeFunctions::copysign(const Tensor& self, const Tensor& other) { + Tensor out; + TensorIterator iter; + iter.build_borrowing_binary_float_op(out, self, other); + native::xpu::copysign_kernel(iter); + return iter.output(); +} + } // namespace at diff --git a/src/ATen/native/xpu/Indexing.cpp b/src/ATen/native/xpu/Indexing.cpp index e80bee8ff..7b56ffc16 100644 --- a/src/ATen/native/xpu/Indexing.cpp +++ b/src/ATen/native/xpu/Indexing.cpp @@ -43,4 +43,5 @@ Tensor XPUNativeFunctions::index_select( auto out = at::empty({0}, self.options()); return index_select_out(self, dim, index, out); } + } // namespace at diff 
--git a/src/ATen/native/xpu/TensorAdvancedIndexing.cpp b/src/ATen/native/xpu/TensorAdvancedIndexing.cpp index 62bbd353d..a3b6d8c0e 100644 --- a/src/ATen/native/xpu/TensorAdvancedIndexing.cpp +++ b/src/ATen/native/xpu/TensorAdvancedIndexing.cpp @@ -1396,4 +1396,8 @@ Tensor& XPUNativeFunctions::gather_out( return out; } +Tensor XPUNativeFunctions::count_nonzero(const Tensor& self, IntArrayRef dims) { + return (self != 0).sum(dims); +} + } // namespace at diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 2d109b32a..f1b861881 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -174,8 +174,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "cholesky", "cholesky_inverse", "_cholesky_solve_helper", - "copysign.out", - "count_nonzero.dim_IntList", "_ctc_loss", "_ctc_loss_backward", "_cummax_helper", diff --git a/src/ATen/native/xpu/sycl/CopysignKernel.cpp b/src/ATen/native/xpu/sycl/CopysignKernel.cpp new file mode 100644 index 000000000..3b8351abf --- /dev/null +++ b/src/ATen/native/xpu/sycl/CopysignKernel.cpp @@ -0,0 +1,24 @@ +#include +#include + +#include + +namespace at::native::xpu { + +template +struct CopysignFunctor { + scalar_t operator()(scalar_t a, scalar_t b) const { + return std::copysign(a, b); + } +}; + +void copysign_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.common_dtype(), + "copysign_xpu", + [&]() { gpu_kernel_with_scalars(iter, CopysignFunctor()); }); +} + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/CopysignKernel.h b/src/ATen/native/xpu/sycl/CopysignKernel.h new file mode 100644 index 000000000..cf856728c --- /dev/null +++ b/src/ATen/native/xpu/sycl/CopysignKernel.h @@ -0,0 +1,9 @@ +#pragma once + +#include + +namespace at::native::xpu { + +void copysign_kernel(TensorIteratorBase& iter); + +} // namespace at::native::xpu diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index e68df6e1e..16031eda2 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -163,6 +163,8 @@ "renorm", "lerp", "conj_physical", + "copysign", + "count_nonzero" ] diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index a216619cb..f17a99051 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -532,6 +532,10 @@ supported: - randperm.generator_out - _amp_foreach_non_finite_check_and_unscale_ - _amp_update_scale_ + - copysign.out + - copysign.Tensor + - copysign_.Tensor + - count_nonzero.dim_IntList - conj_physical.out - conj_physical_ - ceil From 34f00adaf0e0588994699eaa3d69944e0f567750 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Fri, 12 Jul 2024 14:51:32 +0800 Subject: [PATCH 6/6] enhance ci/nighlty workflow (#558) 1. 
apply PRs which closed but not merged --- .../ci_expected_accuracy/check_expected.py | 14 ++-- .github/scripts/apply_torch_pr.py | 64 +++++++++++++------ 2 files changed, 52 insertions(+), 26 deletions(-) diff --git a/.github/ci_expected_accuracy/check_expected.py b/.github/ci_expected_accuracy/check_expected.py index 060d83f1f..5339f4ce3 100644 --- a/.github/ci_expected_accuracy/check_expected.py +++ b/.github/ci_expected_accuracy/check_expected.py @@ -48,9 +48,9 @@ passed_models.append([model_name, test_accuracy]) if refer_accuracy == "N/A": new_models.append([model_name, test_accuracy]) - refer_data.loc[refer_data.tail(1).index.tolist()[0] + 1,:] = "N/A" - refer_data.at[refer_data.tail(1).index, "name"] = model_name - refer_data.at[refer_data.tail(1).index, args.dtype] = test_accuracy + refer_data.loc[len(refer_data),:] = "N/A" + refer_data.at[len(refer_data) - 1, "name"] = model_name + refer_data.at[len(refer_data) - 1, args.dtype] = test_accuracy elif 'pass' not in refer_accuracy: new_pass_models.append([model_name, test_accuracy]) refer_data.at[refer_row[0], args.dtype] = test_accuracy @@ -58,9 +58,9 @@ if refer_accuracy == "N/A": new_models.append([model_name, test_accuracy]) real_failed_models.append([model_name, test_accuracy]) - refer_data.loc[refer_data.tail(1).index.tolist()[0] + 1,:] = "N/A" - refer_data.at[refer_data.tail(1).index, "name"] = model_name - refer_data.at[refer_data.tail(1).index, args.dtype] = test_accuracy + refer_data.loc[len(refer_data),:] = "N/A" + refer_data.at[len(refer_data) - 1, "name"] = model_name + refer_data.at[len(refer_data) - 1, args.dtype] = test_accuracy elif "pass" in refer_accuracy: real_failed_models.append([model_name, test_accuracy]) else: @@ -80,7 +80,7 @@ print("Pass rate: {:.2f}%".format(len(passed_models) / len(model_names) * 100)) if len(new_pass_models + new_models) > 0: - print("NOTE: New models result, please update the reference", new_pass_models) + print("NOTE: New models result, please update the reference", new_pass_models, new_models) if args.update: refer_data.to_csv(refer_file, sep=',', encoding='utf-8', index=False) print("Updated. 
Now, confirm the changes to .csvs and `git add` them if satisfied.") diff --git a/.github/scripts/apply_torch_pr.py b/.github/scripts/apply_torch_pr.py index d0ab9a163..9ef238abb 100644 --- a/.github/scripts/apply_torch_pr.py +++ b/.github/scripts/apply_torch_pr.py @@ -25,7 +25,7 @@ # check reverted PR is in current code base or not def check_reverted_reopen(pr_info): - git_cmd = "git log nightly -n 1 2>&1 |grep 'nightly release' |head -1 |sed 's/.*(//;s/).*//' || git rev-parse HEAD" + git_cmd = "((git log -n 1 2>&1 |grep 'nightly release' |head -1 |sed 's/.*(//;s/).*//' || true) && git rev-parse HEAD) |head -n 1" git_info = subprocess.Popen(git_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) main_commit = git_info.communicate()[0].decode("utf-8").replace("\n", "") revert_cmd = "cur_cmt=$(git rev-parse HEAD) && git fetch origin main > /dev/null 2>&1 && " + \ @@ -40,6 +40,39 @@ def check_reverted_reopen(pr_info): reverted = False return reverted +def check_merged(pr_info): + git_cmd = "((git log -n 1 2>&1 |grep 'nightly release' |head -1 |sed 's/.*(//;s/).*//' || true) && git rev-parse HEAD) |head -n 1" + git_info = subprocess.Popen(git_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) + main_commit = git_info.communicate()[0].decode("utf-8").replace("\n", "") + merge_cmd = "cur_cmt=$(git rev-parse HEAD) && git fetch origin main > /dev/null 2>&1 && " + \ + "git checkout " + main_commit + " > /dev/null 2>&1 && " + \ + "git log |grep 'resolved: " + pr_info["html_url"] + "' || true && " + \ + "git checkout $cur_cmt > /dev/null 2>&1" + merge_info = subprocess.Popen(merge_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) + merge_msg = merge_info.communicate()[0].decode("utf-8") + if "resolved: " + pr_info["html_url"] in merge_msg: + merged = True + else: + merged = False + return merged + +def appyly_pr(pr_info, re_apply_msg): + # get pr diff + pr_file = pr_info["diff_url"].split("/")[-1] + urllib.request.urlretrieve(pr_info["diff_url"], pr_file) + # apply diff + apply_cmd = "git apply --3way " + pr_file + " && rm -f " + pr_file + apply_info = subprocess.Popen(apply_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) + apply_message = apply_info.communicate()[0].decode("utf-8") + apply_status = apply_info.returncode + # apply status + if apply_status == 0: + print("{} {}, applied got SUCCESSFUL".format(pr_info["diff_url"], re_apply_msg)) + else: + print("{} {}, applied got FAILED".format(pr_info["diff_url"], apply_message)) + print(apply_status, apply_message) + exit(1) + # headers = {'Authorization': 'Bearer ' + args.token} if args.token != None else args.token pr_list = args.pr_list + args.extra_pr_list @@ -53,7 +86,7 @@ def check_reverted_reopen(pr_info): if pr_info["state"].lower() == "open": # for reverted PR reverted_id = next((item["id"] for item in pr_info["labels"] if item["name"] == "Reverted"), -1) - re_apply_msg = "" + re_apply_msg = "is opened" if reverted_id != -1: reverted = check_reverted_reopen(pr_info) # skip if PR not reverted but re-open in current code base @@ -61,24 +94,17 @@ def check_reverted_reopen(pr_info): print("{} is re-open but not reverted, no need to apply".format(pr_info["diff_url"])) continue else: - re_apply_msg = "is re-opened & reverted," - # get pr diff - pr_file = pr_info["diff_url"].split("/")[-1] - urllib.request.urlretrieve(pr_info["diff_url"], pr_file) - # apply diff - apply_cmd = "git apply --3way " + pr_file + " && rm -f " + pr_file - apply_info = subprocess.Popen(apply_cmd, 
stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) - apply_message = apply_info.communicate()[0].decode("utf-8") - apply_status = apply_info.returncode - # apply status - if apply_status == 0: - print("{} {} applied got SUCCESSFUL".format(pr_info["diff_url"], re_apply_msg)) - else: - print("{} applied got FAILED".format(pr_info["diff_url"])) - print(apply_status, apply_message) - exit(1) + re_apply_msg = "is re-opened and reverted," + appyly_pr(pr_info, re_apply_msg) elif pr_info["state"].lower() == "closed": - print("{} is ClOSED, no need to apply".format(pr_info["diff_url"])) + merged_id = next((item["id"] for item in pr_info["labels"] if item["name"] == "Merged"), -1) + re_apply_msg = "is closed but not merged" + if merged_id != -1: + merged = check_merged(pr_info) + if merged: + print("{} is closed and merged, no need to apply".format(pr_info["diff_url"])) + continue + appyly_pr(pr_info, re_apply_msg) else: print("{} is {}, no need to apply".format(pr_info["diff_url"], pr_info["state"])) exit(1)
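
The closed-but-merged detection above keys off the `resolved: <PR URL>` trailer that the PyTorch merge bot records in mainline commit messages. A minimal standalone sketch of that check (the helper name is illustrative, and it assumes a local clone whose history contains those merge commits):

    import subprocess

    def pr_resolved_in_history(pr_url: str) -> bool:
        # Search the checked-out history for the merge-bot trailer "resolved: <PR URL>".
        log = subprocess.run(
            ["git", "log", "--grep", "resolved: " + pr_url, "-n", "1", "--oneline"],
            capture_output=True, text=True, check=False,
        ).stdout
        return bool(log.strip())

    # e.g. pr_resolved_in_history("https://github.com/pytorch/pytorch/pull/<number>")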