diff --git a/.github/actions/inductor-xpu-e2e-test/action.yml b/.github/actions/inductor-xpu-e2e-test/action.yml index 4f81534fd..4e1fce079 100644 --- a/.github/actions/inductor-xpu-e2e-test/action.yml +++ b/.github/actions/inductor-xpu-e2e-test/action.yml @@ -51,7 +51,7 @@ runs: shell: bash run: | source activate e2e_ci - source .github/scripts/env.sh + source .github/scripts/env.sh ${{ inputs.pytorch }} if [[ ${{ inputs.suite }} == *"torchbench"* ]]; then if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then cd ../ && rm -rf audio && git clone --single-branch -b main https://github.com/pytorch/audio.git @@ -94,7 +94,7 @@ runs: shell: bash run: | source activate e2e_ci - source .github/scripts/env.sh + source .github/scripts/env.sh ${{ inputs.pytorch }} cp .github/scripts/inductor_xpu_test.sh ../pytorch cd ../pytorch diff --git a/.github/ci_expected_accuracy/check_expected.py b/.github/ci_expected_accuracy/check_expected.py index 48543c930..6d321e34b 100644 --- a/.github/ci_expected_accuracy/check_expected.py +++ b/.github/ci_expected_accuracy/check_expected.py @@ -16,14 +16,14 @@ # load csv files -test_data= pd.read_csv(args.csv_file) +test_data= pd.read_csv(args.csv_file, comment='#') # test_data = test_data.reset_index() # make sure indexes pair with number of rows # test_data = test_data.sort_values(by=["name"], ascending=True) test_names = [row["name"] for index, row in test_data.iterrows()] current_path = pathlib.Path(__file__).parent.resolve() refer_file = str(current_path) + "/" + args.category + "_" + args.suite + "_" + args.mode + ".csv" -refer_data= pd.read_csv(refer_file) +refer_data= pd.read_csv(refer_file, comment='#') # refer_data = refer_data.reset_index() # make sure indexes pair with number of rows # refer_data = refer_data.sort_values(by=["name"], ascending=True) refer_names = [row["name"] for index, row in refer_data.iterrows()] diff --git a/.github/ci_expected_accuracy/inductor_huggingface_training.csv b/.github/ci_expected_accuracy/inductor_huggingface_training.csv index a75d3d225..e2d5645e2 100644 --- a/.github/ci_expected_accuracy/inductor_huggingface_training.csv +++ b/.github/ci_expected_accuracy/inductor_huggingface_training.csv @@ -13,7 +13,8 @@ CamemBert,pass,pass,pass,pass,pass DebertaForMaskedLM,pass,pass,pass,pass,pass DebertaForQuestionAnswering,pass,pass,pass,pass,pass DebertaV2ForMaskedLM,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip -DebertaV2ForQuestionAnswering,pass,pass,pass,pass,pass +# Skip DebertaV2ForQuestionAnswering issue: https://github.com/intel/torch-xpu-ops/issues/1216 +DebertaV2ForQuestionAnswering,fail_accuracy,fail_accuracy,fail_accuracy,pass,pass DistilBertForMaskedLM,pass,pass,pass,pass,pass DistilBertForQuestionAnswering,pass,pass,pass,pass,pass DistillGPT2,pass,pass,pass,pass,pass diff --git a/.github/ci_expected_accuracy/inductor_torchbench_inference.csv b/.github/ci_expected_accuracy/inductor_torchbench_inference.csv index 4825aa41f..832923854 100644 --- a/.github/ci_expected_accuracy/inductor_torchbench_inference.csv +++ b/.github/ci_expected_accuracy/inductor_torchbench_inference.csv @@ -102,5 +102,6 @@ torch_multimodal_clip,pass,pass,pass,eager_fail_to_run,eager_fail_to_run tts_angular,pass,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run vgg16,pass,pass,pass,pass,pass vision_maskrcnn,pass,pass,pass,eager_fail_to_run,eager_fail_to_run -yolov3,pass,pass,pass,pass,pass +# Skip yolov3 for known torchbench issue: https://github.com/intel/torch-xpu-ops/issues/1229 
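For context, the `comment='#'` argument added to `pd.read_csv` in check_expected.py is what allows annotation rows like the one above to live inside the expected-accuracy CSVs without breaking parsing. A minimal sketch follows; the column set and file contents are illustrative, not the real files:

    import io
    import pandas as pd

    # Hypothetical miniature of an expected-accuracy CSV with an annotation row.
    csv_text = """name,float32,bfloat16
    # Skip yolov3 for a known torchbench issue
    yolov3,eager_fail_to_run,eager_fail_to_run
    vgg16,pass,pass
    """

    # Everything after '#' is ignored, so a row that begins with '#' is dropped.
    data = pd.read_csv(io.StringIO(csv_text), comment='#')
    print(data["name"].tolist())  # ['yolov3', 'vgg16']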
+yolov3,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run hf_Roberta_base,pass,pass,pass,pass,pass diff --git a/.github/ci_expected_accuracy/inductor_torchbench_training.csv b/.github/ci_expected_accuracy/inductor_torchbench_training.csv index dc766eac0..36a646a14 100644 --- a/.github/ci_expected_accuracy/inductor_torchbench_training.csv +++ b/.github/ci_expected_accuracy/inductor_torchbench_training.csv @@ -102,5 +102,6 @@ torch_multimodal_clip,pass,pass,pass,eager_fail_to_run,eager_fail_to_run tts_angular,pass,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run vgg16,pass,pass,pass,pass,pass vision_maskrcnn,pass,pass,pass,eager_fail_to_run,eager_fail_to_run -yolov3,pass,pass,pass,pass,pass +# Skip yolov3 for known torchbench issue: https://github.com/intel/torch-xpu-ops/issues/1229 +yolov3,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run hf_Roberta_base,pass,pass,pass,pass,pass diff --git a/.github/scripts/apply_torch_pr.py b/.github/scripts/apply_torch_pr.py index b4b441263..bbe89ed7d 100644 --- a/.github/scripts/apply_torch_pr.py +++ b/.github/scripts/apply_torch_pr.py @@ -12,9 +12,7 @@ # Fallback to CPU for XPU FP64 "https://github.com/pytorch/pytorch/pull/126516", # Modify the tolerance level in TIMM benchmark - "https://github.com/pytorch/pytorch/pull/129735", - # [XPU] Update XPU C Shim Header - "https://github.com/pytorch/pytorch/pull/141086", + "https://github.com/pytorch/pytorch/pull/143739", ] ) parser.add_argument('--extra-pr-list', '-e', nargs='+',default=[]) @@ -59,7 +57,7 @@ def appyly_pr(pr_info, re_apply_msg): pr_file = pr_info["diff_url"].split("/")[-1] urllib.request.urlretrieve(pr_info["diff_url"], pr_file) # apply diff - apply_cmd = "git apply --3way " + pr_file + " && rm -f " + pr_file + apply_cmd = "git apply --3way " + pr_file apply_info = subprocess.Popen(apply_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) apply_message = apply_info.communicate()[0].decode("utf-8") apply_status = apply_info.returncode diff --git a/.github/scripts/env.sh b/.github/scripts/env.sh index 56d8e3930..9cfd67477 100644 --- a/.github/scripts/env.sh +++ b/.github/scripts/env.sh @@ -1,4 +1,11 @@ #!/bin/bash -source /opt/intel/oneapi/compiler/latest/env/vars.sh -source /opt/intel/oneapi/umf/latest/env/vars.sh -source /opt/intel/oneapi/pti/latest/env/vars.sh + +if [ "$1" != "nightly_wheel" ];then + source /opt/intel/oneapi/compiler/latest/env/vars.sh + source /opt/intel/oneapi/umf/latest/env/vars.sh + source /opt/intel/oneapi/pti/latest/env/vars.sh + source /opt/intel/oneapi/ccl/latest/env/vars.sh + source /opt/intel/oneapi/mpi/latest/env/vars.sh +else + echo "Don't need to source DL-Essential for nightly wheel" +fi diff --git a/.github/scripts/inductor_summary.py b/.github/scripts/inductor_summary.py index e11147664..8edd8970c 100644 --- a/.github/scripts/inductor_summary.py +++ b/.github/scripts/inductor_summary.py @@ -3,6 +3,8 @@ import pandas as pd from scipy.stats import gmean from styleframe import StyleFrame, Styler, utils +import numpy as np +from openpyxl import Workbook parser = argparse.ArgumentParser(description="Generate report") parser.add_argument('-s', '--suite', default=["huggingface"], nargs='*', type=str, help='model suite name') @@ -665,6 +667,73 @@ def update_summary(excel, scenario, suite): sf.set_row_height(j, 30) sf.to_excel(sheet_name=suite + '_' + scenario + '_Summary', excel_writer=excel) +def summary_conclusion(scenario, excel): + 
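+    # Collect every per-suite "*_Summary" sheet for the given scenario, stack
+    # them side by side with np.hstack, copy the comparison columns from the
+    # extra suites back into the first block, and write the merged table to a
+    # single "Perf_Summary" / "Acc_Summary" sheet.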
excel.book.save(excel) + df = pd.read_excel(excel, sheet_name = None, header = None) + #df = pd.DataFrame(excel) + if scenario == 'performance': + sheet_names = list(df.keys()) + sheet_names = [s for s in sheet_names if 'Summary' in s and 'performance' in s] + sheet_names.sort() + print(f"Merge excel as below:\n{sheet_names}") + print("\n") + features = [[]] * 21 + for sheet_name in sheet_names: + df_sheet = df[sheet_name] + df_sheet = df_sheet.values + features = np.hstack((features, df_sheet)) + + if len(sheet_names) == 1: + print("sheet not merge") + elif len(sheet_names) == 2: + print("2 sheets merge") + if 'huggingface' in sheet_names[0]: + features[:, 4:5] = features[:, 14:15] + features[:, 6:7] = features[:, 16:17] + else: + features[:, 4:5] = features[:, 14:15] + else: + print("3 sheets merge") + features[:, 4:5] = features[:, 24:25] + features[:, 6:7] = features[:, 16:17] + + df_concat = StyleFrame(pd.DataFrame(features).iloc[:,:10]) + for i in range(10): + df_concat.set_column_width(i, 22) + for j in range(1, 23): + df_concat.set_row_height(j, 30) + df_concat.to_excel(sheet_name='Perf_Summary', excel_writer=excel, index=False) + else: + sheet_names = list(df.keys()) + sheet_names = [s for s in sheet_names if 'Summary' in s and 'accuracy' in s] + sheet_names.sort() + print(f"Merge excel as below:\n{sheet_names}") + print("\n") + features = [[]] * 11 + for sheet_name in sheet_names: + df_sheet = df[sheet_name] + df_sheet = df_sheet.values + features = np.hstack((features, df_sheet)) + if len(sheet_names) == 1: + print("sheet not merge") + elif len(sheet_names) == 2: + print("2 sheets merge") + if 'huggingface' in sheet_names[0]: + features[:, 3:4] = features[:, 12:13] + features[:, 5:6] = features[:, 14:15] + else: + features[:, 3:4] = features[:, 12:13] + else: + print("3 sheets merge") + features[:, 3:4] = features[:, 21:22] + features[:, 5:6] = features[:, 14:15] + + df_concat = StyleFrame(pd.DataFrame(features).iloc[:,:9]) + for i in range(10): + df_concat.set_column_width(i, 22) + for j in range(1, 13): + df_concat.set_row_height(j, 30) + df_concat.to_excel(sheet_name='Acc_Summary', excel_writer=excel, index=False) def generate_report(excel, scenario_list, precision_list, mode_list, suite_list): for sc in scenario_list: @@ -693,8 +762,19 @@ def excel_postprocess(file, scenario, precison, mode, suite): wdt.merge_cells(start_row=1, end_row=1, start_column=13, end_column=16) wb.save(file) + if len(scenario) == 2: + wb.move_sheet("Perf_Summary", -(len(wb.worksheets)-1)) + wb.move_sheet("Acc_Summary", -(len(wb.worksheets)-1)) + elif len(scenario) == 1 and sc == 'accuracy': + wb.move_sheet("Acc_Summary", -(len(wb.worksheets)-1)) + else: + wb.move_sheet("Perf_Summary", -(len(wb.worksheets)-1)) + if __name__ == '__main__': excel = StyleFrame.ExcelWriter('inductor_log/Inductor_E2E_Test_Report.xlsx') generate_report(excel, args.scenario, args.precision, args.mode, args.suite) + for sc in args.scenario: + summary_conclusion(sc, excel) excel_postprocess(excel, args.scenario, args.precision, args.mode, args.suite) + excel.close() diff --git a/.github/scripts/spec.py b/.github/scripts/spec.py new file mode 100644 index 000000000..b8bf6d59a --- /dev/null +++ b/.github/scripts/spec.py @@ -0,0 +1,7 @@ +import torch + +DEVICE_NAME = 'xpu' + +MANUAL_SEED_FN = torch.xpu.manual_seed +EMPTY_CACHE_FN = torch.xpu.empty_cache +DEVICE_COUNT_FN = torch.xpu.device_count diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml new file mode 100644 index 
000000000..b21864e9b --- /dev/null +++ b/.github/workflows/_linux_transformers.yml @@ -0,0 +1,355 @@ +name: Linux Transformers Test + +on: + pull_request: + branches: + - main + paths: + - '.github/scripts/spec.py' + - '.github/workflows/_linux_transformers.yml' + workflow_dispatch: + inputs: + pytorch: + required: false + type: string + default: 'nightly' + description: Pytorch branch/commit + python: + required: false + type: string + default: '3.10' + description: Python version + runner: + required: true + type: string + default: 'linux.idc.xpu' + description: Runner label + driver: + required: false + type: string + default: 'lts' + description: Driver lts/rolling + nightly_whl: + required: false + type: string + default: '' + description: Pytorch nightly wheel version + transformers: + required: false + type: string + default: 'v4.47.0' + description: Transformers version + +permissions: read-all + +jobs: + Torch-XPU-Transformers-Tests: + runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }} + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + python: ${{ inputs.python != '' && inputs.python || '3.10' }} + pytorch: ${{ inputs.pytorch != '' && inputs.pytorch || 'nightly' }} + transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.47.0' }} + PYTORCH_DEBUG_XPU_FALLBACK: '1' + TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py' + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + with: + path: torch-xpu-ops + - name: Checkout Transformers + uses: actions/checkout@v4 + with: + repository: huggingface/transformers + ref: ${{ env.transformers }} + path: transformers + - name: Prepare OS environment + run: | + sudo apt-get update + sudo apt-get install -y \ + espeak-ng \ + git-lfs \ + pkg-config \ + libavcodec-dev \ + libavdevice-dev \ + libavfilter-dev \ + libavformat-dev \ + libavutil-dev \ + libswresample-dev \ + libswscale-dev + git lfs install + - name: Prepare Conda ENV + run: | + which conda && conda clean -ay + conda remove --all -y -n huggingface_transformers_test || rm -rf $(dirname ${CONDA_EXE})/../envs/huggingface_transformers_test + conda create -y -n huggingface_transformers_test python=${{ env.python }} + source activate huggingface_transformers_test + - name: Prepare Stock XPU Pytorch + run: | + pwd + source activate huggingface_transformers_test + if [ -z "${{ inputs.nightly_whl }}" ]; then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + else + pip install torch==$(echo ${{ inputs.nightly_whl }}) torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + fi + - name: Prepare Transformers + run: | + pwd + source activate huggingface_transformers_test + cd transformers + pip install -e . 
+ pip install -e ".[dev-torch,testing,video]" + rm -rf tests_log && mkdir -p tests_log + rm -rf reports + cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./ + - name: Report installed versions + run: | + source activate huggingface_transformers_test + echo "pip installed packages:" + pip list | tee ${{ github.workspace }}/transformers/tests_log/pip_list.txt + echo "lspci gpu devices:" + lspci -d ::0380 | tee ${{ github.workspace }}/transformers/tests_log/lspci_0380.txt + echo "GPU render nodes:" + cat /sys/class/drm/render*/device/device | tee ${{ github.workspace }}/transformers/tests_log/device_IDs.txt + echo "xpu-smi output:" + xpu-smi discovery -y --json --dump -1 + - name: Sanitry check installed packages + run: | + source activate huggingface_transformers_test + # These checks are to exit earlier if for any reason Transformers + # reinstalled torch packages back to CUDA versions (not expected). + pip show torch | grep Version | grep xpu + pip show torchaudio | grep Version | grep xpu + pip show torchvision | grep Version | grep xpu + python -c 'import torch; exit(not torch.xpu.is_available())' + - name: Run -k backbone tests + env: + TEST_CASE: 'tests_backbone' + run: | + source activate huggingface_transformers_test + cd transformers + python3 -m pytest -rsf --make-reports=$TEST_CASE -k backbone tests || \ + (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV) + - name: Run tests/*.py + env: + TEST_CASE: 'tests_py' + run: | + source activate huggingface_transformers_test + cd transformers + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/*.py || true + - name: Run tests/benchmark + env: + TEST_CASE: 'tests_benchmark' + run: | + source activate huggingface_transformers_test + cd transformers + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/benchmark || true + - name: Run tests/generation + env: + TEST_CASE: 'tests_generation' + run: | + source activate huggingface_transformers_test + cd transformers + # Excluding tests due to: + # * torch.distributed.* not yet supported by XPU + pattern="not TestFSDPGeneration" + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/generation -k "$pattern" || true + - name: Run tests/models + env: + TEST_CASE: 'tests_models' + run: | + source activate huggingface_transformers_test + cd transformers + # Excluding tests due to: + # * https://github.com/huggingface/transformers/issues/35252 (CUDA specific tests) + # * https://github.com/pytorch/pytorch/issues/140965 (aten::_linalg_eigvals) + pattern=" \ + not test_model_parallelization and \ + not test_model_parallel_equal_results and \ + not test_resize_embeddings_untied and \ + not test_resize_tokens_embeddings" + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/models -k "$pattern" || true + - name: Run tests/pipelines + env: + TEST_CASE: 'tests_pipelines' + run: | + source activate huggingface_transformers_test + cd transformers + # Some tests are known to fail w/o clear pattern + # TODO: drop ||true after triage and fixes + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/pipelines || true + - name: Run tests/trainer + env: + TEST_CASE: 'tests_trainer' + run: | + source activate huggingface_transformers_test + cd transformers + # Excluding tests due to: + # * Some ray tests hang, reason unknown + # * torch.distributed.* not yet supported by XPU + pattern=" \ + not ray and \ + not TestTrainerDistributed and \ + not TestTrainerDistributedXPU and \ + not TestFSDPTrainer" + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/trainer -k 
"$pattern" || \ + (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV) + - name: Run tests/utils + env: + TEST_CASE: 'tests_utils' + run: | + source activate huggingface_transformers_test + cd transformers + # Excluding tests due to: + # * Network proxy connection issue, reason unknown + pattern="not test_load_img_url_timeout" + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/utils -k "$pattern" || \ + (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV) + - name: Check for errors in tests + run: | + FAILED_CASES=$(echo $FAILED_CASES | sed 's/^,//') + echo "Failed cases: [$(echo $FAILED_CASES | sed 's/,/, /g')]" + test -z "$FAILED_CASES" + - name: Print results table + if: ${{ ! cancelled() }} + run: | + # Helper function to return number preceeding given pattern, i.e: + # === 25 failed, 11 warnings, 0 errors === + # Call as follows: + # parse_stat $line "failed" + function parse_stat() { + stat=$(cat $1 | grep $2 | sed "s/.* \([0-9]*\) $2.*/\1/") + if [ -n "$stat" ]; then echo $stat; else echo "0"; fi + } + cd transformers + { + echo "### Results" + echo "| Test group | Errors | Failed | Deselected | Passed | Skipped |" + echo "| --- | --- | --- | --- | --- | --- |" + for stat in $(find reports -name stats.txt); do + # Each stat.txt is located in: reports/$test_group/stats.txt + test_group=$(echo $stat | cut -f 2 -d/) + # Get failed, passed, skipped, etc. counters + failed=$(parse_stat $stat failed) + passed=$(parse_stat $stat passed) + deselected=$(parse_stat $stat deselected) + skipped=$(parse_stat $stat skipped) + warnings=$(parse_stat $stat warnings) + errors=$(parse_stat $stat errors) + echo "| $test_group | $errors | $failed | $deselected | $passed | $skipped |" + done + } >> $GITHUB_STEP_SUMMARY + - name: Print failure lines + if: ${{ ! cancelled() }} + run: | + cd transformers + { + echo "### Failure lines" + echo "| Test group |File | Error | Comment |" + echo "| --- | --- | --- | --- |" + rm -rf _failures.txt + for failure in $(find reports -name failures_line.txt); do + # Each failure_line.txt is located in: reports/$test_group/failure_line.txt + test_group=$(echo $failure | cut -f2 -d/) + tail -n +2 $failure | sed "s/^/$test_group /" >> _failures.txt + done + # failures_line.txt file does not have test case information, + # so we can just sort the output and report uniq values + sort _failures.txt | uniq > _failures_uniq.txt + while read line; do + test_group=$(echo $line | cut -f1 -d" ") + file=$(echo $line | cut -f2 -d" " | sed "s/\(.*\):$/\1/") + error=$(echo $line | cut -f3 -d" " | sed "s/\(.*\):$/\1/") + # Failure comments often contain special characters which complicate + # parsing failure lines. But fortunately we know for sure where comments + # start. So we just output all contents starting from this position and + # wrap everything in
<pre> to avoid collisions with Markdown formatting.
+              comment="<pre>$(echo $line | cut -f4- -d' ' | sed 's/\(.*\):$/\1/')</pre>
" + echo "| $test_group | $file | $error | $comment |" + done <_failures_uniq.txt + } >> $GITHUB_STEP_SUMMARY + - name: Print not implemented XPU backend ops + run: | + cd transformers + { + echo "### Not implemented ops" + echo "| Test group | Operator | Status |" + echo "| --- | --- | --- |" + rm -rf _ops.txt && touch _ops.txt + for log in $(find reports -name failures_line.txt); do + # Each failure_line.txt is located in: reports/$test_group/failure_line.txt + test_group=$(echo $log | cut -f2 -d/) + ops=$(grep NotImplementedError $log | grep "for the XPU device" | sed "s/.*The operator '\(.*\)' is not.*/\1/") + for op in $ops; do + echo "| $test_group |
<pre>$op</pre>
| not implemented |" >> _ops.txt + done + done + for log in $(find reports -name warnings.txt); do + # Each warnings.txt is located in: reports/$test_group/warnings.txt + test_group=$(echo $log | cut -f2 -d/) + ops=$(grep UserWarning $log | grep "on the XPU backend" | sed "s/.*The operator '\(.*\) on the XPU.*/\1/") + for op in $ops; do + echo "| $test_group |
<pre>$op</pre>
| fallback to CPU happens |" >> _ops.txt + done + done + sort _ops.txt | uniq + } >> $GITHUB_STEP_SUMMARY + - name: Print annotations + if: ${{ ! cancelled() }} + run: | + source activate huggingface_transformers_test + { + echo "### Annotations" + echo "| | |" + echo "| --- | --- |" + echo "| jobs.$GITHUB_JOB.versions.os | $(source /etc/os-release && echo $VERSION_ID) |" + echo "| jobs.$GITHUB_JOB.versions.linux-kernel | $(uname -r) |" + echo "| jobs.$GITHUB_JOB.versions.python | $(python --version | cut -f2 -d' ') |" + packages=" \ + level-zero \ + libigc1 \ + libigc2 \ + libze1 \ + libze-intel-gpu1 \ + intel-i915-dkms \ + intel-level-zero-gpu \ + intel-opencl-icd" + for package in $packages; do + package_version=$(dpkg -l | grep $package | grep ii | head -1 | sed "s/ */ /g" | cut -f3 -d" ") + echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |" + done + packages="accelerate \ + numpy \ + torch \ + torchaudio \ + torchvision \ + transformers" + for package in $packages; do + package_version=$(python -c "import $package; print($package.__version__)" || true) + echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |" + done + # printing annotations for GPU cards + var="[$(cat /sys/class/drm/render*/device/vendor || true)]" + echo "| jobs.$GITHUB_JOB.drm.render_nodes_vendor_ids | $(echo $var | sed 's/ /,/g') |" + var="[$(cat /sys/class/drm/render*/device/device || true)]" + echo "| jobs.$GITHUB_JOB.drm.render_nodes_device_ids | $(echo $var | sed 's/ /,/g') |" + var=$(python -c "import torch; print(torch.version.xpu)" || true) + echo "| jobs.$GITHUB_JOB.torch.version.xpu | $var |" + var=$(python -c "import torch; print(torch.xpu.device_count())" || true) + echo "| jobs.$GITHUB_JOB.torch.xpu.device_count | $var |" + # printing annotations with key environment variables + echo "| jobs.$GITHUB_JOB.env.ZE_AFFINITY_MASK | $ZE_AFFINITY_MASK |" + echo "| jobs.$GITHUB_JOB.env.NEOReadDebugKeys | $NEOReadDebugKeys |" + echo "| jobs.$GITHUB_JOB.env.PYTORCH_ENABLE_XPU_FALLBACK | $PYTORCH_ENABLE_XPU_FALLBACK |" + echo "| jobs.$GITHUB_JOB.env.PYTORCH_DEBUG_XPU_FALLBACK | $PYTORCH_DEBUG_XPU_FALLBACK |" + } >> $GITHUB_STEP_SUMMARY + - name: Upload Test log + if: ${{ ! cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Torch-XPU-Transformers-Log-${{ github.event.pull_request.number || github.sha }} + path: | + ${{ github.workspace }}/transformers/reports + ${{ github.workspace }}/transformers/tests_log diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index d2f717230..b724d4259 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -65,19 +65,21 @@ jobs: conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y source activate xpu_op_${ZE_AFFINITY_MASK} cd ../ && rm -rf pytorch - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }} |sed 's/^nightly_wheel$/nightly/') - # apply PRs for stock pytorch pip install requests - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" 
- else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + git clone https://github.com/pytorch/pytorch pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi fi - name: Triton Installation run: | @@ -96,15 +98,15 @@ jobs: - name: Build Pytorch XPU run: | source activate xpu_op_${ZE_AFFINITY_MASK} - source .github/scripts/env.sh + source .github/scripts/env.sh ${{ inputs.pytorch }} pip install mkl-static==2025.0.1 mkl-include==2025.0.1 - cd ../pytorch if [[ ${{ inputs.abi }} == '0' ]]; then export _GLIBCXX_USE_CXX11_ABI=0 else export _GLIBCXX_USE_CXX11_ABI=1 fi if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} pip install -r requirements.txt WERROR=1 python setup.py bdist_wheel @@ -112,12 +114,21 @@ jobs: git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. else pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(> "${GITHUB_ENV}" echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(> "${GITHUB_ENV}" + echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" diff --git a/.github/workflows/nightly_ondemand_rolling.yml b/.github/workflows/nightly_ondemand_rolling.yml index 0a27b2b50..7515c5003 100644 --- a/.github/workflows/nightly_ondemand_rolling.yml +++ b/.github/workflows/nightly_ondemand_rolling.yml @@ -63,7 +63,7 @@ permissions: read-all concurrency: group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.keep_torch_xpu_ops }}-${{ inputs.ut }}-${{ inputs.triton }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }} - cancel-in-progress: true + cancel-in-progress: ${{ github.event_name != 'schedule' }} jobs: Linux-Nightly-Ondemand-UT-Tests-Rolling: @@ -158,7 +158,7 @@ jobs: fi echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(> "${GITHUB_ENV}" + echo 
"TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" diff --git a/.github/workflows/nightly_ondemand_whl.yml b/.github/workflows/nightly_ondemand_whl.yml index 6b8d0b58f..a742f2b2d 100644 --- a/.github/workflows/nightly_ondemand_whl.yml +++ b/.github/workflows/nightly_ondemand_whl.yml @@ -53,11 +53,11 @@ permissions: read-all concurrency: group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.ut }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }} - cancel-in-progress: true + cancel-in-progress: ${{ github.event_name != 'schedule' }} jobs: Linux-Nightly-Ondemand-UT-WHL-Tests: - if: github.event_name == 'schedule' || ${{ inputs.ut_suite }} + if: github.event_name == 'schedule' || ${{ inputs.ut }} uses: ./.github/workflows/_linux_ut.yml with: ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu' || inputs.ut }} @@ -75,19 +75,20 @@ jobs: ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu' || inputs.ut }} python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} outputs: - TORCH_BRANCH_ID: ${{ steps.pinned.outputs.TORCH_BRANCH_ID }} - TORCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCH_COMMIT_ID }} - DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }} - KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }} - BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }} - OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }} - GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }} + TORCH_BRANCH_ID: ${{ steps.installed.outputs.TORCH_BRANCH_ID }} + TORCH_COMMIT_ID: ${{ steps.installed.outputs.TORCH_COMMIT_ID }} + TORCH_XPU_OPS_COMMIT: ${{ steps.installed.outputs.TORCH_XPU_OPS_COMMIT }} TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }} TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }} TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }} TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }} TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }} TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }} + DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }} + KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }} + BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }} + OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }} + GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }} TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }} steps: - name: Checkout torch-xpu-ops @@ -101,38 +102,43 @@ jobs: pip install mkl-static==2025.0.1 mkl-include==2025.0.1 pip install pandas scipy tqdm - name: Prepare Stock Pytorch + id: installed run: | pwd source activate e2e_ci - source .github/scripts/env.sh + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + echo "TORCH_BRANCH_ID=$(python -c 'import torch; print(torch.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + TORCH_COMMIT_ID=$(python -c 
'import torch; print(torch.version.git_version)') + echo "TORCH_COMMIT_ID=${TORCH_COMMIT_ID}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" cd ../ && rm -rf pytorch git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout $(echo ${{ env.pytorch }} |sed 's/^nightly_wheel$/nightly/') + cd pytorch && git checkout ${TORCH_COMMIT_ID} # apply PRs for stock pytorch pip install requests - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + # python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py git status && git show -s pip install -r requirements.txt - cd ../ - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_XPU_OPS_COMMIT=$(> "${GITHUB_ENV}" + rm -rf third_party/torch-xpu-ops + git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops + cd third_party/torch-xpu-ops + git checkout ${TORCH_XPU_OPS_COMMIT} - name: Identify pinned versions id: pinned run: | source activate e2e_ci - source .github/scripts/env.sh + echo "TORCHVISION_COMMIT_ID=$(python -c 'import torchvision; print(torchvision.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + echo "TORCHAUDIO_COMMIT_ID=$(python -c 'import torchaudio; print(torchaudio.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + echo "TRITON_COMMIT_ID=$(python -c 'import triton; print(triton.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" cd ../pytorch - echo "TRITON_COMMIT_ID=$(pip list |grep -w pytorch-triton-xpu |awk '{print $2}')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_BRANCH_ID=nightly" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=$(pip list |grep -w torch |awk '{print $2}')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(> "${GITHUB_ENV}" - echo "TORCHVISION_COMMIT_ID=$(pip list |grep -w torchvision |awk '{print $2}')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHAUDIO_COMMIT_ID=$(pip list |grep -w torchaudio |awk '{print $2}')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "DRIVER_VERSION=$(dkms status 2>&1 |grep 'intel-i915-dkms' |sed 's/.*\///;s/,.*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "BUNDLE_VERSION=$(dpcpp --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + echo "BUNDLE_VERSION=$(pip list |grep cmplr |head -n 1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" . 
/etc/os-release echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" @@ -271,6 +277,7 @@ jobs: repo="${{ github.repository }}" TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_BRANCH_ID }}" TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_COMMIT_ID }}" + TORCH_XPU_OPS_COMMIT="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_XPU_OPS_COMMIT }}" DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.DRIVER_VERSION }}" KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.KERNEL_VERSION }}" BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.BUNDLE_VERSION }}" @@ -307,7 +314,7 @@ jobs: fi # Test report echo -e "**${test_status}** $test_type WHL Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt - printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${GITHUB_WORKFLOW_SHA:0:7} on ${GITHUB_REF_NAME} | " >> ${{ github.workspace }}/report.txt + printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${TORCH_XPU_OPS_COMMIT:0:7} on pinned | " >> ${{ github.workspace }}/report.txt printf "[${TORCH_COMMIT_ID:0:7}](https://github.com/pytorch/pytorch/commit/${TORCH_COMMIT_ID:0:7}) on $TORCH_BRANCH_ID | " >> ${{ github.workspace }}/report.txt echo -e "[${TRITON_COMMIT_ID:0:7}](https://github.com/intel/intel-xpu-backend-for-triton/commit/${TRITON_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt printf "Transformers | Timm | Torchbench | Torchvision | Torchaudio\n--- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 53f93e629..fe6e428f5 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -90,7 +90,7 @@ jobs: cd ../pytorch echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}" echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(> "${GITHUB_ENV}" + echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}" echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}" echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}" echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}" @@ -144,9 +144,9 @@ jobs: run: | rm -rf ${{ github.workspace }}/upload_files cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files - failed_case=$(grep "Real failed: models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log |wc -l || true) + failed_case=$(grep "Real failed models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log |wc -l || true) if [ ${failed_case} -ne 0 ];then - grep -E "Real failed: models: [1-9]|Summary for" ${{ github.workspace }}/summary_accuracy.log + grep -E "Real failed models: [1-9]|Summary for" ${{ github.workspace }}/upload_files/summary_accuracy.log exit 1 fi - name: Upload Inductor XPU E2E Data diff --git a/src/ATen/native/transformers/Attention.cpp b/src/ATen/native/transformers/Attention.cpp index bb8b4602b..3090dfbee 100644 --- a/src/ATen/native/transformers/Attention.cpp +++ b/src/ATen/native/transformers/Attention.cpp @@ -93,36 +93,6 @@ static bool check_for_seq_len_1_nested_tensor( return true; } -int64_t _fused_sdp_choice_xpu( - const 
Tensor& query, - const Tensor& key, - const Tensor& value, - const std::optional& attn_mask_, - double dropout_p, - bool is_causal, - std::optional scale, - bool enable_gqa) { - // We have implemented efficient_attention backend with xetla, flash_attention - // backend is not supported now, which will be implemented in the future. So - // we provide two backends here. - sdp::sdp_params kernel_params{ - query, key, value, attn_mask_, dropout_p, is_causal, enable_gqa}; - // Because TORCHCHECK checks if condition is true we negate debug so that - // The statements will be printed when debug is true - bool print_debug = false; - sdp::SDPBackend backend = - sdp::can_use_mem_efficient_attention(kernel_params, print_debug) - ? sdp::SDPBackend::efficient_attention - : sdp::SDPBackend::math; - if (backend == sdp::SDPBackend::error) { - TORCH_CHECK( - false, - "No viable backend for scaled_dot_product_attention was found. ", - "This is likely due to turning off both the math kernel and the fused kernels."); - } - return static_cast(backend); -} - std::tuple native_multi_head_attention_xpu( const Tensor& query, const Tensor& key, @@ -204,8 +174,12 @@ std::tuple native_multi_head_attention_xpu( value.view({value.size(0), -1, num_head, dim_per_head}).transpose(1, 2); sdp::sdp_params kernel_params{q, k, v, mask, 0.0, false, false}; - auto backend = static_cast( - _fused_sdp_choice_xpu(q, k, v, mask, 0.0, false, {}, false)); + + sdp::SDPBackend backend = sdp::SDPBackend::math; + if (_fused_sdp_choice_stub.is_device_supported(q.device().type())) { + backend = static_cast(_fused_sdp_choice_stub( + q.device().type(), q, k, v, mask, 0.0, false, std::nullopt, false)); + } // strides from packed projection for nested tensors when seq_len is 1 will // be and will trigger a contiguous call in the kernel, so we prevent this diff --git a/src/ATen/native/transformers/SDPUtils.cpp b/src/ATen/native/transformers/SDPUtils.cpp index db4409493..eca5f9829 100644 --- a/src/ATen/native/transformers/SDPUtils.cpp +++ b/src/ATen/native/transformers/SDPUtils.cpp @@ -4,6 +4,8 @@ namespace sdp { +using c10::array_of; + bool check_all_tensors_on_device(sdp_params const& params, bool debug) { // Check that all tensors are on the GPU device // This should be handled by the stub dispatch, but whe call diff --git a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp index f0620c530..4a34e70d1 100644 --- a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp +++ b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp @@ -29,7 +29,7 @@ Tensor adaptive_avg_pool2d_backward_xpu( (input.ndimension() == 3 || input.ndimension() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); - globalContext().alertNotDeterministic("_adaptive_avg_pool2d_backward"); + globalContext().alertNotDeterministic("adaptive_avg_pool2d_backward_xpu"); Tensor grad_input; if (input.numel() != 0) { diff --git a/src/ATen/native/xpu/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/DilatedMaxPool2d.cpp index 600d29e85..a08227b47 100644 --- a/src/ATen/native/xpu/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/DilatedMaxPool2d.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -40,6 +41,62 @@ TORCH_IMPL_FUNC(max_pool2d_with_indices_out_xpu) bool ceil_mode, const Tensor& output, const Tensor& indices) { + const int kH = safe_downcast(kernel_size[0]); + const int kW = kernel_size.size() == 1 + ? 
kH + : safe_downcast(kernel_size[1]); + const int padH = safe_downcast(padding[0]); + const int padW = + padding.size() == 1 ? padH : safe_downcast(padding[1]); + + const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1; + const int64_t nInputPlane = input.size(-3); + const int64_t inputHeight = input.size(-2); + const int64_t inputWidth = input.size(-1); + + const int64_t outputHeight = output.size(-2); + const int64_t outputWidth = output.size(-1); + if (outputHeight == 1 && outputWidth == 1 && inputHeight <= kH && + inputWidth <= kW && padH == 0 && padW == 0) { + auto smf = input.suggest_memory_format(); + Tensor input_ = input.contiguous(smf); + bool is_3d = input.ndimension() == 3; + Tensor indices_, output_; + if (is_3d) { + indices_ = indices.contiguous(); + output_ = output.contiguous(); + } else { + indices_ = indices.contiguous(smf); + output_ = output.contiguous(smf); + } + if (!is_3d) { + input_.resize_({nbatch, nInputPlane, 1, inputHeight * inputWidth}, smf); + output_.resize_( + {nbatch, nInputPlane, 1, outputHeight * outputWidth}, smf); + indices_.resize_( + {nbatch, nInputPlane, 1, outputHeight * outputWidth}, smf); + at::max_outf(input_, 3, true, output_, indices_); + } else { + at::max_outf(input_, 2, true, output_, indices_); + } + + if (!is_3d) { + input_.resize_({nbatch, nInputPlane, inputHeight, inputWidth}, smf); + output_.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, smf); + indices_.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, smf); + } + + if ((is_3d && !indices.is_contiguous()) || + (!is_3d && !indices.is_contiguous(smf))) { + indices.copy_(indices_); + } + + if ((is_3d && !output.is_contiguous()) || + (!is_3d && !output.is_contiguous(smf))) { + output.copy_(output_); + } + return; + } xpu::max_pool2d_with_indices_kernel( input, kernel_size, diff --git a/src/ATen/native/xpu/RNN.cpp b/src/ATen/native/xpu/RNN.cpp new file mode 100644 index 000000000..74152f293 --- /dev/null +++ b/src/ATen/native/xpu/RNN.cpp @@ -0,0 +1,46 @@ +#include +#include + +namespace at::native { + +std::tuple _thnn_fused_lstm_cell_xpu( + const Tensor& input_gates, + const Tensor& hidden_gates, + const Tensor& cx, + const std::optional& input_bias_opt, + const std::optional& hidden_bias_opt) { + return native::xpu::_thnn_fused_lstm_cell_kernel( + input_gates, hidden_gates, cx, input_bias_opt, hidden_bias_opt); +} + +std::tuple _thnn_fused_lstm_cell_backward_xpu( + const std::optional& grad_hy_opt, + const std::optional& grad_cy_opt, + const Tensor& cx, + const Tensor& cy, + const Tensor& workspace, + bool has_bias) { + return native::xpu::_thnn_fused_lstm_cell_backward_kernel( + grad_hy_opt, grad_cy_opt, cx, cy, workspace, has_bias); +} + +std::tuple _thnn_fused_gru_cell_xpu( + const Tensor& input_gates, + const Tensor& hidden_gates, + const Tensor& hx, + const std::optional& input_bias, + const std::optional& hidden_bias) { + return native::xpu::_thnn_fused_gru_cell_kernel( + input_gates, hidden_gates, hx, input_bias, hidden_bias); +} + +std::tuple +_thnn_fused_gru_cell_backward_xpu( + const Tensor& grad_hy, + const Tensor& workspace, + bool has_bias) { + return native::xpu::_thnn_fused_gru_cell_backward_kernel( + grad_hy, workspace, has_bias); +} + +} // namespace at::native diff --git a/src/ATen/native/xpu/RreluWithNoise.cpp b/src/ATen/native/xpu/RreluWithNoise.cpp index f66833983..fb4e2c333 100644 --- a/src/ATen/native/xpu/RreluWithNoise.cpp +++ b/src/ATen/native/xpu/RreluWithNoise.cpp @@ -6,7 +6,7 @@ namespace native { Tensor& 
rrelu_with_noise_out_xpu( const Tensor& self, - const Tensor& noise, + Tensor& noise, const Scalar& lower, const Scalar& upper, bool training, @@ -18,7 +18,7 @@ Tensor& rrelu_with_noise_out_xpu( Tensor rrelu_with_noise_xpu( const Tensor& self, - const Tensor& noise, + Tensor& noise, const Scalar& lower, const Scalar& upper, bool training, @@ -30,7 +30,7 @@ Tensor rrelu_with_noise_xpu( Tensor& rrelu_with_noise_xpu_( Tensor& self, - const Tensor& noise, + Tensor& noise, const Scalar& lower, const Scalar& upper, bool training, diff --git a/src/ATen/native/xpu/SoftMax.cpp b/src/ATen/native/xpu/SoftMax.cpp index e816d48c8..f155165ce 100644 --- a/src/ATen/native/xpu/SoftMax.cpp +++ b/src/ATen/native/xpu/SoftMax.cpp @@ -76,6 +76,17 @@ TORCH_IMPL_FUNC(log_softmax_xpu_out) xpu::_log_softmax_kernel(input, dim, half_to_float, output); } +Tensor _safe_softmax_xpu( + const Tensor& self, + int64_t dim, + std::optional dtype) { + // TODO: uncomment after XPU softmax support half_to_float=true + // if (self.scalar_type() == ScalarType::Half && dtype == ScalarType::Float) + // return xpu::_safe_softmax_kernel(self, dim_, true); + Tensor converted = dtype.has_value() ? self.toType(dtype.value()) : self; + return xpu::_safe_softmax_kernel(converted, dim, false); +} + Tensor masked_softmax_xpu( const Tensor& input_, const Tensor& mask_, diff --git a/src/ATen/native/xpu/UpSampleBilinear2d.cpp b/src/ATen/native/xpu/UpSampleBilinear2d.cpp index ee8c37ac0..aec707193 100644 --- a/src/ATen/native/xpu/UpSampleBilinear2d.cpp +++ b/src/ATen/native/xpu/UpSampleBilinear2d.cpp @@ -30,6 +30,7 @@ TORCH_IMPL_FUNC(upsample_bilinear2d_backward_out_xpu) std::optional scales_h, std::optional scales_w, const Tensor& grad_input) { + globalContext().alertNotDeterministic("upsample_bilinear2d_backward_out_xpu"); xpu::upsample_bilinear2d_backward_out_kernel( grad_input, grad_output, diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 8492a98be..72f2aacdd 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -184,9 +184,7 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "_linalg_svd.U", "lu_unpack.out", "ormqr", - "_scaled_dot_product_efficient_attention", "_scaled_mm", - "_thnn_fused_gru_cell", "_to_sparse_csr", "triangular_solve.X", "_validate_compressed_sparse_indices", diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index e21c0160c..d94db11c9 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -5,9 +5,9 @@ #pragma GCC diagnostic ignored "-Wreturn-type" #include +#include #include #include -#include #include #include diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h index d530560e6..b07041fcb 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp index 57ac0d114..fb034f988 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp +++ b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp @@ -531,6 +531,8 @@ Tensor embedding_bag_backward_xpu_max( const Tensor& max_indices_t, int64_t num_weights, int64_t padding_idx) { + globalContext().alertNotDeterministic("embedding_bag_backward_xpu_max"); + auto max_indices = max_indices_t.contiguous(); auto grad_weight = 
at::zeros({num_weights, grad.size(1)}, grad.options()); int64_t stride = grad_weight.stride(0); diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp index d429ecfbe..bcbd50c42 100644 --- a/src/ATen/native/xpu/sycl/Indexing.cpp +++ b/src/ATen/native/xpu/sycl/Indexing.cpp @@ -207,7 +207,7 @@ void index_select_kernel( }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), - AT_EXPAND(AT_FLOAT8_TYPES), + AT_EXPAND(AT_FLOAT8_TYPES), kComplexHalf, kHalf, kBool, @@ -1081,7 +1081,8 @@ void take_kernel(TensorIterator& iter, const TensorBase& input) { canUse32BitIndexMath(input) ? ScalarType::Int : ScalarType::Long, "take_xpu_index", [&] { - const scalar_t* indexed_ptr = input.template const_data_ptr(); + const scalar_t* indexed_ptr = + input.template const_data_ptr(); TakeFunctor f(indexed_ptr); take_put_kernel_template(iter, input, f); }); @@ -1114,6 +1115,14 @@ void put_kernel( TensorIterator& iter, const TensorBase& output, const bool accumulate) { + // Nondeterministic when index contains duplicate entries and we do not + // accumulate If we accumulate on GPU, we use atomicGPUAdd, which is + // non-deterministic + if (!accumulate || + (accumulate && iter.tensor(1).device().type() == DeviceType::XPU)) { + at::globalContext().alertNotDeterministic("put_"); + } + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( at::ScalarType::BFloat16, at::ScalarType::Half, diff --git a/src/ATen/native/xpu/sycl/LerpKernels.cpp b/src/ATen/native/xpu/sycl/LerpKernels.cpp index 1648f193b..9d7551290 100644 --- a/src/ATen/native/xpu/sycl/LerpKernels.cpp +++ b/src/ATen/native/xpu/sycl/LerpKernels.cpp @@ -57,15 +57,29 @@ struct LerpScalarFunctor { opmath_t weight_val_; }; +void lerp_scalar_kernel( + at::TensorIteratorBase& iter, + const c10::Scalar& weight); + void lerp_tensor_kernel(at::TensorIteratorBase& iter) { auto dtype = iter.common_dtype(); if (at::isComplexType(dtype)) { AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "lerp_xpu", [&] { + if (iter.is_cpu_scalar(3)) { + auto weight_val = iter.scalar_value(3); + iter.remove_operand(3); + return lerp_scalar_kernel(iter, weight_val); + } gpu_kernel(iter, LerpTensorComplexFunctor()); }); } else { AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, dtype, "lerp_xpu", [&] { + if (iter.is_cpu_scalar(3)) { + auto weight_val = iter.scalar_value(3); + iter.remove_operand(3); + return lerp_scalar_kernel(iter, weight_val); + } gpu_kernel(iter, LerpTensorFunctor()); }); } diff --git a/src/ATen/native/xpu/sycl/LossCTCKernels.cpp b/src/ATen/native/xpu/sycl/LossCTCKernels.cpp index 9d26a48c7..3dd44968d 100644 --- a/src/ATen/native/xpu/sycl/LossCTCKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossCTCKernels.cpp @@ -1248,7 +1248,7 @@ Tensor ctc_loss_backward_kernel( bool zero_infinity) { // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage - globalContext().alertNotDeterministic("ctc_loss_backward_kernel"); + globalContext().alertNotDeterministic("ctc_loss_backward_xpu"); return AT_DISPATCH_FLOATING_TYPES( log_probs.scalar_type(), "ctc_loss_backward_xpu", [&] { if (targets.scalar_type() == kLong) { diff --git a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp index 4b93cb3c3..8b018de6b 100644 --- a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp @@ -186,7 +186,7 @@ void nll_loss2d_forward_kernel( int64_t reduction, int64_t ignore_index) { if (reduction != 
at::Reduction::None) { - at::globalContext().alertNotDeterministic("nll_loss2d_forward_kernel"); + at::globalContext().alertNotDeterministic("nll_loss2d_forward_xpu"); } total_weight.resize_({}); diff --git a/src/ATen/native/xpu/sycl/RNNKernels.cpp b/src/ATen/native/xpu/sycl/RNNKernels.cpp new file mode 100644 index 000000000..bad6bdf69 --- /dev/null +++ b/src/ATen/native/xpu/sycl/RNNKernels.cpp @@ -0,0 +1,968 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace at::native::xpu { + +using at::native::canUse32BitIndexMath; +using at::xpu::detail::getTensorInfo; +using at::xpu::detail::IndexToOffset; +using at::xpu::detail::TensorInfo; + +std::tuple rnn_get_launch_config( + int64_t max_threads_per_group, + int64_t numel) { + int64_t num_groups = + (numel + max_threads_per_group - 1) / max_threads_per_group; + auto hw_max_groups = syclMaxWorkItemsPerTile() / max_threads_per_group; + num_groups = num_groups > hw_max_groups ? hw_max_groups : num_groups; + return std::make_tuple(num_groups, max_threads_per_group); +} + +// Factor will be 3 for GRU and 4 for LSTM +void checkSizes( + CheckedFrom c, + const TensorArg& input_gates, + const TensorArg& hidden_gates, + const TensorArg& input_bias, + const TensorArg& hidden_bias, + int64_t factor, + const TensorArg& prev_hidden) { + checkDim(c, input_gates, 2); + checkSameSize(c, input_gates, hidden_gates); + int64_t gates_size = input_gates->size(1); + + if (input_bias->defined()) { + checkDim(c, input_bias, 1); + checkNumel(c, input_bias, gates_size); + checkSameSize(c, input_bias, hidden_bias); + } + + checkDim(c, prev_hidden, 2); + checkNumel(c, prev_hidden, input_gates->size(0) * gates_size / factor); + + checkAllSameGPU( + c, {input_gates, hidden_gates, input_bias, hidden_bias, prev_hidden}); +} + +bool allContiguous(at::TensorList tensors) { + return std::all_of(tensors.begin(), tensors.end(), [](const at::Tensor& t) { + return !t.defined() || t.is_contiguous(); + }); +} + +template +TensorInfo tryGetTensorInfo(const at::Tensor& t) { + return t.defined() ? getTensorInfo(t) : TensorInfo{}; +} + +void collapseDims(){}; +template +void collapseDims(TensorInfo& info, Args&... 
infos) { + info.collapseDims(); + collapseDims(infos...); +} + +#define DEVICE_LINEAR_GET(D_TENSOR, INDEX) \ + D_TENSOR.data[IndexToOffset::get(INDEX, D_TENSOR)] + +// Biases are always 1D +#define DEVICE_BIAS_GET(D_TENSOR, INDEX) \ + D_TENSOR.data[IndexToOffset::get(INDEX, D_TENSOR)] + +#define H2F(input) static_cast(input) +#define F2H(input) static_cast(input) + +template +inline T sigmoid(T in) { + T one = static_cast(1.0); + return one / (one + std::exp(-in)); +} + +template +struct LstmCellForwardFunctor { + void operator()(sycl::nd_item<1> item) const { + bool has_bias = bias1_.data != nullptr; + + for (index_type linearIndex = item.get_global_id(0); + linearIndex < totalElements_; + linearIndex += item.get_group_range(0) * item.get_local_range(0)) { + index_type offset = (linearIndex / hsz_) * 4 * hsz_ + linearIndex % hsz_; + + scalar_t iig = DEVICE_LINEAR_GET(input_, offset + 0 * hsz_); + scalar_t ifg = DEVICE_LINEAR_GET(input_, offset + 1 * hsz_); + scalar_t icg = DEVICE_LINEAR_GET(input_, offset + 2 * hsz_); + scalar_t iog = DEVICE_LINEAR_GET(input_, offset + 3 * hsz_); + + scalar_t hig = DEVICE_LINEAR_GET(hidden_, offset + 0 * hsz_); + scalar_t hfg = DEVICE_LINEAR_GET(hidden_, offset + 1 * hsz_); + scalar_t hcg = DEVICE_LINEAR_GET(hidden_, offset + 2 * hsz_); + scalar_t hog = DEVICE_LINEAR_GET(hidden_, offset + 3 * hsz_); + + scalar_t* wig = &DEVICE_LINEAR_GET(workspace_, offset + 0 * hsz_); + scalar_t* wfg = &DEVICE_LINEAR_GET(workspace_, offset + 1 * hsz_); + scalar_t* wcg = &DEVICE_LINEAR_GET(workspace_, offset + 2 * hsz_); + scalar_t* wog = &DEVICE_LINEAR_GET(workspace_, offset + 3 * hsz_); + + scalar_t cx = DEVICE_LINEAR_GET(_cx_, linearIndex); + + scalar_t* hy = &DEVICE_LINEAR_GET(_hy_, linearIndex); + scalar_t* cy = &DEVICE_LINEAR_GET(_cy_, linearIndex); + + scalar_t b1i, b1f, b1c, b1o; + scalar_t b2i, b2f, b2c, b2o; + + if (has_bias) { + b1i = DEVICE_BIAS_GET(bias1_, linearIndex % hsz_ + 0 * hsz_); + b1f = DEVICE_BIAS_GET(bias1_, linearIndex % hsz_ + 1 * hsz_); + b1c = DEVICE_BIAS_GET(bias1_, linearIndex % hsz_ + 2 * hsz_); + b1o = DEVICE_BIAS_GET(bias1_, linearIndex % hsz_ + 3 * hsz_); + + b2i = DEVICE_BIAS_GET(bias2_, linearIndex % hsz_ + 0 * hsz_); + b2f = DEVICE_BIAS_GET(bias2_, linearIndex % hsz_ + 1 * hsz_); + b2c = DEVICE_BIAS_GET(bias2_, linearIndex % hsz_ + 2 * hsz_); + b2o = DEVICE_BIAS_GET(bias2_, linearIndex % hsz_ + 3 * hsz_); + } else { + b1i = F2H(0.0); + b1f = F2H(0.0); + b1c = F2H(0.0); + b1o = F2H(0.0); + b2i = F2H(0.0); + b2f = F2H(0.0); + b2c = F2H(0.0); + b2o = F2H(0.0); + } + + accscalar_t ig, fg, cg, og; + accscalar_t f_hy, f_cy; + + ig = sigmoid(H2F(iig) + H2F(hig) + H2F(b1i) + H2F(b2i)); + fg = sigmoid(H2F(ifg) + H2F(hfg) + H2F(b1f) + H2F(b2f)); + cg = std::tanh(H2F(icg) + H2F(hcg) + H2F(b1c) + H2F(b2c)); + og = sigmoid(H2F(iog) + H2F(hog) + H2F(b1o) + H2F(b2o)); + + f_cy = (fg * H2F(cx)) + (ig * cg); + f_hy = og * std::tanh(f_cy); + + *hy = F2H(f_hy); + *cy = F2H(f_cy); + + // SAVE FOR BACKWARDS + // Also need cy and cx but can be saved easily in python + *wig = F2H(ig); + *wfg = F2H(fg); + *wcg = F2H(cg); + *wog = F2H(og); + } + } + + LstmCellForwardFunctor( + TensorInfo input, + TensorInfo hidden, + TensorInfo bias1, + TensorInfo bias2, + TensorInfo _cx, + TensorInfo _hy, + TensorInfo _cy, + TensorInfo workspace, + index_type hsz, + index_type totalElements) + : input_(input), + hidden_(hidden), + bias1_(bias1), + bias2_(bias2), + _cx_(_cx), + _hy_(_hy), + _cy_(_cy), + workspace_(workspace), + hsz_(hsz), + totalElements_(totalElements) {} + + 
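+  // operator() above computes the standard LSTM cell update per element:
+  //   i = sigmoid(i_in + i_h + b1i + b2i)    f = sigmoid(f_in + f_h + b1f + b2f)
+  //   g = tanh(c_in + c_h + b1c + b2c)       o = sigmoid(o_in + o_h + b1o + b2o)
+  //   cy = f * cx + i * g                    hy = o * tanh(cy)
+  // The gate activations are written to workspace_ for reuse in the backward pass.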
private: + TensorInfo input_; + TensorInfo hidden_; + TensorInfo bias1_; + TensorInfo bias2_; + TensorInfo _cx_; + TensorInfo _hy_; + TensorInfo _cy_; + TensorInfo workspace_; + index_type hsz_; + index_type totalElements_; +}; + +template +struct LstmCellBackwardFunctor { + void operator()(sycl::nd_item<1> item) const { + bool has_gradoutput = gradoutput_.data != nullptr; + bool has_gradoutputcell = gradoutputcell_.data != nullptr; + + for (index_type linearIndex = item.get_global_id(0); + linearIndex < totalElements_; + linearIndex += item.get_group_range(0) * item.get_local_range(0)) { + index_type offset = (linearIndex / hsz_) * 4 * hsz_ + linearIndex % hsz_; + + scalar_t ig = DEVICE_LINEAR_GET(storage_, offset + 0 * hsz_); + scalar_t fg = DEVICE_LINEAR_GET(storage_, offset + 1 * hsz_); + scalar_t cg = DEVICE_LINEAR_GET(storage_, offset + 2 * hsz_); + scalar_t og = DEVICE_LINEAR_GET(storage_, offset + 3 * hsz_); + + scalar_t* ih = &DEVICE_LINEAR_GET(gradInGates_, offset + 0 * hsz_); + scalar_t* fh = &DEVICE_LINEAR_GET(gradInGates_, offset + 1 * hsz_); + scalar_t* ch = &DEVICE_LINEAR_GET(gradInGates_, offset + 2 * hsz_); + scalar_t* oh = &DEVICE_LINEAR_GET(gradInGates_, offset + 3 * hsz_); + + // will return hidden grads here + scalar_t cx = DEVICE_LINEAR_GET(_cx_, linearIndex); + scalar_t cy = DEVICE_LINEAR_GET(_cy_, linearIndex); + + scalar_t* gi = &DEVICE_LINEAR_GET(gradInputCx_, linearIndex); + + accscalar_t go = has_gradoutput + ? H2F(DEVICE_LINEAR_GET(gradoutput_, linearIndex)) + : 0.f; + accscalar_t goc = has_gradoutputcell + ? H2F(DEVICE_LINEAR_GET(gradoutputcell_, linearIndex)) + : 0.f; + + accscalar_t gcx = std::tanh(H2F(cy)); + + accscalar_t gog = go * gcx; + gcx = go * H2F(og) * (1 - gcx * gcx) + goc; + + accscalar_t gig = gcx * H2F(cg); + accscalar_t gfg = gcx * H2F(cx); + accscalar_t gcg = gcx * H2F(ig); + + gcx = gcx * H2F(fg); + + gig = gig * (1 - H2F(ig)) * H2F(ig); + gfg = gfg * (1 - H2F(fg)) * H2F(fg); + gcg = gcg * (1 - H2F(cg) * H2F(cg)); + gog = gog * (1 - H2F(og)) * H2F(og); + + *ih = F2H(gig); + *fh = F2H(gfg); + *ch = F2H(gcg); + *oh = F2H(gog); + + *gi = F2H(gcx); + } + } + + LstmCellBackwardFunctor( + TensorInfo storage, + TensorInfo gradInGates, + TensorInfo _cx, + TensorInfo _cy, + TensorInfo gradoutput, + TensorInfo gradoutputcell, + TensorInfo gradInputCx, + index_type hsz, + index_type totalElements) + : storage_(storage), + gradInGates_(gradInGates), + _cx_(_cx), + _cy_(_cy), + gradoutput_(gradoutput), + gradoutputcell_(gradoutputcell), + gradInputCx_(gradInputCx), + hsz_(hsz), + totalElements_(totalElements) {} + + private: + TensorInfo storage_; + TensorInfo gradInGates_; + TensorInfo _cx_; + TensorInfo _cy_; + TensorInfo gradoutput_; + TensorInfo gradoutputcell_; + TensorInfo gradInputCx_; + index_type hsz_; + index_type totalElements_; +}; + +template +struct GruCellForwardFunctor { + void operator()(sycl::nd_item<1> item) const { + bool has_bias = Bias1_.data != nullptr; + + for (index_type linearIndex = item.get_global_id(0); + linearIndex < totalElements_; + linearIndex += item.get_group_range(0) * item.get_local_range(0)) { + index_type offset = (linearIndex / hsz_) * 3 * hsz_ + linearIndex % hsz_; + + scalar_t ir = DEVICE_LINEAR_GET(Input_, offset + 0 * hsz_); + scalar_t ii = DEVICE_LINEAR_GET(Input_, offset + 1 * hsz_); + scalar_t in = DEVICE_LINEAR_GET(Input_, offset + 2 * hsz_); + scalar_t hr = DEVICE_LINEAR_GET(Hidden_, offset + 0 * hsz_); + scalar_t hi = DEVICE_LINEAR_GET(Hidden_, offset + 1 * hsz_); + scalar_t hn = 
DEVICE_LINEAR_GET(Hidden_, offset + 2 * hsz_); + + scalar_t hx = DEVICE_LINEAR_GET(_hx_, linearIndex); + scalar_t* hy = &DEVICE_LINEAR_GET(_hy_, linearIndex); + + scalar_t b1r, b1i, b1n, b2r, b2i, b2n; + + if (has_bias) { + b1r = DEVICE_BIAS_GET(Bias1_, linearIndex % hsz_ + 0 * hsz_); + b1i = DEVICE_BIAS_GET(Bias1_, linearIndex % hsz_ + 1 * hsz_); + b1n = DEVICE_BIAS_GET(Bias1_, linearIndex % hsz_ + 2 * hsz_); + + b2r = DEVICE_BIAS_GET(Bias2_, linearIndex % hsz_ + 0 * hsz_); + b2i = DEVICE_BIAS_GET(Bias2_, linearIndex % hsz_ + 1 * hsz_); + b2n = DEVICE_BIAS_GET(Bias2_, linearIndex % hsz_ + 2 * hsz_); + } else { + b1r = F2H(0.0); + b1i = F2H(0.0); + b1n = F2H(0.0); + b2r = F2H(0.0); + b2i = F2H(0.0); + b2n = F2H(0.0); + } + + offset = (linearIndex / hsz_) * 5 * hsz_ + linearIndex % hsz_; + + accscalar_t rg, ig, ng; + + rg = sigmoid(H2F(ir) + H2F(hr) + H2F(b1r) + H2F(b2r)); + ig = sigmoid(H2F(ii) + H2F(hi) + H2F(b1i) + H2F(b2i)); + + ng = H2F(in) + H2F(b1n) + rg * (H2F(hn) + H2F(b2n)); + ng = std::tanh(ng); + *hy = F2H(ng + ig * (H2F(hx) - ng)); + + // SAVE FOR BACKWARDS + DEVICE_LINEAR_GET(storage_, offset + 0 * hsz_) = F2H(rg); + DEVICE_LINEAR_GET(storage_, offset + 1 * hsz_) = F2H(ig); + DEVICE_LINEAR_GET(storage_, offset + 2 * hsz_) = F2H(ng); + DEVICE_LINEAR_GET(storage_, offset + 3 * hsz_) = hx; + DEVICE_LINEAR_GET(storage_, offset + 4 * hsz_) = F2H(H2F(hn) + H2F(b2n)); + } + } + + GruCellForwardFunctor( + TensorInfo Input, + const TensorInfo Hidden, + const TensorInfo Bias1, + const TensorInfo Bias2, + const TensorInfo _hx, + const TensorInfo _hy, + const TensorInfo storage, + const index_type hsz, + const index_type totalElements) + : Input_(Input), + Hidden_(Hidden), + Bias1_(Bias1), + Bias2_(Bias2), + _hx_(_hx), + _hy_(_hy), + storage_(storage), + hsz_(hsz), + totalElements_(totalElements) {} + + private: + TensorInfo Input_; + const TensorInfo Hidden_; + const TensorInfo Bias1_; + const TensorInfo Bias2_; + const TensorInfo _hx_; + const TensorInfo _hy_; + const TensorInfo storage_; + const index_type hsz_; + const index_type totalElements_; +}; + +template +struct GruCellBackwardFunctor { + void operator()(sycl::nd_item<1> item) const { + for (index_type linearIndex = item.get_global_id(0); + linearIndex < totalElements_; + linearIndex += item.get_group_range(0) * item.get_local_range(0)) { + index_type offset = (linearIndex / hsz_) * 5 * hsz_ + linearIndex % hsz_; + + scalar_t rg = DEVICE_LINEAR_GET(storage_, offset + 0 * hsz_); + scalar_t ig = DEVICE_LINEAR_GET(storage_, offset + 1 * hsz_); + scalar_t ng = DEVICE_LINEAR_GET(storage_, offset + 2 * hsz_); + scalar_t hx = DEVICE_LINEAR_GET(storage_, offset + 3 * hsz_); + scalar_t hn = DEVICE_LINEAR_GET(storage_, offset + 4 * hsz_); + + scalar_t go = DEVICE_LINEAR_GET(gradOutput_, linearIndex); + + offset = (linearIndex / hsz_) * 3 * hsz_ + linearIndex % hsz_; + + accscalar_t gig = H2F(go) * (H2F(hx) - H2F(ng)) * (1 - H2F(ig)) * H2F(ig); + accscalar_t ghx = H2F(go) * H2F(ig); + accscalar_t gin = H2F(go) * (1 - H2F(ig)) * (1 - H2F(ng) * H2F(ng)); + accscalar_t ghn = gin * H2F(rg); + accscalar_t grg = gin * H2F(hn) * (1 - H2F(rg)) * H2F(rg); + + DEVICE_LINEAR_GET(gradInInput_, offset + 0 * hsz_) = F2H(grg); + DEVICE_LINEAR_GET(gradInInput_, offset + 1 * hsz_) = F2H(gig); + DEVICE_LINEAR_GET(gradInInput_, offset + 2 * hsz_) = F2H(gin); + + DEVICE_LINEAR_GET(gradInHidden_, offset + 0 * hsz_) = F2H(grg); + DEVICE_LINEAR_GET(gradInHidden_, offset + 1 * hsz_) = F2H(gig); + DEVICE_LINEAR_GET(gradInHidden_, offset + 2 * hsz_) = F2H(ghn); + 
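The GRU functors follow the same structure with the textbook GRU update: rg and ig are the sigmoid reset and update gates, ng = tanh(in + b1n + rg * (hn + b2n)) is the candidate state, and hy = ng + ig * (hx - ng). Five per-element values (rg, ig, ng, hx, and hn + b2n) are stashed per output element, which is what the 5 * hsz workspace stride above corresponds to and what the backward functor reads back. A rough NumPy restatement of the forward step, as a reference sketch only with illustrative names:

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def gru_cell_forward_ref(input_gates, hidden_gates, hx, b_ih=None, b_hh=None):
        """input_gates/hidden_gates: (batch, 3*hidden); hx: (batch, hidden); biases: (3*hidden,)."""
        hsz = hx.shape[-1]
        b_ih = np.zeros(3 * hsz) if b_ih is None else b_ih
        b_hh = np.zeros(3 * hsz) if b_hh is None else b_hh
        ir, ii, i_n = (input_gates[:, k * hsz:(k + 1) * hsz] + b_ih[k * hsz:(k + 1) * hsz] for k in range(3))
        hr, hi, h_n = (hidden_gates[:, k * hsz:(k + 1) * hsz] + b_hh[k * hsz:(k + 1) * hsz] for k in range(3))
        rg = sigmoid(ir + hr)         # reset gate
        ig = sigmoid(ii + hi)         # update gate
        ng = np.tanh(i_n + rg * h_n)  # candidate hidden state
        hy = ng + ig * (hx - ng)
        # workspace per row: [rg, ig, ng, hx, hn + b_hh_n], i.e. 5 * hidden values
        workspace = np.concatenate([rg, ig, ng, hx, h_n], axis=1)
        return hy, workspace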
DEVICE_LINEAR_GET(gradInputHx_, linearIndex) = F2H(ghx); + } + } + + GruCellBackwardFunctor( + TensorInfo gradInInput, + TensorInfo gradInHidden, + TensorInfo gradOutput, + TensorInfo gradInputHx, + TensorInfo storage, + index_type hsz, + index_type totalElements) + : gradInInput_(gradInInput), + gradInHidden_(gradInHidden), + gradOutput_(gradOutput), + gradInputHx_(gradInputHx), + storage_(storage), + hsz_(hsz), + totalElements_(totalElements) {} + + private: + TensorInfo gradInInput_; + TensorInfo gradInHidden_; + TensorInfo gradOutput_; + TensorInfo gradInputHx_; + TensorInfo storage_; + index_type hsz_; + index_type totalElements_; +}; + +#undef DEVICE_LINEAR_GET +#undef DEVICE_BIAS_GET +#undef H2F +#undef F2H + +template +void lstm_forward_impl( + const Tensor& input_gates, + const Tensor& hidden_gates, + const Tensor& input_bias, + const Tensor& hidden_bias, + const Tensor& cx, + const Tensor& hy, + const Tensor& cy, + const Tensor& workspace) { + using accscalar_t = at::acc_type_device; + + int64_t numel = cx.numel(); + if (numel == 0) + return; + + using KernelT = LstmCellForwardFunctor; + auto max_wg_size = syclMaxWorkGroupSize(); + auto config = rnn_get_launch_config(max_wg_size, numel); + auto nwg = std::get<0>(config); + auto local_range = std::get<1>(config); + + auto input_gatesI = getTensorInfo(input_gates); + auto hidden_gatesI = getTensorInfo(hidden_gates); + auto input_biasI = tryGetTensorInfo(input_bias); + auto hidden_biasI = tryGetTensorInfo(hidden_bias); + auto cxI = getTensorInfo(cx); + auto hyI = getTensorInfo(hy); + auto cyI = getTensorInfo(cy); + auto workspaceI = getTensorInfo(workspace); + index_type hidden_size = cxI.sizes[cxI.dims - 1]; + + if (allContiguous( + {input_gates, + hidden_gates, + input_bias, + hidden_bias, + cx, + hy, + cy, + workspace})) { + collapseDims( + input_gatesI, + hidden_gatesI, + input_biasI, + hidden_biasI, + cxI, + hyI, + cyI, + workspaceI); + KernelT kfn( + input_gatesI, + hidden_gatesI, + input_biasI, + hidden_biasI, + cxI, + hyI, + cyI, + workspaceI, + hidden_size, + numel); + sycl_kernel_submit( + nwg * local_range, local_range, getCurrentSYCLQueue(), kfn); + } else { + KernelT kfn( + input_gatesI, + hidden_gatesI, + input_biasI, + hidden_biasI, + cxI, + hyI, + cyI, + workspaceI, + hidden_size, + numel); + sycl_kernel_submit( + nwg * local_range, local_range, getCurrentSYCLQueue(), kfn); + } +} + +template +void lstm_backward_impl( + const Tensor& grad_hy, + const Tensor& grad_cy, + const Tensor& cx, + const Tensor& cy, + const Tensor& workspace, + const Tensor& grad_gates, + const Tensor& grad_cx) { + using accscalar_t = at::acc_type_device; + + int64_t numel = cx.numel(); + if (numel == 0) + return; + + using KernelT = LstmCellBackwardFunctor; + auto max_wg_size = syclMaxWorkGroupSize(); + auto config = rnn_get_launch_config(max_wg_size, numel); + auto nwg = std::get<0>(config); + auto local_range = std::get<1>(config); + + auto grad_hyI = tryGetTensorInfo(grad_hy); + auto grad_cyI = tryGetTensorInfo(grad_cy); + auto cxI = getTensorInfo(cx); + auto cyI = getTensorInfo(cy); + auto workspaceI = getTensorInfo(workspace); + auto grad_gatesI = getTensorInfo(grad_gates); + auto grad_cxI = getTensorInfo(grad_cx); + index_type hidden_size = cxI.sizes[cxI.dims - 1]; + + if (allContiguous( + {grad_hy, grad_cy, cx, cy, workspace, grad_gates, grad_cx})) { + collapseDims( + grad_hyI, grad_cyI, cxI, cyI, workspaceI, grad_gatesI, grad_cxI); + KernelT kfn( + workspaceI, + grad_gatesI, + cxI, + cyI, + grad_hyI, + grad_cyI, + grad_cxI, + 
hidden_size, + numel); + sycl_kernel_submit( + nwg * local_range, local_range, getCurrentSYCLQueue(), kfn); + } else { + KernelT kfn( + workspaceI, + grad_gatesI, + cxI, + cyI, + grad_hyI, + grad_cyI, + grad_cxI, + hidden_size, + numel); + sycl_kernel_submit( + nwg * local_range, local_range, getCurrentSYCLQueue(), kfn); + } +} + +template +void gru_forward_impl( + const Tensor& input_gates, + const Tensor& hidden_gates, + const Tensor& input_bias, + const Tensor& hidden_bias, + const Tensor& hx, + const Tensor& hy, + const Tensor& workspace) { + using accscalar_t = at::acc_type_device; + + int64_t numel = hx.numel(); + if (numel == 0) + return; + + using KernelT = GruCellForwardFunctor; + auto max_wg_size = syclMaxWorkGroupSize(); + auto config = rnn_get_launch_config(max_wg_size, numel); + auto nwg = std::get<0>(config); + auto local_range = std::get<1>(config); + + auto input_gatesI = getTensorInfo(input_gates); + auto hidden_gatesI = getTensorInfo(hidden_gates); + auto input_biasI = tryGetTensorInfo(input_bias); + auto hidden_biasI = tryGetTensorInfo(hidden_bias); + auto hxI = getTensorInfo(hx); + auto hyI = getTensorInfo(hy); + auto workspaceI = getTensorInfo(workspace); + index_type hidden_size = hxI.sizes[hxI.dims - 1]; + + if (allContiguous( + {input_gates, + hidden_gates, + input_bias, + hidden_bias, + hx, + hy, + workspace})) { + collapseDims( + input_gatesI, + hidden_gatesI, + input_biasI, + hidden_biasI, + hxI, + hyI, + workspaceI); + KernelT kfn( + input_gatesI, + hidden_gatesI, + input_biasI, + hidden_biasI, + hxI, + hyI, + workspaceI, + hidden_size, + numel); + sycl_kernel_submit( + nwg * local_range, local_range, getCurrentSYCLQueue(), kfn); + } else { + KernelT kfn( + input_gatesI, + hidden_gatesI, + input_biasI, + hidden_biasI, + hxI, + hyI, + workspaceI, + hidden_size, + numel); + sycl_kernel_submit( + nwg * local_range, local_range, getCurrentSYCLQueue(), kfn); + } +} + +template +void gru_backward_impl( + const Tensor& grad_hy, + const Tensor& workspace, + const Tensor& grad_input_gates, + const Tensor& grad_hidden_gates, + const Tensor& grad_hx) { + using accscalar_t = at::acc_type_device; + + int64_t numel = grad_hy.numel(); + if (numel == 0) + return; + + using KernelT = GruCellBackwardFunctor; + auto max_wg_size = syclMaxWorkGroupSize(); + auto config = rnn_get_launch_config(max_wg_size, numel); + auto nwg = std::get<0>(config); + auto local_range = std::get<1>(config); + + auto grad_hyI = getTensorInfo(grad_hy); + auto workspaceI = getTensorInfo(workspace); + auto grad_input_gatesI = + getTensorInfo(grad_input_gates); + auto grad_hidden_gatesI = + getTensorInfo(grad_hidden_gates); + auto grad_hxI = getTensorInfo(grad_hx); + index_type hidden_size = grad_hyI.sizes[grad_hyI.dims - 1]; + + if (allContiguous( + {grad_hy, workspace, grad_input_gates, grad_hidden_gates, grad_hx})) { + collapseDims( + grad_hyI, workspaceI, grad_input_gatesI, grad_hidden_gatesI, grad_hxI); + KernelT kfn( + grad_input_gatesI, + grad_hidden_gatesI, + grad_hyI, + grad_hxI, + workspaceI, + hidden_size, + numel); + sycl_kernel_submit( + nwg * local_range, local_range, getCurrentSYCLQueue(), kfn); + } else { + KernelT kfn( + grad_input_gatesI, + grad_hidden_gatesI, + grad_hyI, + grad_hxI, + workspaceI, + hidden_size, + numel); + sycl_kernel_submit( + nwg * local_range, local_range, getCurrentSYCLQueue(), kfn); + } +} + +// Note [64-bit index math check elision] +// It's enough to perform the check for 64-bit math on the largest tensor only. 
+// If 32-bit is enough for it, it will suffice for all other tensors too, and we +// can save some work using this trick. + +std::tuple _thnn_fused_lstm_cell_kernel( + const Tensor& input_gates, + const Tensor& hidden_gates, + const Tensor& cx, + const std::optional& input_bias_opt, + const std::optional& hidden_bias_opt) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned input_bias_maybe_owned = + at::borrow_from_optional_tensor(input_bias_opt); + const Tensor& input_bias = *input_bias_maybe_owned; + const Tensor& hidden_bias = hidden_bias_opt.value_or(Tensor()); + + checkSizes( + "_thnn_fused_lstm_cell_xpu", + {input_gates, "input_gates", 1}, + {hidden_gates, "hidden_gates", 2}, + {input_bias, "input_bias", 3}, + {hidden_bias, "hidden_bias", 4}, + /*factor=*/4, + {cx, "prev_hidden", 5}); + + auto workspace = at::empty_like(input_gates, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto hy = at::empty_like(cx, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto cy = at::empty_like(cx, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + input_gates.scalar_type(), + "_thnn_fused_lstm_cell_xpu", + [&] { + if (canUse32BitIndexMath( + workspace)) { // See Note [64-bit index math check elision] + lstm_forward_impl( + input_gates, + hidden_gates, + input_bias, + hidden_bias, + cx, + hy, + cy, + workspace); + } else { + lstm_forward_impl( + input_gates, + hidden_gates, + input_bias, + hidden_bias, + cx, + hy, + cy, + workspace); + } + }); + return std::make_tuple(std::move(hy), std::move(cy), std::move(workspace)); +} + +void checkLSTMBackwardSizes( + const TensorArg& grad_hy, + const TensorArg& grad_cy, + const TensorArg& cx, + const TensorArg& cy, + const TensorArg& workspace) { + CheckedFrom c = "fused_lstm_cell_backward"; + const TensorArg& defined_grad = grad_hy->defined() ? 
grad_hy : grad_cy; + checkDim(c, defined_grad, 2); + auto exp_size = defined_grad->sizes(); + if (grad_hy->defined()) { + checkSize(c, grad_hy, exp_size); + } + if (grad_cy->defined()) { + checkSize(c, grad_cy, exp_size); + } + checkSize(c, cx, exp_size); + checkSize(c, cy, exp_size); + checkDim(c, workspace, 2); + checkNumel(c, workspace, exp_size[0] * exp_size[1] * 4); +} + +std::tuple _thnn_fused_lstm_cell_backward_kernel( + const std::optional& grad_hy_opt, + const std::optional& grad_cy_opt, + const Tensor& cx, + const Tensor& cy, + const Tensor& workspace, + bool has_bias) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned grad_hy_maybe_owned = + at::borrow_from_optional_tensor(grad_hy_opt); + const Tensor& grad_hy = *grad_hy_maybe_owned; + const Tensor& grad_cy = grad_cy_opt.value_or(Tensor()); + + if (!grad_hy.defined() && !grad_cy.defined()) { + return std::tuple(); + } + checkLSTMBackwardSizes( + {grad_hy, "grad_hy", 1}, + {grad_cy, "grad_cy", 2}, + {cx, "cx", 3}, + {cy, "cy", 4}, + {workspace, "workspace", 5}); + + auto grad_gates = at::empty_like(workspace, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto grad_cx = at::empty_like(cx, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + workspace.scalar_type(), + "_thnn_fused_lstm_cell_backward_xpu", + [&] { + if (canUse32BitIndexMath( + workspace)) { // See Note [64-bit index math check elision] + lstm_backward_impl( + grad_hy, grad_cy, cx, cy, workspace, grad_gates, grad_cx); + } else { + lstm_backward_impl( + grad_hy, grad_cy, cx, cy, workspace, grad_gates, grad_cx); + } + }); + + auto grad_bias = + has_bias ? grad_gates.sum(0, /*keepdim=*/false) : at::Tensor{}; + return std::make_tuple( + std::move(grad_gates), std::move(grad_cx), std::move(grad_bias)); +} + +static constexpr int64_t GRU_WORKSPACE_MULTIPLIER = 5; + +std::tuple _thnn_fused_gru_cell_kernel( + const Tensor& input_gates, + const Tensor& hidden_gates, + const Tensor& hx, + const std::optional& input_bias_opt, + const std::optional& hidden_bias_opt) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned input_bias_maybe_owned = + at::borrow_from_optional_tensor(input_bias_opt); + const Tensor& input_bias = *input_bias_maybe_owned; + const Tensor& hidden_bias = hidden_bias_opt.value_or(Tensor()); + + checkSizes( + "_thnn_fused_gru_cell_xpu", + {input_gates, "input_gates", 1}, + {hidden_gates, "hidden_gates", 2}, + {input_bias, "input_bias", 3}, + {hidden_bias, "hidden_bias", 4}, + /*factor=*/3, + {hx, "prev_hidden", 5}); + + auto workspace = at::empty( + {hx.size(0), hx.size(1) * GRU_WORKSPACE_MULTIPLIER}, hx.options()); + auto hy = at::empty_like(hx, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + input_gates.scalar_type(), + "_thnn_fused_gru_cell_xpu", + [&] { + if (canUse32BitIndexMath( + workspace)) { // See Note [64-bit index math check elision] + gru_forward_impl( + input_gates, + hidden_gates, + input_bias, + hidden_bias, + hx, + hy, + workspace); + } else { + gru_forward_impl( + input_gates, + hidden_gates, + input_bias, + hidden_bias, + hx, + hy, + workspace); + } + }); + return std::make_tuple(std::move(hy), std::move(workspace)); +} + +void checkGRUBackwardSizes( + const TensorArg& grad_hy, + const TensorArg& workspace) { + CheckedFrom c = "fused_gru_cell_backward"; + checkDim(c, grad_hy, 2); + checkSize( + c, + workspace, + {grad_hy->size(0), grad_hy->size(1) * 
GRU_WORKSPACE_MULTIPLIER}); +} + +std::tuple +_thnn_fused_gru_cell_backward_kernel( + const Tensor& grad_hy, + const Tensor& workspace, + bool has_bias) { + checkGRUBackwardSizes({grad_hy, "grad_hy", 1}, {workspace, "workspace", 2}); + + int64_t hidden_size = workspace.size(1) / GRU_WORKSPACE_MULTIPLIER; + auto grad_input_gates = + at::empty({workspace.size(0), hidden_size * 3}, workspace.options()); + auto grad_hidden_gates = + at::empty({workspace.size(0), hidden_size * 3}, workspace.options()); + auto grad_hx = at::empty_like(grad_hy, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + grad_hy.scalar_type(), + "_thnn_fused_gru_cell_backward_xpu", + [&] { + if (canUse32BitIndexMath( + workspace)) { // See Note [64-bit index math check elision] + gru_backward_impl( + grad_hy, workspace, grad_input_gates, grad_hidden_gates, grad_hx); + } else { + gru_backward_impl( + grad_hy, workspace, grad_input_gates, grad_hidden_gates, grad_hx); + } + }); + + at::Tensor grad_input_bias, grad_hidden_bias; + if (has_bias) { + grad_input_bias = grad_input_gates.sum(0, /*keepdim=*/false); + grad_hidden_bias = grad_hidden_gates.sum(0, /*keepdim=*/false); + } + + return std::make_tuple( + std::move(grad_input_gates), + std::move(grad_hidden_gates), + std::move(grad_hx), + std::move(grad_input_bias), + std::move(grad_hidden_bias)); +} + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/RNNKernels.h b/src/ATen/native/xpu/sycl/RNNKernels.h new file mode 100644 index 000000000..07f0e3f78 --- /dev/null +++ b/src/ATen/native/xpu/sycl/RNNKernels.h @@ -0,0 +1,36 @@ +#pragma once + +#include + +namespace at::native::xpu { + +TORCH_XPU_API std::tuple _thnn_fused_lstm_cell_kernel( + const Tensor& input_gates, + const Tensor& hidden_gates, + const Tensor& cx, + const std::optional& input_bias_opt, + const std::optional& hidden_bias_opt); + +TORCH_XPU_API std::tuple +_thnn_fused_lstm_cell_backward_kernel( + const std::optional& grad_hy_opt, + const std::optional& grad_cy_opt, + const Tensor& cx, + const Tensor& cy, + const Tensor& workspace, + bool has_bias); + +TORCH_XPU_API std::tuple _thnn_fused_gru_cell_kernel( + const Tensor& input_gates, + const Tensor& hidden_gates, + const Tensor& hx, + const std::optional& input_bias_opt, + const std::optional& hidden_bias_opt); + +TORCH_XPU_API std::tuple +_thnn_fused_gru_cell_backward_kernel( + const Tensor& grad_hy, + const Tensor& workspace, + bool has_bias); + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ResizeKernel.cpp b/src/ATen/native/xpu/sycl/ResizeKernel.cpp index 237a1c213..f1ee7f944 100644 --- a/src/ATen/native/xpu/sycl/ResizeKernel.cpp +++ b/src/ATen/native/xpu/sycl/ResizeKernel.cpp @@ -25,8 +25,9 @@ void resize_bytes_xpu(StorageImpl* storage, size_t size_bytes) { c10::xpu::XPUGuard guard(device.index()); at::DataPtr data = allocator->allocate(size_bytes); if (storage->data_ptr()) { - auto q = at::xpu::getCurrentSYCLQueue(); + at::globalContext().lazyInitDevice(c10::DeviceType::XPU); + auto q = at::xpu::getCurrentSYCLQueue(); q.memcpy( data.get(), storage->data(), std::min(storage->nbytes(), size_bytes)); } diff --git a/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.cpp b/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.cpp index 533630175..7f6f33805 100644 --- a/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.cpp +++ b/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.cpp @@ -86,7 +86,7 @@ template inline void _rrelu_with_noise_xpu_train( Tensor& output, 
const Tensor& input_, - const Tensor& noise_, + Tensor& noise_, const Scalar& lower_, const Scalar& upper_, std::optional generator) { @@ -153,7 +153,7 @@ inline void _rrelu_with_noise_xpu_train( Tensor& rrelu_with_noise_kernel( const Tensor& self, - const Tensor& noise, + Tensor& noise, const Scalar& lower, const Scalar& upper, bool training, diff --git a/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.h b/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.h index 8371c38ab..fa7e568ea 100644 --- a/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.h +++ b/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.h @@ -7,7 +7,7 @@ namespace at::native::xpu { TORCH_XPU_API Tensor& rrelu_with_noise_kernel( const Tensor& self, - const Tensor& noise, + Tensor& noise, const Scalar& lower, const Scalar& upper, bool training, diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index 28d812f2c..0a0c7e718 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -210,7 +210,8 @@ template < int outer_loop, bool is_masked, typename calc_t, - typename vec_t> + typename vec_t, + bool is_safe_softmax> struct DispatchSoftmaxForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { [[intel::reqd_sub_group_size(SIMD)]] void operator()( @@ -240,7 +241,8 @@ struct DispatchSoftmaxForwardKernelFunctor if (index >= dim_size_) break; - reg_in[i] = *(reinterpret_cast(in_data_ + group_offset + index)); + reg_in[i] = + *(reinterpret_cast(in_data_ + group_offset + index)); if constexpr (is_masked) { auto vec_offset = group_offset + index; #pragma unroll(vec_size) @@ -309,6 +311,10 @@ struct DispatchSoftmaxForwardKernelFunctor if constexpr (LogSoftMax) { reg_in[i][j] = static_cast(reg_in[i][j] - max_value - sum_value); + } else if ( + is_safe_softmax && + max_value == std::numeric_limits::lowest()) { + reg_in[i][j] = static_cast(0); } else if (sum_value == 0) { reg_in[i][j] = nan_; } else { @@ -386,7 +392,8 @@ template < bool LogSoftMax, int outer_loop, bool is_masked = false, - typename calc_t = decltype(nullptr)> + typename calc_t = decltype(nullptr), + bool is_safe_softmax = false> bool dispatch_softmax_forward_kernel( const scalar_t* in_data, scalar_t* out_data, @@ -412,7 +419,8 @@ bool dispatch_softmax_forward_kernel( outer_loop, is_masked, calc_t, - vec_t>; + vec_t, + /*is_safe_softmax = */ false>; int sub_group_num, global_size_row, local_size_row, range, local_size; int max_group_size = @@ -460,8 +468,8 @@ bool dispatch_softmax_forward_kernel( outer_loop, is_masked, DummyFunctor, - vec_t>; - + vec_t, + is_safe_softmax>; int sub_group_num, global_size_row, local_size_row, range, local_size; int max_group_size = get_wgroup_size( @@ -506,7 +514,8 @@ template < typename IndexType, bool LogSoftMax, typename vec_t, - int align_bytes> + int align_bytes, + bool is_safe_softmax> struct SoftmaxForwardKernelFunctor { void operator()(sycl::nd_item<1> item) const { IndexType local_id = item.get_local_id(0); @@ -562,6 +571,10 @@ struct SoftmaxForwardKernelFunctor { if (LogSoftMax) out_data_[group_offset + linear_idx] = static_cast( in_data_[group_offset + linear_idx] - max_value - sum_value); + else if ( + is_safe_softmax && + max_value == std::numeric_limits::lowest()) + out_data_[group_offset + linear_idx] = static_cast(0); else out_data_[group_offset + linear_idx] = static_cast( std::exp(in_data_[group_offset + linear_idx] - max_value) * @@ -576,6 +589,10 @@ struct SoftmaxForwardKernelFunctor { if (LogSoftMax) in_val[j] = 
static_cast(in_val[j] - max_value - sum_value); + else if ( + is_safe_softmax && + max_value == std::numeric_limits::lowest()) + in_val[j] = static_cast(0); else in_val[j] = static_cast( std::exp(in_val[j] - max_value) * sum_value); @@ -610,7 +627,8 @@ template < typename scalar_t, typename accscalar_t, typename IndexType, - bool LogSoftMax> + bool LogSoftMax, + bool is_safe_softmax> void softmax_forward_kernel( const scalar_t* in_data, scalar_t* out_data, @@ -625,7 +643,8 @@ void softmax_forward_kernel( IndexType, LogSoftMax, vec_t, - align_bytes>; + align_bytes, + is_safe_softmax>; int local_size = std::min( (dim_size + vec_size - 1) / vec_size, @@ -645,7 +664,8 @@ template < typename accscalar_t, typename IndexType, bool LogSoftMax, - typename vec_t> + typename vec_t, + bool is_safe_softmax> struct SpatialSoftmaxForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { void operator()(sycl::nd_item<3> item) const { @@ -658,14 +678,16 @@ struct SpatialSoftmaxForwardKernelFunctor // get max value accscalar_t max_value[vec_size]; auto offset = local_row_id * inner_size_ + global_col * vec_size; - vec_t value = *(reinterpret_cast(in_data_ + group_offset + offset)); + vec_t value = + *(reinterpret_cast(in_data_ + group_offset + offset)); #pragma unroll(vec_size) for (int j = 0; j < vec_size; ++j) { max_value[j] = accscalar_t(value[j]); } for (int i = local_row_id + block_row_; i < dim_size_; i += block_row_) { offset = i * inner_size_ + global_col * vec_size; - value = *(reinterpret_cast(in_data_ + group_offset + offset)); + value = + *(reinterpret_cast(in_data_ + group_offset + offset)); #pragma unroll(vec_size) for (int j = 0; j < vec_size; ++j) { max_value[j] = std::max(max_value[j], accscalar_t(value[j])); @@ -695,7 +717,8 @@ struct SpatialSoftmaxForwardKernelFunctor } for (int i = local_row_id + block_row_; i < dim_size_; i += block_row_) { offset = i * inner_size_ + global_col * vec_size; - value = *(reinterpret_cast(in_data_ + group_offset + offset)); + value = + *(reinterpret_cast(in_data_ + group_offset + offset)); #pragma unroll(vec_size) for (int j = 0; j < vec_size; ++j) { sum_value[j] += std::exp(value[j] - max_value[j]); @@ -736,6 +759,10 @@ struct SpatialSoftmaxForwardKernelFunctor if (LogSoftMax) in_val[j] = static_cast(in_val[j] - max_value[j] - sum_value[j]); + else if ( + is_safe_softmax && + max_value[j] == -std::numeric_limits::infinity()) + in_val[j] = static_cast(0); else in_val[j] = static_cast( std::exp(in_val[j] - max_value[j]) * sum_value[j]); @@ -787,7 +814,8 @@ template < typename scalar_t, typename accscalar_t, typename IndexType, - bool LogSoftMax> + bool LogSoftMax, + bool is_safe_softmax> void spatial_softmax_forward( const scalar_t* in_data, scalar_t* out_data, @@ -801,7 +829,8 @@ void spatial_softmax_forward( accscalar_t, IndexType, LogSoftMax, - vec_t>; + vec_t, + is_safe_softmax>; int local_size, block_row; get_wgroup_size_spatial( @@ -818,7 +847,8 @@ void spatial_softmax_forward( accscalar_t, IndexType, LogSoftMax, - vec_t>( + vec_t, + is_safe_softmax>( in_data, out_data, dim_size, @@ -827,7 +857,6 @@ void spatial_softmax_forward( local_size, block_row, group_num); - auto& queue = getCurrentSYCLQueue(); sycl_kernel_submit(global_range, local_range, queue, kfn); } @@ -1387,7 +1416,11 @@ void spatial_softmax_backward_kernel( sycl_kernel_submit(global_range, local_range, queue, kfn); } -template +template < + typename scalar_t, + typename accscalar_t, + bool LogSoftMax, + bool is_safe_softmax> void spatial_softmax_forward( const Tensor& output, const 
Tensor& input, @@ -1432,7 +1465,10 @@ void spatial_softmax_forward( accscalar_t, \ uint32_t, \ LogSoftMax, \ - outer_loop>( \ + outer_loop, \ + /*is_masked = */ false, \ + /*calc_t = */ decltype(nullptr), \ + /*is_safe_softmax = */ is_safe_softmax>( \ input.const_data_ptr(), \ output.mutable_data_ptr(), \ dim_size, \ @@ -1446,7 +1482,8 @@ void spatial_softmax_forward( scalar_t, \ accscalar_t, \ IndexType, \ - LogSoftMax>( \ + LogSoftMax, \ + is_safe_softmax>( \ input.const_data_ptr(), \ output.mutable_data_ptr(), \ dim_size, \ @@ -1460,7 +1497,8 @@ void spatial_softmax_forward( scalar_t, \ accscalar_t, \ IndexType, \ - LogSoftMax>( \ + LogSoftMax, \ + is_safe_softmax>( \ input.const_data_ptr(), \ output.mutable_data_ptr(), \ dim_size, \ @@ -1749,7 +1787,8 @@ Tensor& masked_softmax_forward( LogSoftMax, \ outer_loop, \ true, \ - decltype(input_calc)>( \ + decltype(input_calc), \ + /*is_safe_softmax = */ false>( \ input.const_data_ptr(), \ output.mutable_data_ptr(), \ dim_size, \ @@ -1922,7 +1961,7 @@ void masked_softmax_backward( #undef SIMD32 } // namespace impl -template +template void host_softmax( const Tensor& input_, const int64_t dim_, @@ -1953,8 +1992,11 @@ void host_softmax( "host_softmax", [&] { using accscalar_t = acc_type_device; - impl::spatial_softmax_forward( - output, input, dim); + impl::spatial_softmax_forward< + scalar_t, + accscalar_t, + LogSoftMax, + is_safe_softmax>(output, input, dim); }); } // return output; @@ -2045,6 +2087,29 @@ void _log_softmax_backward_kernel( grad.contiguous(), output.contiguous(), dim, half_to_float, grad_input); } +Tensor _safe_softmax_kernel( + const Tensor& self, + int64_t dim, + const bool half_to_float) { + auto output_options = + self.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT); + if (half_to_float) { + output_options = output_options.dtype(ScalarType::Float); + } + Tensor output = at::empty_like(self, output_options); + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + self.scalar_type(), + "_safe_softmax", + [&] { + host_softmax( + self.contiguous(), dim, half_to_float, output); + }); + + return output; +} + Tensor masked_softmax_kernel( const Tensor& input_, const Tensor& mask_, diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.h b/src/ATen/native/xpu/sycl/SoftMaxKernels.h index 0fc08496b..fc26fec3e 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.h +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.h @@ -32,6 +32,9 @@ TORCH_XPU_API void _log_softmax_backward_kernel( bool half_to_float, const Tensor& grad_input); +TORCH_XPU_API Tensor +_safe_softmax_kernel(const Tensor& self, int64_t dim, const bool half_to_float); + TORCH_XPU_API Tensor masked_softmax_kernel( const Tensor& input_, const Tensor& mask_, diff --git a/src/ATen/xpu/EmptyTensor.cpp b/src/ATen/xpu/EmptyTensor.cpp index 3f5e998f8..6411bb221 100644 --- a/src/ATen/xpu/EmptyTensor.cpp +++ b/src/ATen/xpu/EmptyTensor.cpp @@ -54,6 +54,7 @@ TensorBase empty_strided_xpu( IntArrayRef stride, ScalarType dtype, c10::optional device_opt) { + at::globalContext().lazyInitDevice(c10::DeviceType::XPU); const auto device = device_or_default(device_opt); TORCH_INTERNAL_ASSERT(device.is_xpu()); const c10::DeviceGuard device_guard(device); diff --git a/test/regressions/test_safe_softmax.py b/test/regressions/test_safe_softmax.py new file mode 100644 index 000000000..7b390080a --- /dev/null +++ b/test/regressions/test_safe_softmax.py @@ -0,0 +1,44 @@ +import torch +from torch.testing._internal.common_utils import TestCase + +cpu_device = 
torch.device("cpu") +xpu_device = torch.device("xpu") + + +class TestSafeSoftMax(TestCase): + def test_sm(self): + for dtype in [torch.float, torch.float16, torch.bfloat16]: + x_cpu = torch.randn(128,128,128).to(dtype) + x_xpu = x_cpu.to(xpu_device) + r_cpu = torch.ops.aten._safe_softmax(x_cpu, -1) + r_xpu = torch.ops.aten._safe_softmax(x_xpu, -1) + self.assertEqual(r_xpu.to(cpu_device), r_cpu) + x_cpu[0,0,:] = -float("inf") + x_xpu = x_cpu.to(xpu_device) + r_cpu = torch.ops.aten._safe_softmax(x_cpu, -1) + r_xpu = torch.ops.aten._safe_softmax(x_xpu, -1) + self.assertEqual(r_xpu.to(cpu_device), r_cpu) + + x_cpu = torch.randn(128,128,128).to(dtype) + x_xpu = x_cpu.to(xpu_device) + r_cpu = torch.ops.aten._safe_softmax(x_cpu, 1) + r_xpu = torch.ops.aten._safe_softmax(x_xpu, 1) + self.assertEqual(r_xpu.to(cpu_device), r_cpu) + x_cpu[0,:,0] = -float("inf") + x_xpu = x_cpu.to(xpu_device) + r_cpu = torch.ops.aten._safe_softmax(x_cpu, 1) + r_xpu = torch.ops.aten._safe_softmax(x_xpu, 1) + self.assertEqual(r_xpu.to(cpu_device), r_cpu) + + x_cpu = torch.randn(128,128,128).to(dtype) + x_xpu = x_cpu.to(xpu_device) + r_cpu = torch.ops.aten._safe_softmax(x_cpu, 0) + r_xpu = torch.ops.aten._safe_softmax(x_xpu, 0) + self.assertEqual(r_xpu.to(cpu_device), r_cpu) + x_cpu[:,0,0] = -float("inf") + x_xpu = x_cpu.to(xpu_device) + r_cpu = torch.ops.aten._safe_softmax(x_cpu, 0) + r_xpu = torch.ops.aten._safe_softmax(x_xpu, 0) + self.assertEqual(r_xpu.to(cpu_device), r_cpu) + + diff --git a/test/xpu/extended/run_test_with_skip_bmg.py b/test/xpu/extended/run_test_with_skip_bmg.py new file mode 100644 index 000000000..6499550f5 --- /dev/null +++ b/test/xpu/extended/run_test_with_skip_bmg.py @@ -0,0 +1,22 @@ +import os +import pytest +import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_bmg import skip_dict as skip_dict_win_bmg + +IS_WINDOWS = sys.platform == "win32" + +skip_list = skip_dict["test_ops_xpu.py"] +if IS_WINDOWS: + skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_bmg["test_ops_xpu.py"] + +skip_options = "not " + skip_list[0] +for skip_case in skip_list[1:]: + skip_option = " and not " + skip_case + skip_options += skip_option + +os.environ["PYTORCH_TEST_WITH_SLOW"]="1" +test_command = ["-k", skip_options, "test_ops_xpu.py", "-v"] +res = pytest.main(test_command) +sys.exit(res) diff --git a/test/xpu/extended/run_test_with_skip_lnl.py b/test/xpu/extended/run_test_with_skip_lnl.py new file mode 100644 index 000000000..a795ca07a --- /dev/null +++ b/test/xpu/extended/run_test_with_skip_lnl.py @@ -0,0 +1,22 @@ +import os +import pytest +import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_lnl import skip_dict as skip_dict_win_lnl + +IS_WINDOWS = sys.platform == "win32" + +skip_list = skip_dict["test_ops_xpu.py"] +if IS_WINDOWS: + skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_lnl["test_ops_xpu.py"] + +skip_options = "not " + skip_list[0] +for skip_case in skip_list[1:]: + skip_option = " and not " + skip_case + skip_options += skip_option + +os.environ["PYTORCH_TEST_WITH_SLOW"]="1" +test_command = ["-k", skip_options, "test_ops_xpu.py", "-v"] +res = pytest.main(test_command) +sys.exit(res) diff --git a/test/xpu/extended/run_test_with_skip_mtl.py b/test/xpu/extended/run_test_with_skip_mtl.py new file mode 100644 index 000000000..6ed39a64e --- /dev/null +++ b/test/xpu/extended/run_test_with_skip_mtl.py @@ -0,0 +1,22 @@ +import os +import pytest 
+import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_mtl import skip_dict as skip_dict_win_mtl + +IS_WINDOWS = sys.platform == "win32" + +skip_list = skip_dict["test_ops_xpu.py"] +if IS_WINDOWS: + skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_mtl["test_ops_xpu.py"] + +skip_options = "not " + skip_list[0] +for skip_case in skip_list[1:]: + skip_option = " and not " + skip_case + skip_options += skip_option + +os.environ["PYTORCH_TEST_WITH_SLOW"]="1" +test_command = ["-k", skip_options, "test_ops_xpu.py", "-v"] +res = pytest.main(test_command) +sys.exit(res) \ No newline at end of file diff --git a/test/xpu/extended/skip_list_arc.py b/test/xpu/extended/skip_list_arc.py index e1e701b84..c8e26ccf3 100644 --- a/test/xpu/extended/skip_list_arc.py +++ b/test/xpu/extended/skip_list_arc.py @@ -7,5 +7,21 @@ "test_compare_cpu_bincount_xpu_int64", "test_compare_cpu_bincount_xpu_int8", "test_compare_cpu_bincount_xpu_uint8", + # RuntimeError: Kernel is incompatible with all devices in devs + # https://github.com/intel/torch-xpu-ops/issues/1150 + "test_compare_cpu_logcumsumexp_xpu_float16", + "test_compare_cpu_logcumsumexp_xpu_float32", + "test_compare_cpu_nn_functional_pdist_xpu_float32", + "test_compare_cpu_tril_indices_xpu_int32", + "test_compare_cpu_tril_indices_xpu_int64", + "test_compare_cpu_triu_indices_xpu_int32", + "test_compare_cpu_triu_indices_xpu_int64", + "test_backward_logcumsumexp_xpu_float32", + "test_backward_nn_functional_pdist_xpu_float32", + "test_forward_ad_logcumsumexp_xpu_float32", + "test_operator_logcumsumexp_xpu_float32", + "test_operator_nn_functional_pdist_xpu_float32", + "test_view_replay_logcumsumexp_xpu_float32", + "test_view_replay_nn_functional_pdist_xpu_float32", ), } diff --git a/test/xpu/extended/skip_list_common.py b/test/xpu/extended/skip_list_common.py index 6b5fd653e..643d631eb 100644 --- a/test/xpu/extended/skip_list_common.py +++ b/test/xpu/extended/skip_list_common.py @@ -194,5 +194,9 @@ # Greatest absolute difference: 0.0625 at index (1,) (up to 0.001 allowed) # Greatest relative difference: 0.00640869140625 at index (1,) (up to 0.001 allowed) "test_compare_cpu_xlogy_xpu_bfloat16", + "test_compare_cpu_div_trunc_rounding_xpu_float64", + "test_compare_cpu_div_trunc_rounding_xpu_float16", + "test_compare_cpu_div_floor_rounding_xpu_float16", + "test_compare_cpu_div_floor_rounding_xpu_bfloat16", ), } diff --git a/test/xpu/extended/skip_list_win_bmg.py b/test/xpu/extended/skip_list_win_bmg.py new file mode 100644 index 000000000..2ee1dd31e --- /dev/null +++ b/test/xpu/extended/skip_list_win_bmg.py @@ -0,0 +1,13 @@ +skip_dict = { + "test_ops_xpu.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1173 + # Fatal Python error: Illegal instruction + "test_compare_cpu_grid_sampler_2d_xpu_float64", + "test_compare_cpu_cosh_xpu_complex64", + "test_compare_cpu_nn_functional_softshrink_xpu_bfloat16", + "test_compare_cpu_nn_functional_softshrink_xpu_float16", + "test_compare_cpu_nn_functional_softshrink_xpu_float32", + "test_compare_cpu_nn_functional_softshrink_xpu_float64", + "test_compare_cpu_square_xpu_complex128", + ), +} diff --git a/test/xpu/extended/skip_list_win_lnl.py b/test/xpu/extended/skip_list_win_lnl.py new file mode 100644 index 000000000..2ee1dd31e --- /dev/null +++ b/test/xpu/extended/skip_list_win_lnl.py @@ -0,0 +1,13 @@ +skip_dict = { + "test_ops_xpu.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1173 + # Fatal Python error: Illegal instruction + 
"test_compare_cpu_grid_sampler_2d_xpu_float64", + "test_compare_cpu_cosh_xpu_complex64", + "test_compare_cpu_nn_functional_softshrink_xpu_bfloat16", + "test_compare_cpu_nn_functional_softshrink_xpu_float16", + "test_compare_cpu_nn_functional_softshrink_xpu_float32", + "test_compare_cpu_nn_functional_softshrink_xpu_float64", + "test_compare_cpu_square_xpu_complex128", + ), +} diff --git a/test/xpu/extended/skip_list_win_mtl.py b/test/xpu/extended/skip_list_win_mtl.py new file mode 100644 index 000000000..8ec6baac6 --- /dev/null +++ b/test/xpu/extended/skip_list_win_mtl.py @@ -0,0 +1,51 @@ +skip_dict = { + # failed on MTL windows, skip first for Preci + "test_ops_xpu.py": ( + "test_compare_cpu_cosh_xpu_complex128", + "test_compare_cpu_frexp_xpu_bfloat16", + "test_compare_cpu_frexp_xpu_float16", + "test_compare_cpu_frexp_xpu_float32", + "test_compare_cpu_frexp_xpu_float64", + "test_compare_cpu_max_pool2d_with_indices_backward_xpu_bfloat16", + "test_compare_cpu_max_pool2d_with_indices_backward_xpu_float16", + "test_compare_cpu_max_pool2d_with_indices_backward_xpu_float32", + "test_compare_cpu_max_pool2d_with_indices_backward_xpu_float64", + "test_compare_cpu_nn_functional_avg_pool2d_xpu_bfloat16", + "test_compare_cpu_nn_functional_avg_pool3d_xpu_float32", + "test_compare_cpu_nn_functional_avg_pool3d_xpu_float64", + "test_compare_cpu_nn_functional_batch_norm_xpu_float16", + "test_compare_cpu_nn_functional_interpolate_bicubic_xpu_float32", + "test_compare_cpu_nn_functional_interpolate_bicubic_xpu_float64", + "test_compare_cpu_nn_functional_interpolate_bilinear_xpu_float32", + "test_compare_cpu_nn_functional_interpolate_bilinear_xpu_float64", + "test_compare_cpu_nn_functional_max_pool2d_xpu_bfloat16", + "test_compare_cpu_nn_functional_max_pool2d_xpu_float16", + "test_compare_cpu_nn_functional_max_pool2d_xpu_float32", + "test_compare_cpu_nn_functional_max_pool2d_xpu_float64", + "test_compare_cpu_norm_nuc_xpu_complex128", + "test_compare_cpu_norm_nuc_xpu_complex64", + "test_compare_cpu_norm_nuc_xpu_float32", + "test_compare_cpu_norm_nuc_xpu_float64", + "test_compare_cpu_sinh_xpu_complex128", + "test_compare_cpu_softmax_with_dtype_xpu_bfloat16", + "test_compare_cpu_softmax_with_dtype_xpu_complex128", + "test_compare_cpu_softmax_with_dtype_xpu_complex64", + "test_compare_cpu_softmax_with_dtype_xpu_float64", + "test_compare_cpu_softmax_with_dtype_xpu_int32", + "test_compare_cpu_softmax_with_dtype_xpu_int64", + "test_compare_cpu_softmax_with_dtype_xpu_uint8", + "test_compare_cpu_softmax_xpu_float64", + "test_compare_cpu_square_xpu_complex128", + "test_backward_norm_nuc_xpu_float32", + "test_cow_input_norm_nuc_xpu_float32", + "test_forward_ad_norm_nuc_xpu_float32", + "test_operator_norm_nuc_xpu_float32", + "test_view_replay_norm_nuc_xpu_float32", + "test_compare_cpu_nn_functional_avg_pool2d_xpu_float32", + "test_compare_cpu_nn_functional_avg_pool2d_xpu_float64", + "test_compare_cpu_softmax_with_dtype_xpu_bool", + "test_compare_cpu_softmax_with_dtype_xpu_float32", + "test_compare_cpu_softmax_with_dtype_xpu_int16", + "test_compare_cpu_softmax_with_dtype_xpu_int8", + ), +} diff --git a/test/xpu/run_test_with_skip_bmg.py b/test/xpu/run_test_with_skip_bmg.py new file mode 100644 index 000000000..9bd360296 --- /dev/null +++ b/test/xpu/run_test_with_skip_bmg.py @@ -0,0 +1,24 @@ +import os +import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_bmg import skip_dict as skip_dict_win_bmg +from xpu_test_utils import launch_test + + +res = 0 
+IS_WINDOWS = sys.platform == "win32" + +for key in skip_dict: + skip_list = skip_dict[key] + if IS_WINDOWS and key in skip_dict_win: + skip_list += skip_dict_win[key] + if IS_WINDOWS and key in skip_dict_win_bmg: + skip_list += skip_dict_win_bmg[key] + res += launch_test(key, skip_list) + +if os.name == "nt": + sys.exit(res) +else: + exit_code = os.WEXITSTATUS(res) + sys.exit(exit_code) \ No newline at end of file diff --git a/test/xpu/run_test_with_skip_lnl.py b/test/xpu/run_test_with_skip_lnl.py new file mode 100644 index 000000000..4413626ea --- /dev/null +++ b/test/xpu/run_test_with_skip_lnl.py @@ -0,0 +1,24 @@ +import os +import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_lnl import skip_dict as skip_dict_win_lnl +from xpu_test_utils import launch_test + + +res = 0 +IS_WINDOWS = sys.platform == "win32" + +for key in skip_dict: + skip_list = skip_dict[key] + if IS_WINDOWS and key in skip_dict_win: + skip_list += skip_dict_win[key] + if IS_WINDOWS and key in skip_dict_win_lnl: + skip_list += skip_dict_win_lnl[key] + res += launch_test(key, skip_list) + +if os.name == "nt": + sys.exit(res) +else: + exit_code = os.WEXITSTATUS(res) + sys.exit(exit_code) \ No newline at end of file diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py index 7c3aa7f8e..fdf481f9c 100644 --- a/test/xpu/skip_list_common.py +++ b/test/xpu/skip_list_common.py @@ -2,30 +2,62 @@ "test_ops_xpu.py": ( # Skip list of base line - # Need to revisit when the ops are enabled - # AssertionError: The supported dtypes for xxx on device type xpu are incorrect! + # XPU implementation doesn't claimn FP8 now + # https://github.com/intel/torch-xpu-ops/issues/461 + "float8", + + # workarounds for the following tests + # https://github.com/intel/torch-xpu-ops/issues/1214 + "test_python_ref__refs_exp_xpu_complex128", + "test_python_ref__refs_sigmoid_xpu_complex128", + "test_python_ref_executor__refs_log2_executor_aten_xpu_complex128", + "test_python_ref_executor__refs_exp_executor_aten_xpu_complex128", + "test_python_ref_torch_fallback__refs_log2_xpu_complex128", + "test_python_ref_torch_fallback__refs_log10_xpu_complex128", + "test_python_ref_torch_fallback__refs_sigmoid_xpu_complex128", + "test_python_ref_executor__refs_log10_executor_aten_xpu_complex128", + "test_noncontiguous_samples_histogram_xpu_float32", + + # TODO: Fix the following tests + "test_out_warning_torch__scaled_mm_xpu", + + # To be removed from this file. + # CUDA and XPU both XFAIL now. + "test_out_narrow_copy_xpu_float32", + # This case is marked as skip but XPU failed. However, CUDA and XPU throw the same runtime error. + "test_out_histc_xpu_float32", + + # AssertionError: The supported dtypes for __rmod__ on device type xpu are incorrect! + # The following dtypes worked in forward but are not listed by the OpInfo: {torch.int8, torch.int16, torch.int32, torch.int64, torch.uint8}. "test_dtypes___rmod___xpu", + + # Data type is not supported in oneDNN! "test_dtypes_nn_functional_conv1d_xpu", "test_dtypes_nn_functional_conv2d_xpu", "test_dtypes_nn_functional_conv3d_xpu", "test_dtypes_nn_functional_conv_transpose1d_xpu", "test_dtypes_nn_functional_conv_transpose2d_xpu", "test_dtypes_nn_functional_conv_transpose3d_xpu", + + # AssertionError: The supported dtypes for nn.functional.softsign on device type xpu are incorrect! "test_dtypes_nn_functional_softsign_xpu", + + # AssertionError: The supported dtypes for sparse.sampled_addmm on device type xpu are incorrect! 
- OPs not supported "test_dtypes_sparse_sampled_addmm_xpu", - # AssertionError: RuntimeError not raised + + # OPs not supported "test_errors_dot_xpu", - "test_errors_kthvalue_xpu", "test_errors_vdot_xpu", - # Fallback cases with skipCPUIfNoLapack, AssertionError: Tensor-likes are not close! + + # Linalg OPs not supported "test_noncontiguous_samples_linalg_det_xpu_float32", "test_noncontiguous_samples_linalg_slogdet_xpu_float32", "test_noncontiguous_samples_linalg_solve_ex_xpu_float32", "test_noncontiguous_samples_linalg_solve_xpu_float32", "test_noncontiguous_samples_linalg_tensorsolve_xpu_float32", "test_noncontiguous_samples_logdet_xpu_float32", - "test_noncontiguous_samples_nn_functional_conv3d_xpu_complex64", + # Sparse CSR OPs not supported # RuntimeError: device type of values (xpu) must be CPU or CUDA or Meta # https://github.com/intel/torch-xpu-ops/issues/357 "test_compare_cpu_sparse_sampled_addmm_xpu_float32", @@ -51,6 +83,7 @@ "test_noncontiguous_samples_nn_functional_conv1d_xpu_int64", "test_noncontiguous_samples_nn_functional_conv2d_xpu_int64", + # Linalg OPs not supported # RuntimeError: mode only supports CPU AND CUDA device type, got: xpu # Issue https://github.com/intel/torch-xpu-ops/issues/327 "test_numpy_ref_linalg_tensorinv_xpu_float64", @@ -62,19 +95,20 @@ "test_variant_consistency_eager_nn_functional_conv_transpose3d_xpu_complex64", "test_variant_consistency_eager_nn_functional_conv_transpose3d_xpu_float32", - # Need revisit when the op is enabled - # Unexpected success, xpu passed because it compares to cpu + # Linalg OPs not supported "test_compare_cpu_linalg_lu_factor_ex_xpu_float32", "test_compare_cpu_linalg_lu_factor_xpu_float32", "test_compare_cpu_linalg_lu_xpu_float32", + + # XPU hang. CUDA hang as well. + # https://github.com/pytorch/pytorch/issues/79528 "test_compare_cpu_special_hermite_polynomial_h_xpu_float32", - # XFAIL of CUDA and XPU, unexpected success in fallback + # XFAIL of CUDA and XPU, unexpected success in fallback + # Linalg OPs not supported "test_out_cholesky_inverse_xpu_float32", "test_out_geqrf_xpu_float32", - "test_out_narrow_copy_xpu_float32", "test_out_ormqr_xpu_float32", - "test_out_histc_xpu_float32", # XFAIL of CUDA, XPU got unexpected success "test_python_ref__refs_div_no_rounding_mode_xpu_complex32", @@ -87,6 +121,7 @@ "test_python_ref_torch_fallback__refs_pow_xpu_complex32", # unexpected success because of cpu fallback + # Linalg OPs not supported "test_out_triangular_solve_xpu_float32", # Newly added: @@ -107,15 +142,17 @@ "_jiterator_", # https://github.com/intel/torch-xpu-ops/issues/157 # Segfault: - "test_dtypes_nn_functional_linear_xpu", # https://github.com/intel/torch-xpu-ops/issues/157 "test_dtypes_nn_functional_multi_head_attention_forward_xpu", # https://github.com/intel/torch-xpu-ops/issues/157 + + # Linalg OPs not supported "test_dtypes_pca_lowrank_xpu", # https://github.com/intel/torch-xpu-ops/issues/157 "test_dtypes_svd_lowrank_xpu", # https://github.com/intel/torch-xpu-ops/issues/157 + + # RuntimeError: Long is not supported in oneDNN! 
"test_noncontiguous_samples_nn_functional_linear_xpu_int64", # https://github.com/intel/torch-xpu-ops/issues/157 + # https://github.com/intel/torch-xpu-ops/issues/157 - # Failures: - "test_compare_cpu_addmm_xpu_float32", - "test_compare_cpu_addmv_xpu_float32", + # Datatype not supported in oneDNN "test_dtypes_addmm_decomposed_xpu", "test_dtypes_addmm_xpu", "test_dtypes_addmv_xpu", @@ -395,11 +432,13 @@ "test_variant_consistency_eager_svd_xpu_complex64", "test_variant_consistency_eager_tensordot_xpu_complex64", "test_variant_consistency_eager_triangular_solve_xpu_complex64", + # oneDNN issues # RuntimeError: value cannot be converted to type float without overflow # https://github.com/intel/torch-xpu-ops/issues/683 "test_conj_view_addbmm_xpu_complex64", "test_neg_conj_view_addbmm_xpu_complex128", + ### Error #0 in TestMathBitsXPU , RuntimeError: Double and complex datatype matmul is not supported in oneDNN # https://github.com/intel/torch-xpu-ops/issues/254 "test_conj_view___rmatmul___xpu_complex64", @@ -609,32 +648,34 @@ "test_conj_view_svd_lowrank_xpu_complex64", "test_neg_conj_view_pca_lowrank_xpu_complex128", "test_neg_conj_view_svd_lowrank_xpu_complex128", + + # oneDNN issues ### Error #1 in TestMathBitsXPU , RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive # https://github.com/intel/torch-xpu-ops/issues/253 "test_conj_view_nn_functional_conv_transpose2d_xpu_complex64", "test_conj_view_nn_functional_conv_transpose3d_xpu_complex64", "test_neg_view_nn_functional_conv_transpose2d_xpu_float64", "test_neg_view_nn_functional_conv_transpose3d_xpu_float64", - # Op impl aligns with CUDA on the supported dtypes. - # RuntimeError: "avg_pool2d_xpu" not implemented for 'Long'. - # Retrieve the case, once avg_pool1d is supported. Test infra will change claimed dtypes in test case once the op is listed - # in XPU supported operators. Then the case will work. - "test_noncontiguous_samples_nn_functional_avg_pool1d_xpu_int64", - "test_noncontiguous_samples_nn_functional_local_response_norm_xpu_int64", - - # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16. - # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error. 
- #"test_dtypes_polar_xpu", + # implemented aten::histogram to align MPS operators coverage, CUDA doesn't support # but test_dtypes infrastructure leverage CUDA supported datatypes "test_dtypes_histogram_xpu", - # Unexpected success, CUDA got XFAIL because CUDA does not have historgramadd supported" + # Unexpected success, CUDA got XFAIL because CUDA does not have historgramadd supported "test_errors_histogramdd_xpu", # 2025 bundle std::pow complex result is different on host and device "test_python_ref__refs_square_xpu_complex64", "test_python_ref_torch_fallback__refs_square_xpu_complex64", + "test_python_ref_torch_fallback__refs_exp_xpu_complex128", + + # Failed on rolling driver, passed on preci + "test_python_ref__refs_div_trunc_rounding_xpu_float64", + "test_python_ref_executor__refs_div_trunc_rounding_executor_aten_xpu_float64", + "test_python_ref_torch_fallback__refs_div_trunc_rounding_xpu_float64", + + # TODO: passed from source code building version, investigate + "test_python_ref__refs_log2_xpu_complex128", ), "test_binary_ufuncs_xpu.py": ( @@ -661,7 +702,7 @@ "test_autograd_fallback_xpu.py": None, - "test_sort_and_select_xpu.py": ("test_sort_large_slice_xpu",), # Hard code CUDA + "test_sort_and_select_xpu.py": ("test_sort_large_slice_xpu",), # Hard code CUDA, UT has already been rewritten to test/regressions/test_sort.py. "nn/test_embedding_xpu.py": ( # NotImplementedError: Could not run 'aten::_indices' with arguments from the 'SparseXPU' backend. @@ -713,8 +754,12 @@ "test_disable_fastpath_xpu", # We have no mechanism to handle SDPBackend::ERROR so far. Will give a fully support when we support all SDPBackends. "test_dispatch_fails_no_backend_xpu", + + # NestedTensorXPU not supported # Could not run 'aten::_to_copy' with arguments from the 'NestedTensorXPU' backend "test_with_nested_tensor_input_xpu", + + # oneDNN issues # Double and complex datatype matmul is not supported in oneDNN # https://github.com/intel/torch-xpu-ops/issues/253 "test_sdp_math_gradcheck_contiguous_inputs_False_xpu", @@ -920,33 +965,7 @@ "test_cpu_gpu_parity_nn_ConvTranspose2d_xpu_complex32", # CPU fallback fails # RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead. - "test_save_load_nn_GRU_eval_mode_xpu_float32", - "test_save_load_nn_GRUCell_xpu_float32", - "test_save_load_nn_GRU_train_mode_xpu_float32", - # Could not run 'aten::_thnn_fused_lstm_cell' with arguments from the 'CPU' backend. - "_LSTM_", - "_LSTMCell_", - # CPU fallback fails - # Could not run 'aten::_thnn_fused_gru_cell' with arguments from the 'CPU' backend. 
- "test_to_nn_GRUCell_swap_True_set_grad_False_xpu_float32", - "test_to_nn_GRU_eval_mode_swap_True_set_grad_False_xpu_float32", - "test_to_nn_GRU_train_mode_swap_True_set_grad_False_xpu_float32 ", - "test_cpu_gpu_parity_nn_GRUCell_xpu_float32", - "test_cpu_gpu_parity_nn_GRU_eval_mode_xpu_float32", - "test_cpu_gpu_parity_nn_GRU_train_mode_xpu_float32", - "test_forward_nn_GRUCell_xpu_float32", - "test_forward_nn_GRU_eval_mode_xpu_float32", - "test_forward_nn_GRU_train_mode_xpu_float32", - "test_if_train_and_eval_modes_differ_nn_GRUCell_xpu_float32", - "test_memory_format_nn_GRUCell_xpu_float32", - "test_memory_format_nn_GRU_eval_mode_xpu_float32", - "test_memory_format_nn_GRU_train_mode_xpu_float32", - "test_multiple_device_transfer_nn_GRUCell_xpu_float32", - "test_multiple_device_transfer_nn_GRU_eval_mode_xpu_float32", - "test_multiple_device_transfer_nn_GRU_train_mode_xpu_float32", - "test_non_contiguous_tensors_nn_GRUCell_xpu_float32", - "test_non_contiguous_tensors_nn_GRU_eval_mode_xpu_float32", - "test_non_contiguous_tensors_nn_GRU_train_mode_xpu_float32", + # AssertionError: False is not true "test_to_nn_BatchNorm1d_eval_mode_swap_True_set_grad_True_xpu_float32", "test_to_nn_BatchNorm1d_train_mode_swap_True_set_grad_True_xpu_float32", @@ -991,6 +1010,7 @@ "test_type", # rnn fallback to cpu "test_cudnn_weight_format", + # oneDNN issues # AssertionError: MultiheadAttention does not support NestedTensor outside of its fast path. The fast path was not hit because some Tensor argument's device is neither one of cpu, cuda or privateuseone "test_TransformerEncoderLayer_empty_xpu", "test_transformerencoderlayer_xpu_float16", @@ -1015,12 +1035,8 @@ "test_rnn_retain_variables_xpu_float64", "test_transformerencoderlayer_xpu_float64", "test_variable_sequence_xpu_float64", - # AssertionError: RuntimeError not raised - "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_3_mode_bicubic_uint8_xpu_uint8", - "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_3_mode_bilinear_uint8_xpu_uint8", - "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_5_mode_bicubic_uint8_xpu_uint8", - "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_5_mode_bilinear_uint8_xpu_uint8", - # upsamplingNearest2d: Failed: Unexpected success + # Unexpected success: CUDA only test case, launch grid_y == 2**16 (larger than CUDA maximum y-dimension limit 65535) and expect fail. + # SYCL don't have this limitation and hence can pass. "test_upsamplingNearest2d_launch_fail_xpu", # Could not run 'aten::_thnn_fused_lstm_cell' with arguments from the 'CPU' backend. "test_RNN_cudnn_weight_norm", @@ -1040,13 +1056,6 @@ ), "test_indexing_xpu.py": ( - # CPU bias cases - # It is kernel assert on XPU implementation not exception on host. - # We are same as CUDA implementation. And CUDA skips these cases. - "test_trivial_fancy_out_of_bounds_xpu", - # index boundary should be checked. - # https://github.com/intel/torch-xpu-ops/issues/783 - "test_advancedindex_xpu_float64", # XPU implementation doesn't claimn FP8 now # https://github.com/intel/torch-xpu-ops/issues/461 "test_index_put_src_datatype_xpu_float8_e5m2", @@ -1104,8 +1113,6 @@ # Sometimes, will raise AssertionError: "Simulate error" does not match "grad can be implicitly created only for scalar outputs" # https://github.com/intel/torch-xpu-ops/issues/1071 "test_reentrant_parent_error_on_cpu_xpu", - # Could not run 'aten::_thnn_fused_lstm_cell' with arguments from the 'CPU' backend. 
- "test_rnn_backward_to_input_but_not_parameters_xpu", ), "test_reductions_xpu.py": ( @@ -1116,8 +1123,6 @@ "test_unary_ufuncs_xpu.py": ( # AssertionError: Jiterator is only supported on CUDA and ROCm GPUs, none are available. "_jiterator_", - # CPU Fallback fails: Tensor-likes are not close! - "test_reference_numerics_large_tanh_xpu_complex32", # For extreme value processing, Numpy and XPU results are inconsistent # std operations get different behavior on std::complex operarands for extremal cases "test_reference_numerics_extremal__refs_log_xpu_complex64", @@ -1158,7 +1163,8 @@ # Greatest relative difference: 1.9145216356264427e-05 at index (463, 204) (up to 1.3e-06 allowed) "test_reference_numerics_normal__refs_asinh_xpu_complex64", "test_reference_numerics_normal_asinh_xpu_complex64", - # Failed: Unexpected success + "test_batch_vs_slicing__refs_sigmoid_xpu_complex128", + # Unexpected success: CUDA uses thrust::sqrt and has accuracy issue. XPU use std::sqrt and has no issue. "test_reference_numerics_large_rsqrt_xpu_complex32", # Numeric difference # https://github.com/intel/torch-xpu-ops/issues/544 @@ -1178,10 +1184,6 @@ # CUDA XFAIL "test_reference_numerics_large__refs_rsqrt_xpu_complex32", - # Compiler issue in handling tanh with real or imag inf. - # https://github.com/intel/torch-xpu-ops/issues/184, https://jira.devtools.intel.com/browse/CMPLRLIBS-34974 - "test_reference_numerics_large__refs_tanh_xpu_complex32", - # 2025 bundle std::pow complex result is different on host and device "test_exp_xpu_complex64", "test_reference_numerics_extremal__refs_exp2_xpu_complex64", @@ -1193,6 +1195,8 @@ ), "test_masked_xpu.py": ( + # Summary: Sparse CSR for XPU is not supported + # NotImplementedError: Could not run 'aten::_to_sparse_csr' with arguments from the 'SparseXPU' backend. # https://github.com/intel/torch-xpu-ops/issues/357 "test_mask_layout_sparse_coo_masked_amax_xpu_bfloat16", @@ -1329,6 +1333,9 @@ "nn/test_lazy_modules_xpu.py": None, "test_linalg_xpu.py": ( + # Summary: + # All linear algebra related ops are not supported for XPU. + # _convert_weight_to_int4pack not support "_int4_mm_m_", # RuntimeError: Double and complex datatype matmul is not supported in oneDNN @@ -1535,6 +1542,8 @@ # XPU does not support tunable. "test_bmm_tunableop_rocm_xpu_float32", "test_numeric_check_leak_tunableop_rocm_xpu_float32", + "test_dump_results_on_exit_tunableop_xpu_float32", + "test_rotating_buffer_tunableop_xpu_float32", # CUDA bias cases added in latest PyTorch # AttributeError: module 'torch._C' has no attribute '_cuda_tunableop_enable' "test_matmul_check_entries_tunableop_xpu_float16", @@ -1580,6 +1589,8 @@ ), "test_ops_fwd_gradients_xpu.py": ( + # All of the followings are oneDNN issues + # RuntimeError: Double and complex datatype matmul is not supported in oneDNN "test_fn_fwgrad_bwgrad___rmatmul___xpu_complex128", "test_fn_fwgrad_bwgrad___rmatmul___xpu_float64", @@ -1884,6 +1895,8 @@ ), "test_maskedtensor_xpu.py": ( + # Summary: SparseCsrXPU OPs are not supported + # NotImplementedError: Could not run 'aten::_to_sparse_csr' with arguments from the 'SparseXPU' backend. 
# https://github.com/intel/torch-xpu-ops/issues/357 "test_to_dense_xpu", @@ -1987,13 +2000,12 @@ # ACTUAL: array([-1.108163e+12, 1.108163e+12], dtype=float32) # DESIRED: array([-1.108163e+12, 1.090847e+12], dtype=float32) "test_fq_module_per_tensor_xpu", - # AssertionError: False is not true : Expected dScale=tensor([-0.0173], device='xpu:0') to match scale.grad=tensor([0.0189], device='xpu:0') - "test_learnable_backward_per_channel_cuda_xpu", ), "quantization/core/test_workflow_module_xpu.py": None, "quantization/core/test_quantized_tensor_xpu.py": ( + # Summary: Quantized OPs are not supported for XPU # NotImplementedError: Could not run 'aten::dequantize.self' with arguments from the 'QuantizedXPU' backend "test_compare_per_channel_device_numerics_xpu", # NotImplementedError: Could not run 'aten::dequantize.self' with arguments from the 'QuantizedXPU' backend. @@ -2022,6 +2034,8 @@ ), "test_ops_gradients_xpu.py": ( + # All are oneDNN issues + ### Error #0 in TestBwdGradientsXPU , totally 271 , RuntimeError: Double and complex datatype matmul is not supported in oneDNN "test_fn_grad___rmatmul___xpu_complex128", "test_fn_grad___rmatmul___xpu_float64", @@ -2297,11 +2311,13 @@ "test_fn_gradgrad_pca_lowrank_xpu_complex128", "test_fn_gradgrad_svd_lowrank_xpu_complex128", "test_fn_grad_linalg_norm_xpu_complex128", + ### Error #1 in TestBwdGradientsXPU , totally 4 , RuntimeError: value cannot be converted to type float without overflow "test_fn_grad_addbmm_xpu_complex128", "test_fn_gradgrad_addbmm_xpu_complex128", "test_inplace_grad_addbmm_xpu_complex128", "test_inplace_gradgrad_addbmm_xpu_complex128", + ### Error #4 in TestBwdGradientsXPU , totally 8 , RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive "test_fn_grad_nn_functional_conv_transpose2d_xpu_complex128", "test_fn_grad_nn_functional_conv_transpose2d_xpu_float64", @@ -2322,6 +2338,7 @@ ), "test_torch_xpu.py": ( + # 'torch.xpu' has no attribute ... 
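The test_masked/test_maskedtensor skips are summarized above as missing Sparse CSR support on XPU ('aten::_to_sparse_csr' is not implemented for the SparseXPU backend). A guarded sketch of the conversion those tests rely on, assuming an XPU device; depending on the build it may fail at the COO-to-CSR step or earlier:

import torch

if torch.xpu.is_available():
    dense = torch.eye(4, device="xpu")
    try:
        csr = dense.to_sparse().to_sparse_csr()
        print(csr.layout)
    except (NotImplementedError, RuntimeError) as err:
        print(f"CSR conversion unsupported on XPU: {err}")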
### Error #1 in TestTorchDeviceTypeXPU , totally 2 , AttributeError: module 'torch.xpu' has no attribute 'FloatTensor' "test_grad_scaling_state_dict_xpu", ### Error #2 in TestTorchDeviceTypeXPU , totally 1 , AttributeError: 'torch.storage.TypedStorage' object has no attribute 'is_xpu' @@ -2331,6 +2348,7 @@ ### Error #4 in TestTorchDeviceTypeXPU , totally 4 , AttributeError: module 'torch.xpu' has no attribute 'FloatStorage' "test_storage_setitem_xpu_float32", "test_tensor_storage_type_xpu_float32", + ### Error #7 in TestTorchDeviceTypeXPU , totally 1 , TypeError: map2_ is only implemented on CPU tensors "test_broadcast_fn_map2_xpu", ### Error #8 in TestTorchDeviceTypeXPU , totally 1 , TypeError: map_ is only implemented on CPU tensors @@ -2346,16 +2364,8 @@ "test_sync_warning_xpu", ### Error #19 in TestTorchDeviceTypeXPU , totally 1 , RuntimeError: _share_fd_: only available on CPU "test_module_share_memory_xpu", - ### Error #23 in TestTorchDeviceTypeXPU , totally 26 , AssertionError: RuntimeError not raised : expected a non-deterministic error, but it was not raised - "test_nondeterministic_alert_AdaptiveAvgPool2d_xpu", - "test_nondeterministic_alert_CTCLoss_xpu", - "test_nondeterministic_alert_EmbeddingBag_max_xpu", - "test_nondeterministic_alert_MaxPool3d_xpu", - "test_nondeterministic_alert_NLLLoss_xpu", - "test_nondeterministic_alert_interpolate_bilinear_xpu", - "test_nondeterministic_alert_put_accumulate_xpu", - ### Error #24 in TestTorchDeviceTypeXPU , totally 1 , AttributeError: 'TestTorchDeviceTypeXPU' object has no attribute 'check_device_nondeterministic_alert' - "test_nondeterministic_alert_AvgPool3d_xpu", + + # 'torch.xpu' has no attribute ... ### Error #30 in TestTorchDeviceTypeXPU , totally 2 , AttributeError: module 'torch.xpu' has no attribute 'BoolStorage' "test_storage_setitem_xpu_bool", "test_tensor_storage_type_xpu_bool", @@ -2384,11 +2394,7 @@ "test_tensor_storage_type_xpu_bfloat16", ### Error #39 in TestTorchDeviceTypeXPU , totally 1 , AttributeError: module 'torch.xpu' has no attribute 'HalfStorage' "test_tensor_storage_type_xpu_float16", - ### Error #40 in TestTorchDeviceTypeXPU , totally 1 , FAILED test_torch_xpu.py::TestTorch::test_index_add - RuntimeError: expected ... - "test_tensor_storage_type_xpu_uint8", - ### Error #41 in TestTorchDeviceTypeXPU , totally 1 , FAILED test_torch_xpu.py::TestTorch::test_print - AttributeError: module 'tor... - "test_tensor_storage_type_xpu_uint8", - ### Error #42 in TestTorchDeviceTypeXPU , totally 1 , FAILED test_torch_xpu.py::TestTorch::test_storage_error - AttributeError: 'to... + ### Module 'torch.xpu' has no attribute 'ByteStorage' "test_tensor_storage_type_xpu_uint8", # issue 302 , 8 "test_print", @@ -2420,6 +2426,7 @@ ), "test_native_mha_xpu.py": ( + # NestedTensorXPU related OPs # NotImplementedError: Could not run 'aten::_native_multi_head_attention' with arguments from the 'NestedTensorXPU' backend. "test_native_multihead_self_attention_use_nt_False_use_padding_True_pad_all_False_need_weights_False_average_attn_weights_False_fused_False_xpu_float16", "test_native_multihead_self_attention_use_nt_False_use_padding_True_pad_all_False_need_weights_False_average_attn_weights_False_fused_False_xpu_float32", @@ -2476,6 +2483,7 @@ ), "nn/test_convolution_xpu.py": ( + # Summary: all of them are oneDNN related issues # XPU unsupport ops, skip. 
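Most of the test_torch_xpu.py skips above reduce to one gap: the legacy typed tensor/storage aliases that exist under torch.cuda (FloatTensor, FloatStorage, BoolStorage, ...) have no torch.xpu counterparts yet. A quick probe that runs on any recent PyTorch build, using the attribute names taken from the error messages above:

import torch

for name in ("FloatTensor", "FloatStorage", "BoolStorage", "HalfStorage", "ByteStorage"):
    print(f"{name}: cuda={hasattr(torch.cuda, name)} xpu={hasattr(torch.xpu, name)}")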
# https://github.com/intel/torch-xpu-ops/issues/348 "test_cudnn_convolution_relu_xpu_float16", @@ -2507,7 +2515,6 @@ # https://github.com/intel/torch-xpu-ops/issues/774 "_jiterator_", - # RuntimeError: Short is not supported in oneDNN! Need oneDNN's support, suggest to keep skip. "test_dispatch_meta_outplace_nn_functional_linear_xpu_int16", "test_dispatch_symbolic_meta_outplace_nn_functional_linear_xpu_int16", @@ -2519,7 +2526,6 @@ "test_meta_outplace_nn_functional_linear_xpu_int64", # RuntimeError: Double and complex datatype matmul is not supported in oneDNN - "test_dispatch_meta_inplace_addbmm_xpu_complex", "test_dispatch_meta_outplace_addbmm_xpu_complex", "test_dispatch_symbolic_meta_inplace_addbmm_xpu_complex", @@ -3254,7 +3260,10 @@ "test_type_promotion_xpu.py": None, - "test_distributions_xpu.py": None, + "test_distributions_xpu.py": ( + # TODO: Passed on lts driver version, but failed on rolling driver version + "test_gamma_gpu_sample_xpu", + ), "test_optim_xpu.py": ( # oneDNN issues diff --git a/test/xpu/skip_list_win_bmg.py b/test/xpu/skip_list_win_bmg.py new file mode 100644 index 000000000..a91d4f4a5 --- /dev/null +++ b/test/xpu/skip_list_win_bmg.py @@ -0,0 +1,39 @@ +skip_dict = { + # tensor(0.-0.j, device='xpu:0', dtype=torch.complex32) tensor(nan+nanj, device='xpu:0', dtype=torch.complex32) (1.5707964+0j) + "test_unary_ufuncs_xpu.pyy": ( + "test_reference_numerics_small_acos_xpu_complex32", + "test_reference_numerics_small_asin_xpu_complex32", + "test_reference_numerics_small_asinh_xpu_complex32", + "test_reference_numerics_small_atan_xpu_complex32", + "test_reference_numerics_small_atanh_xpu_complex32", + # Need to check compiler std::sin() on inf+infj + "test_reference_numerics_extremal__refs_sin_xpu_complex128", + "test_reference_numerics_extremal__refs_sin_xpu_complex64", + "test_reference_numerics_extremal_nn_functional_tanhshrink_xpu_complex128", + "test_reference_numerics_extremal_nn_functional_tanhshrink_xpu_complex64", + "test_reference_numerics_extremal_sin_xpu_complex128", + "test_reference_numerics_extremal_sin_xpu_complex64", + "test_reference_numerics_extremal_sinh_xpu_complex128", + "test_reference_numerics_extremal_sinh_xpu_complex64", + "test_reference_numerics_large__refs_sin_xpu_complex32", + "test_reference_numerics_large_sin_xpu_complex32", + # Known issue of exp accuracy + # tensor(13437.7000-501.j, device='xpu:0', dtype=torch.complex128) tensor(inf+infj, device='xpu:0', dtype=torch.complex128) (-inf+infj) + "test_reference_numerics_large__refs_exp_xpu_complex128", + "test_reference_numerics_large_exp_xpu_complex128", + "test_reference_numerics_small_exp_xpu_complex32", + ":test_reference_numerics_normal_special_i1_xpu_float32", + "test_reference_numerics_normal_sigmoid_xpu_complex32", + "test_reference_numerics_small_sigmoid_xpu_complex32", + ), + # https://github.com/intel/torch-xpu-ops/issues/1171 + # AssertionError: 'Assertion maxind >= 0 && maxind < outputImageSize failed' not found in '\nAssertHandler::printMessage\n' : The expected error was not found + "nn\test_pooling_xpu.py": ( + "test_MaxUnpool_index_errors_case1_xpu", + "test_MaxUnpool_index_errors_case2_xpu", + "test_MaxUnpool_index_errors_case4_xpu", + "test_MaxUnpool_index_errors_case6_xpu", + "test_MaxUnpool_index_errors_case7_xpu", + "test_MaxUnpool_index_errors_case9_xpu", + ), +} diff --git a/test/xpu/skip_list_win_lnl.py b/test/xpu/skip_list_win_lnl.py new file mode 100644 index 000000000..a9e8bfc3f --- /dev/null +++ b/test/xpu/skip_list_win_lnl.py @@ -0,0 +1,38 @@ +skip_dict = { + # 
tensor(0.-0.j, device='xpu:0', dtype=torch.complex32) tensor(nan+nanj, device='xpu:0', dtype=torch.complex32) (1.5707964+0j) + "test_unary_ufuncs_xpu.pyy": ( + "test_reference_numerics_small_acos_xpu_complex32", + "test_reference_numerics_small_asin_xpu_complex32", + "test_reference_numerics_small_asinh_xpu_complex32", + "test_reference_numerics_small_atan_xpu_complex32", + "test_reference_numerics_small_atanh_xpu_complex32", + # Need to check compiler std::sin() on inf+infj + "test_reference_numerics_extremal__refs_sin_xpu_complex128", + "test_reference_numerics_extremal__refs_sin_xpu_complex64", + "test_reference_numerics_extremal_nn_functional_tanhshrink_xpu_complex128", + "test_reference_numerics_extremal_nn_functional_tanhshrink_xpu_complex64", + "test_reference_numerics_extremal_sin_xpu_complex128", + "test_reference_numerics_extremal_sin_xpu_complex64", + "test_reference_numerics_extremal_sinh_xpu_complex128", + "test_reference_numerics_extremal_sinh_xpu_complex64", + "test_reference_numerics_large__refs_sin_xpu_complex32", + "test_reference_numerics_large_sin_xpu_complex32", + # Known issue of exp accuracy + # tensor(13437.7000-501.j, device='xpu:0', dtype=torch.complex128) tensor(inf+infj, device='xpu:0', dtype=torch.complex128) (-inf+infj) + "test_reference_numerics_large__refs_exp_xpu_complex128", + "test_reference_numerics_large_exp_xpu_complex128", + "test_reference_numerics_small_exp_xpu_complex32", + ":test_reference_numerics_normal_special_i1_xpu_float32", + "test_reference_numerics_normal_sigmoid_xpu_complex32", + ), + # https://github.com/intel/torch-xpu-ops/issues/1171 + # AssertionError: 'Assertion maxind >= 0 && maxind < outputImageSize failed' not found in '\nAssertHandler::printMessage\n' : The expected error was not found + "nn\test_pooling_xpu.py": ( + "test_MaxUnpool_index_errors_case1_xpu", + "test_MaxUnpool_index_errors_case2_xpu", + "test_MaxUnpool_index_errors_case4_xpu", + "test_MaxUnpool_index_errors_case6_xpu", + "test_MaxUnpool_index_errors_case7_xpu", + "test_MaxUnpool_index_errors_case9_xpu", + ), +} diff --git a/test/xpu/test_decomp_xpu.py b/test/xpu/test_decomp_xpu.py index d659197d9..2e39ca90d 100644 --- a/test/xpu/test_decomp_xpu.py +++ b/test/xpu/test_decomp_xpu.py @@ -39,6 +39,7 @@ def _op_assert_ref(test_case, op, test_dtype, i, orig, decomp, ref, args, kwargs (torch.float16, torch.ops.aten.nll_loss_forward.default): 1e-2, (torch.bfloat16, torch.ops.aten.nll_loss_forward.default): 1e-1, (torch.float16, torch.ops.aten.nll_loss2d_forward.default): 1e-2, + (torch.float16, torch.ops.aten.nll_loss2d_backward.default): 1e-4, (torch.bfloat16, torch.ops.aten.nll_loss2d_forward.default): 2e-1, (torch.float16, torch.ops.aten.hardswish.default): 2e-7, (torch.bfloat16, torch.ops.aten.hardswish.default): 2e-7, diff --git a/test/xpu/test_indexing_xpu.py b/test/xpu/test_indexing_xpu.py index d57567318..b4299789e 100644 --- a/test/xpu/test_indexing_xpu.py +++ b/test/xpu/test_indexing_xpu.py @@ -13,6 +13,7 @@ from test_indexing import NumpyTests,TestIndexing import torch + torch.Tensor.is_cuda = torch.Tensor.is_xpu def __test_index_put_accumulate_with_optional_tensors(self, device): # TODO: replace with a better solution. 
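skip_list_win_bmg.py and skip_list_win_lnl.py added above follow the same shape as the existing skip lists: a dict mapping a test file to a tuple of test names to exclude on that Windows platform. The repository's run scripts consume these dicts; the snippet below is only a hypothetical sketch of that consumption (build_pytest_commands and the "-k not ..." deselection strategy are assumptions, not the actual runner):

from skip_list_win_lnl import skip_dict  # the new per-platform skip list

def build_pytest_commands(skip_dict):
    """Turn {test_file: (test_name, ...)} into pytest invocations that
    deselect the listed tests via a '-k' expression."""
    commands = []
    for test_file, skipped in skip_dict.items():
        cmd = ["pytest", "-v", test_file]
        if skipped:
            cmd += ["-k", " and ".join(f"not {name}" for name in skipped)]
        commands.append(cmd)
    return commands

for cmd in build_pytest_commands(skip_dict):
    print(" ".join(cmd[:3]), "...")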
diff --git a/test/xpu/test_torch_xpu.py b/test/xpu/test_torch_xpu.py index 9c54ffdcc..8dce5989c 100644 --- a/test/xpu/test_torch_xpu.py +++ b/test/xpu/test_torch_xpu.py @@ -1439,8 +1439,10 @@ def test_nondeterministic_alert_AvgPool3d(self, device): res = module(input) grad = torch.ones_like(res) - self.check_device_nondeterministic_alert(grad, 'avg_pool3d_backward') - + self.check_nondeterministic_alert( + lambda: res.backward(grad, retain_graph=True), + 'avg_pool3d_backward_' + torch.device(device).type, + torch.device(device).type == 'cuda' or torch.device(device).type == 'xpu') @skipIfMPS @skipIfTorchInductor("https://github.com/pytorch/pytorch/issues/113707") @@ -1478,7 +1480,7 @@ def test_nondeterministic_alert_MaxPool3d(self, device): self.check_nondeterministic_alert( lambda: res.backward(grad, retain_graph=True), - 'max_pool3d_with_indices_backward' + torch.device(device).type, + 'max_pool3d_with_indices_backward_' + torch.device(device).type, torch.device(device).type == 'cuda' or torch.device(device).type == 'xpu') @skipIfMPS @@ -1770,10 +1772,9 @@ def test_nondeterministic_alert_NLLLoss(self, device): input = torch.randn(2, 3, 5, 5, device=device) target = torch.rand(2, 5, 5, device=device).mul(3).floor().long() - self.check_nondeterministic_alert( lambda: module(input, target), - 'nll_loss2d_forward_out_' + torch.device(device).type + '_template', + 'nll_loss2d_forward_' + torch.device(device).type, torch.device(device).type == 'cuda' or torch.device(device).type == 'xpu') @skipIfTorchInductor("https://github.com/pytorch/pytorch/issues/113707") @@ -1788,7 +1789,7 @@ def test_nondeterministic_alert_CTCLoss(self, device): self.check_nondeterministic_alert( lambda: res.backward(grad, retain_graph=True), - 'ctc_loss_backward_gpu', + 'ctc_loss_backward_' + torch.device(device).type, torch.device(device).type == 'cuda' or torch.device(device).type == 'xpu') @skipIfTorchInductor("https://github.com/pytorch/pytorch/issues/113707") diff --git a/test/xpu/test_unary_ufuncs_xpu.py b/test/xpu/test_unary_ufuncs_xpu.py index 0e05a8e7c..a6c12a2ad 100644 --- a/test/xpu/test_unary_ufuncs_xpu.py +++ b/test/xpu/test_unary_ufuncs_xpu.py @@ -1,6 +1,7 @@ # Owner(s): ["module: intel"] -from torch.testing._internal.common_device_type import instantiate_device_type_tests +import torch +from torch.testing._internal.common_device_type import instantiate_device_type_tests, onlyXPU from torch.testing._internal.common_utils import run_tests try: @@ -11,6 +12,38 @@ with XPUPatchForImport(False): from test_unary_ufuncs import TestUnaryUfuncs + @onlyXPU + def _nonzero_static_large(self, device): + # large enough to have multiple iters per SM even on H100 + # with 132 sms + size_inp = 1024 * 16 * 132 + 1024 * 16 + x = torch.zeros(size_inp, device=device) + # unique indices + indices = torch.randperm(size_inp, device=device)[: size_inp // 2] + sorted, _ = torch.sort(indices) + x[sorted] = 1 + res = torch.nonzero_static(x, size=size_inp // 2).view(-1) + self.assertEqual(res, sorted) + # no oob writes + out = torch.full((size_inp,), 10, device=device, dtype=torch.int64) + res = torch.nonzero_static(x, size=size_inp // 4, out=out[: size_inp // 2]) + self.assertEqual(out[: size_inp // 4], sorted[: size_inp // 4]) + self.assertEqual( + out[size_inp // 4 :], + torch.tensor(10, device="xpu").expand_as(out[size_inp // 4 :]), + ) + # correct fill for 2d + x = x.view(2, size_inp // 2) + ref = x.nonzero() + res = x.nonzero_static(size=size_inp // 2 + 2) + self.assertEqual(res.shape, [size_inp // 2 + 2, 2]) + 
self.assertEqual(ref, res[: size_inp // 2]) + self.assertEqual( + res[size_inp // 2 :], + torch.tensor(-1, device="xpu").expand_as(res[size_inp // 2 :]), + ) + TestUnaryUfuncs.test_nonzero_static_large = _nonzero_static_large + instantiate_device_type_tests(TestUnaryUfuncs, globals(),only_for=("xpu"), allow_xpu=True) if __name__ == "__main__": diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 8dc208ed4..4f8ef5635 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -223,6 +223,8 @@ "nn.functional.ctc_loss", "nn.functional.channel_shuffle", "nn.functional.multi_head_attention_forward", + "nn.GRUCell", + "nn.LSTMCell", "sigmoid", "logsigmoid", "sgn", diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml index d5e307cdf..f19a57c7f 100644 --- a/yaml/native/native_functions.yaml +++ b/yaml/native/native_functions.yaml @@ -2036,6 +2036,10 @@ dispatch: XPU: softmax_xpu_out +- func: _safe_softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + dispatch: + XPU: _safe_softmax_xpu + - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor structured_delegate: _softmax_backward_data.out @@ -5988,12 +5992,6 @@ XPU: native_multi_head_attention_xpu autogen: _native_multi_head_attention.out -# This aten function is kept so that we can test the choice function from Python -- func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None, bool enable_gqa=False) -> int - dispatch: - XPU: _fused_sdp_choice_xpu - tags: nondeterministic_seeded - - func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor structured_delegate: argmin.out device_check: NoCheck # TensorIterator @@ -7597,6 +7595,34 @@ dispatch: XPU: ctc_loss_backward_tensor +- func: lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor) + +- func: gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor + +# Fused RNN kernels +- func: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor, Tensor) + dispatch: + XPU: _thnn_fused_lstm_cell_xpu + autogen: _thnn_fused_lstm_cell.out + +# NB: The composite version of this function below is a simple wrapper that duplicates some of the outputs +# It is necessary to avoid triggering TensorImpl use count checks in debug mode +# NB: this is function is NOT differentiable +- func: _thnn_fused_lstm_cell_backward_impl(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor) + dispatch: + XPU: _thnn_fused_lstm_cell_backward_xpu + autogen: _thnn_fused_lstm_cell_backward_impl.out + +- func: _thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor) + dispatch: + XPU: _thnn_fused_gru_cell_xpu + autogen: _thnn_fused_gru_cell.out + +- func: _thnn_fused_gru_cell_backward(Tensor grad_hy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + XPU: _thnn_fused_gru_cell_backward_xpu + autogen: _thnn_fused_gru_cell_backward.out + - func: hardshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) 
   structured: True
   structured_inherits: TensorIteratorBase
@@ -8209,17 +8235,18 @@
   variants: function
   tags: pointwise

-- func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: rrelu_with_noise.out(Tensor self, Tensor(b!) noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
   tags: nondeterministic_seeded
   dispatch:
     XPU: rrelu_with_noise_out_xpu

-- func: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
+- func: rrelu_with_noise(Tensor self, Tensor(b!) noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
   python_module: nn
   dispatch:
     XPU: rrelu_with_noise_xpu
   tags: nondeterministic_seeded
+  autogen: rrelu_with_noise_functional

 - func: rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor
   python_module: nn
@@ -8227,7 +8254,7 @@
     CompositeExplicitAutograd: rrelu_with_noise_backward
   autogen: rrelu_with_noise_backward.out

-- func: rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)
+- func: rrelu_with_noise_(Tensor(a!) self, Tensor(b!) noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)
   python_module: nn
   tags: nondeterministic_seeded
   dispatch: