diff --git a/.github/actions/inductor-xpu-e2e-test/action.yml b/.github/actions/inductor-xpu-e2e-test/action.yml
index 4f81534fd..4e1fce079 100644
--- a/.github/actions/inductor-xpu-e2e-test/action.yml
+++ b/.github/actions/inductor-xpu-e2e-test/action.yml
@@ -51,7 +51,7 @@ runs:
shell: bash
run: |
source activate e2e_ci
- source .github/scripts/env.sh
+ source .github/scripts/env.sh ${{ inputs.pytorch }}
if [[ ${{ inputs.suite }} == *"torchbench"* ]]; then
if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
cd ../ && rm -rf audio && git clone --single-branch -b main https://github.com/pytorch/audio.git
@@ -94,7 +94,7 @@ runs:
shell: bash
run: |
source activate e2e_ci
- source .github/scripts/env.sh
+ source .github/scripts/env.sh ${{ inputs.pytorch }}
cp .github/scripts/inductor_xpu_test.sh ../pytorch
cd ../pytorch
diff --git a/.github/ci_expected_accuracy/check_expected.py b/.github/ci_expected_accuracy/check_expected.py
index 48543c930..6d321e34b 100644
--- a/.github/ci_expected_accuracy/check_expected.py
+++ b/.github/ci_expected_accuracy/check_expected.py
@@ -16,14 +16,14 @@
# load csv files
-test_data= pd.read_csv(args.csv_file)
+test_data= pd.read_csv(args.csv_file, comment='#')
# test_data = test_data.reset_index() # make sure indexes pair with number of rows
# test_data = test_data.sort_values(by=["name"], ascending=True)
test_names = [row["name"] for index, row in test_data.iterrows()]
current_path = pathlib.Path(__file__).parent.resolve()
refer_file = str(current_path) + "/" + args.category + "_" + args.suite + "_" + args.mode + ".csv"
-refer_data= pd.read_csv(refer_file)
+refer_data= pd.read_csv(refer_file, comment='#')
# refer_data = refer_data.reset_index() # make sure indexes pair with number of rows
# refer_data = refer_data.sort_values(by=["name"], ascending=True)
refer_names = [row["name"] for index, row in refer_data.iterrows()]
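
A note on the comment='#' change above: pandas treats everything from a '#' to the end of the line as a comment, so the skip annotations added to the expected-accuracy CSVs below are simply ignored when the files are parsed. A minimal sketch (toy data, not the real CSV contents):

    import io
    import pandas as pd

    csv_text = "\n".join([
        "name,float32,bfloat16",
        "DistilBertForMaskedLM,pass,pass",
        "# Skip DebertaV2ForQuestionAnswering issue: https://github.com/intel/torch-xpu-ops/issues/1216",
        "DebertaV2ForQuestionAnswering,fail_accuracy,pass",
    ])

    # Without comment='#', the '#' line would be read as a (malformed) data row.
    df = pd.read_csv(io.StringIO(csv_text), comment="#")
    print(df["name"].tolist())  # ['DistilBertForMaskedLM', 'DebertaV2ForQuestionAnswering']
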
diff --git a/.github/ci_expected_accuracy/inductor_huggingface_training.csv b/.github/ci_expected_accuracy/inductor_huggingface_training.csv
index a75d3d225..e2d5645e2 100644
--- a/.github/ci_expected_accuracy/inductor_huggingface_training.csv
+++ b/.github/ci_expected_accuracy/inductor_huggingface_training.csv
@@ -13,7 +13,8 @@ CamemBert,pass,pass,pass,pass,pass
DebertaForMaskedLM,pass,pass,pass,pass,pass
DebertaForQuestionAnswering,pass,pass,pass,pass,pass
DebertaV2ForMaskedLM,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip
-DebertaV2ForQuestionAnswering,pass,pass,pass,pass,pass
+# Skip DebertaV2ForQuestionAnswering issue: https://github.com/intel/torch-xpu-ops/issues/1216
+DebertaV2ForQuestionAnswering,fail_accuracy,fail_accuracy,fail_accuracy,pass,pass
DistilBertForMaskedLM,pass,pass,pass,pass,pass
DistilBertForQuestionAnswering,pass,pass,pass,pass,pass
DistillGPT2,pass,pass,pass,pass,pass
diff --git a/.github/ci_expected_accuracy/inductor_torchbench_inference.csv b/.github/ci_expected_accuracy/inductor_torchbench_inference.csv
index 4825aa41f..832923854 100644
--- a/.github/ci_expected_accuracy/inductor_torchbench_inference.csv
+++ b/.github/ci_expected_accuracy/inductor_torchbench_inference.csv
@@ -102,5 +102,6 @@ torch_multimodal_clip,pass,pass,pass,eager_fail_to_run,eager_fail_to_run
tts_angular,pass,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
vgg16,pass,pass,pass,pass,pass
vision_maskrcnn,pass,pass,pass,eager_fail_to_run,eager_fail_to_run
-yolov3,pass,pass,pass,pass,pass
+# Skip yolov3 for known torchbench issue: https://github.com/intel/torch-xpu-ops/issues/1229
+yolov3,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
hf_Roberta_base,pass,pass,pass,pass,pass
diff --git a/.github/ci_expected_accuracy/inductor_torchbench_training.csv b/.github/ci_expected_accuracy/inductor_torchbench_training.csv
index dc766eac0..36a646a14 100644
--- a/.github/ci_expected_accuracy/inductor_torchbench_training.csv
+++ b/.github/ci_expected_accuracy/inductor_torchbench_training.csv
@@ -102,5 +102,6 @@ torch_multimodal_clip,pass,pass,pass,eager_fail_to_run,eager_fail_to_run
tts_angular,pass,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
vgg16,pass,pass,pass,pass,pass
vision_maskrcnn,pass,pass,pass,eager_fail_to_run,eager_fail_to_run
-yolov3,pass,pass,pass,pass,pass
+# Skip yolov3 for known torchbench issue: https://github.com/intel/torch-xpu-ops/issues/1229
+yolov3,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run
hf_Roberta_base,pass,pass,pass,pass,pass
diff --git a/.github/scripts/apply_torch_pr.py b/.github/scripts/apply_torch_pr.py
index b4b441263..bbe89ed7d 100644
--- a/.github/scripts/apply_torch_pr.py
+++ b/.github/scripts/apply_torch_pr.py
@@ -12,9 +12,7 @@
# Fallback to CPU for XPU FP64
"https://github.com/pytorch/pytorch/pull/126516",
# Modify the tolerance level in TIMM benchmark
- "https://github.com/pytorch/pytorch/pull/129735",
- # [XPU] Update XPU C Shim Header
- "https://github.com/pytorch/pytorch/pull/141086",
+ "https://github.com/pytorch/pytorch/pull/143739",
]
)
parser.add_argument('--extra-pr-list', '-e', nargs='+',default=[])
@@ -59,7 +57,7 @@ def appyly_pr(pr_info, re_apply_msg):
pr_file = pr_info["diff_url"].split("/")[-1]
urllib.request.urlretrieve(pr_info["diff_url"], pr_file)
# apply diff
- apply_cmd = "git apply --3way " + pr_file + " && rm -f " + pr_file
+ apply_cmd = "git apply --3way " + pr_file
apply_info = subprocess.Popen(apply_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
apply_message = apply_info.communicate()[0].decode("utf-8")
apply_status = apply_info.returncode
diff --git a/.github/scripts/env.sh b/.github/scripts/env.sh
index 56d8e3930..9cfd67477 100644
--- a/.github/scripts/env.sh
+++ b/.github/scripts/env.sh
@@ -1,4 +1,11 @@
#!/bin/bash
-source /opt/intel/oneapi/compiler/latest/env/vars.sh
-source /opt/intel/oneapi/umf/latest/env/vars.sh
-source /opt/intel/oneapi/pti/latest/env/vars.sh
+
+if [ "$1" != "nightly_wheel" ]; then
+ source /opt/intel/oneapi/compiler/latest/env/vars.sh
+ source /opt/intel/oneapi/umf/latest/env/vars.sh
+ source /opt/intel/oneapi/pti/latest/env/vars.sh
+ source /opt/intel/oneapi/ccl/latest/env/vars.sh
+ source /opt/intel/oneapi/mpi/latest/env/vars.sh
+else
+  echo "No need to source DL-Essentials for the nightly wheel"
+fi
diff --git a/.github/scripts/inductor_summary.py b/.github/scripts/inductor_summary.py
index e11147664..8edd8970c 100644
--- a/.github/scripts/inductor_summary.py
+++ b/.github/scripts/inductor_summary.py
@@ -3,6 +3,8 @@
import pandas as pd
from scipy.stats import gmean
from styleframe import StyleFrame, Styler, utils
+import numpy as np
+from openpyxl import Workbook
parser = argparse.ArgumentParser(description="Generate report")
parser.add_argument('-s', '--suite', default=["huggingface"], nargs='*', type=str, help='model suite name')
@@ -665,6 +667,73 @@ def update_summary(excel, scenario, suite):
sf.set_row_height(j, 30)
sf.to_excel(sheet_name=suite + '_' + scenario + '_Summary', excel_writer=excel)
+def summary_conclusion(scenario, excel):
+ excel.book.save(excel)
+ df = pd.read_excel(excel, sheet_name = None, header = None)
+ #df = pd.DataFrame(excel)
+ if scenario == 'performance':
+ sheet_names = list(df.keys())
+ sheet_names = [s for s in sheet_names if 'Summary' in s and 'performance' in s]
+ sheet_names.sort()
+ print(f"Merge excel as below:\n{sheet_names}")
+ print("\n")
+ features = [[]] * 21
+ for sheet_name in sheet_names:
+ df_sheet = df[sheet_name]
+ df_sheet = df_sheet.values
+ features = np.hstack((features, df_sheet))
+
+ if len(sheet_names) == 1:
+ print("sheet not merge")
+ elif len(sheet_names) == 2:
+ print("2 sheets merge")
+ if 'huggingface' in sheet_names[0]:
+ features[:, 4:5] = features[:, 14:15]
+ features[:, 6:7] = features[:, 16:17]
+ else:
+ features[:, 4:5] = features[:, 14:15]
+ else:
+ print("3 sheets merge")
+ features[:, 4:5] = features[:, 24:25]
+ features[:, 6:7] = features[:, 16:17]
+
+ df_concat = StyleFrame(pd.DataFrame(features).iloc[:,:10])
+ for i in range(10):
+ df_concat.set_column_width(i, 22)
+ for j in range(1, 23):
+ df_concat.set_row_height(j, 30)
+ df_concat.to_excel(sheet_name='Perf_Summary', excel_writer=excel, index=False)
+ else:
+ sheet_names = list(df.keys())
+ sheet_names = [s for s in sheet_names if 'Summary' in s and 'accuracy' in s]
+ sheet_names.sort()
+ print(f"Merge excel as below:\n{sheet_names}")
+ print("\n")
+ features = [[]] * 11
+ for sheet_name in sheet_names:
+ df_sheet = df[sheet_name]
+ df_sheet = df_sheet.values
+ features = np.hstack((features, df_sheet))
+ if len(sheet_names) == 1:
+ print("sheet not merge")
+ elif len(sheet_names) == 2:
+ print("2 sheets merge")
+ if 'huggingface' in sheet_names[0]:
+ features[:, 3:4] = features[:, 12:13]
+ features[:, 5:6] = features[:, 14:15]
+ else:
+ features[:, 3:4] = features[:, 12:13]
+ else:
+ print("3 sheets merge")
+ features[:, 3:4] = features[:, 21:22]
+ features[:, 5:6] = features[:, 14:15]
+
+ df_concat = StyleFrame(pd.DataFrame(features).iloc[:,:9])
+ for i in range(10):
+ df_concat.set_column_width(i, 22)
+ for j in range(1, 13):
+ df_concat.set_row_height(j, 30)
+ df_concat.to_excel(sheet_name='Acc_Summary', excel_writer=excel, index=False)
def generate_report(excel, scenario_list, precision_list, mode_list, suite_list):
for sc in scenario_list:
@@ -693,8 +762,19 @@ def excel_postprocess(file, scenario, precison, mode, suite):
wdt.merge_cells(start_row=1, end_row=1, start_column=13, end_column=16)
wb.save(file)
+ if len(scenario) == 2:
+ wb.move_sheet("Perf_Summary", -(len(wb.worksheets)-1))
+ wb.move_sheet("Acc_Summary", -(len(wb.worksheets)-1))
+ elif len(scenario) == 1 and sc == 'accuracy':
+ wb.move_sheet("Acc_Summary", -(len(wb.worksheets)-1))
+ else:
+ wb.move_sheet("Perf_Summary", -(len(wb.worksheets)-1))
+
if __name__ == '__main__':
excel = StyleFrame.ExcelWriter('inductor_log/Inductor_E2E_Test_Report.xlsx')
generate_report(excel, args.scenario, args.precision, args.mode, args.suite)
+ for sc in args.scenario:
+ summary_conclusion(sc, excel)
excel_postprocess(excel, args.scenario, args.precision, args.mode, args.suite)
+ excel.close()
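
For reference, summary_conclusion above merges the per-suite Summary sheets by stacking them side by side with np.hstack and then copying selected columns of the later blocks into the first block before truncating to the first few columns. A toy sketch of that pattern (the shapes and column positions here are illustrative, not the real report layout):

    import numpy as np

    # Two "sheets" with 3 rows and 5 columns each, stacked side by side.
    a = np.arange(15).reshape(3, 5)
    b = np.arange(15, 30).reshape(3, 5)
    features = np.hstack(([[]] * 3, a, b))  # shape (3, 10); [[]] * 3 is the empty seed block

    # Overwrite one column of the first block with the matching column of the
    # second block, then keep only the first block's width.
    features[:, 2:3] = features[:, 7:8]
    merged = features[:, :5]
    print(merged.shape)  # (3, 5)
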
diff --git a/.github/scripts/spec.py b/.github/scripts/spec.py
new file mode 100644
index 000000000..b8bf6d59a
--- /dev/null
+++ b/.github/scripts/spec.py
@@ -0,0 +1,7 @@
+import torch
+
+DEVICE_NAME = 'xpu'
+
+MANUAL_SEED_FN = torch.xpu.manual_seed
+EMPTY_CACHE_FN = torch.xpu.empty_cache
+DEVICE_COUNT_FN = torch.xpu.device_count
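
spec.py above is the device spec consumed through TRANSFORMERS_TEST_DEVICE_SPEC in the workflow below: the Transformers test suite imports it to learn the accelerator name and device helpers. A rough sketch of how such a spec file can be loaded (hypothetical loader, not the Transformers implementation):

    import importlib.util
    import os

    # Hypothetical loader for the module named by TRANSFORMERS_TEST_DEVICE_SPEC.
    spec_path = os.environ.get("TRANSFORMERS_TEST_DEVICE_SPEC", "spec.py")
    module_spec = importlib.util.spec_from_file_location("device_spec", spec_path)
    device_spec = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(device_spec)

    print(device_spec.DEVICE_NAME)      # 'xpu'
    # device_spec.MANUAL_SEED_FN(42)    # torch.xpu.manual_seed; needs an XPU device
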
diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml
new file mode 100644
index 000000000..b21864e9b
--- /dev/null
+++ b/.github/workflows/_linux_transformers.yml
@@ -0,0 +1,355 @@
+name: Linux Transformers Test
+
+on:
+ pull_request:
+ branches:
+ - main
+ paths:
+ - '.github/scripts/spec.py'
+ - '.github/workflows/_linux_transformers.yml'
+ workflow_dispatch:
+ inputs:
+ pytorch:
+ required: false
+ type: string
+ default: 'nightly'
+ description: Pytorch branch/commit
+ python:
+ required: false
+ type: string
+ default: '3.10'
+ description: Python version
+ runner:
+ required: true
+ type: string
+ default: 'linux.idc.xpu'
+ description: Runner label
+ driver:
+ required: false
+ type: string
+ default: 'lts'
+ description: Driver lts/rolling
+ nightly_whl:
+ required: false
+ type: string
+ default: ''
+ description: Pytorch nightly wheel version
+ transformers:
+ required: false
+ type: string
+ default: 'v4.47.0'
+ description: Transformers version
+
+permissions: read-all
+
+jobs:
+ Torch-XPU-Transformers-Tests:
+ runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }}
+ env:
+ NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }}
+ DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }}
+ python: ${{ inputs.python != '' && inputs.python || '3.10' }}
+ pytorch: ${{ inputs.pytorch != '' && inputs.pytorch || 'nightly' }}
+ transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.47.0' }}
+ PYTORCH_DEBUG_XPU_FALLBACK: '1'
+ TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py'
+ steps:
+ - name: Checkout torch-xpu-ops
+ uses: actions/checkout@v4
+ with:
+ path: torch-xpu-ops
+ - name: Checkout Transformers
+ uses: actions/checkout@v4
+ with:
+ repository: huggingface/transformers
+ ref: ${{ env.transformers }}
+ path: transformers
+ - name: Prepare OS environment
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y \
+ espeak-ng \
+ git-lfs \
+ pkg-config \
+ libavcodec-dev \
+ libavdevice-dev \
+ libavfilter-dev \
+ libavformat-dev \
+ libavutil-dev \
+ libswresample-dev \
+ libswscale-dev
+ git lfs install
+ - name: Prepare Conda ENV
+ run: |
+ which conda && conda clean -ay
+ conda remove --all -y -n huggingface_transformers_test || rm -rf $(dirname ${CONDA_EXE})/../envs/huggingface_transformers_test
+ conda create -y -n huggingface_transformers_test python=${{ env.python }}
+ source activate huggingface_transformers_test
+ - name: Prepare Stock XPU Pytorch
+ run: |
+ pwd
+ source activate huggingface_transformers_test
+ if [ -z "${{ inputs.nightly_whl }}" ]; then
+ pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
+ else
+ pip install torch==$(echo ${{ inputs.nightly_whl }}) torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
+ fi
+ - name: Prepare Transformers
+ run: |
+ pwd
+ source activate huggingface_transformers_test
+ cd transformers
+ pip install -e .
+ pip install -e ".[dev-torch,testing,video]"
+ rm -rf tests_log && mkdir -p tests_log
+ rm -rf reports
+ cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./
+ - name: Report installed versions
+ run: |
+ source activate huggingface_transformers_test
+ echo "pip installed packages:"
+ pip list | tee ${{ github.workspace }}/transformers/tests_log/pip_list.txt
+ echo "lspci gpu devices:"
+ lspci -d ::0380 | tee ${{ github.workspace }}/transformers/tests_log/lspci_0380.txt
+ echo "GPU render nodes:"
+ cat /sys/class/drm/render*/device/device | tee ${{ github.workspace }}/transformers/tests_log/device_IDs.txt
+ echo "xpu-smi output:"
+ xpu-smi discovery -y --json --dump -1
+      - name: Sanity check installed packages
+ run: |
+ source activate huggingface_transformers_test
+ # These checks are to exit earlier if for any reason Transformers
+ # reinstalled torch packages back to CUDA versions (not expected).
+ pip show torch | grep Version | grep xpu
+ pip show torchaudio | grep Version | grep xpu
+ pip show torchvision | grep Version | grep xpu
+ python -c 'import torch; exit(not torch.xpu.is_available())'
+ - name: Run -k backbone tests
+ env:
+ TEST_CASE: 'tests_backbone'
+ run: |
+ source activate huggingface_transformers_test
+ cd transformers
+ python3 -m pytest -rsf --make-reports=$TEST_CASE -k backbone tests || \
+ (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV)
+ - name: Run tests/*.py
+ env:
+ TEST_CASE: 'tests_py'
+ run: |
+ source activate huggingface_transformers_test
+ cd transformers
+ python3 -m pytest -rsf --make-reports=$TEST_CASE tests/*.py || true
+ - name: Run tests/benchmark
+ env:
+ TEST_CASE: 'tests_benchmark'
+ run: |
+ source activate huggingface_transformers_test
+ cd transformers
+ python3 -m pytest -rsf --make-reports=$TEST_CASE tests/benchmark || true
+ - name: Run tests/generation
+ env:
+ TEST_CASE: 'tests_generation'
+ run: |
+ source activate huggingface_transformers_test
+ cd transformers
+ # Excluding tests due to:
+ # * torch.distributed.* not yet supported by XPU
+ pattern="not TestFSDPGeneration"
+ python3 -m pytest -rsf --make-reports=$TEST_CASE tests/generation -k "$pattern" || true
+ - name: Run tests/models
+ env:
+ TEST_CASE: 'tests_models'
+ run: |
+ source activate huggingface_transformers_test
+ cd transformers
+ # Excluding tests due to:
+ # * https://github.com/huggingface/transformers/issues/35252 (CUDA specific tests)
+ # * https://github.com/pytorch/pytorch/issues/140965 (aten::_linalg_eigvals)
+ pattern=" \
+ not test_model_parallelization and \
+ not test_model_parallel_equal_results and \
+ not test_resize_embeddings_untied and \
+ not test_resize_tokens_embeddings"
+ python3 -m pytest -rsf --make-reports=$TEST_CASE tests/models -k "$pattern" || true
+ - name: Run tests/pipelines
+ env:
+ TEST_CASE: 'tests_pipelines'
+ run: |
+ source activate huggingface_transformers_test
+ cd transformers
+ # Some tests are known to fail w/o clear pattern
+ # TODO: drop ||true after triage and fixes
+ python3 -m pytest -rsf --make-reports=$TEST_CASE tests/pipelines || true
+ - name: Run tests/trainer
+ env:
+ TEST_CASE: 'tests_trainer'
+ run: |
+ source activate huggingface_transformers_test
+ cd transformers
+ # Excluding tests due to:
+ # * Some ray tests hang, reason unknown
+ # * torch.distributed.* not yet supported by XPU
+ pattern=" \
+ not ray and \
+ not TestTrainerDistributed and \
+ not TestTrainerDistributedXPU and \
+ not TestFSDPTrainer"
+ python3 -m pytest -rsf --make-reports=$TEST_CASE tests/trainer -k "$pattern" || \
+ (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV)
+ - name: Run tests/utils
+ env:
+ TEST_CASE: 'tests_utils'
+ run: |
+ source activate huggingface_transformers_test
+ cd transformers
+ # Excluding tests due to:
+ # * Network proxy connection issue, reason unknown
+ pattern="not test_load_img_url_timeout"
+ python3 -m pytest -rsf --make-reports=$TEST_CASE tests/utils -k "$pattern" || \
+ (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV)
+ - name: Check for errors in tests
+ run: |
+ FAILED_CASES=$(echo $FAILED_CASES | sed 's/^,//')
+ echo "Failed cases: [$(echo $FAILED_CASES | sed 's/,/, /g')]"
+ test -z "$FAILED_CASES"
+ - name: Print results table
+ if: ${{ ! cancelled() }}
+ run: |
+          # Helper function to return the number preceding a given pattern, e.g.:
+ # === 25 failed, 11 warnings, 0 errors ===
+ # Call as follows:
+ # parse_stat $line "failed"
+ function parse_stat() {
+ stat=$(cat $1 | grep $2 | sed "s/.* \([0-9]*\) $2.*/\1/")
+ if [ -n "$stat" ]; then echo $stat; else echo "0"; fi
+ }
+ cd transformers
+ {
+ echo "### Results"
+ echo "| Test group | Errors | Failed | Deselected | Passed | Skipped |"
+ echo "| --- | --- | --- | --- | --- | --- |"
+ for stat in $(find reports -name stats.txt); do
+            # Each stats.txt is located in: reports/$test_group/stats.txt
+ test_group=$(echo $stat | cut -f 2 -d/)
+ # Get failed, passed, skipped, etc. counters
+ failed=$(parse_stat $stat failed)
+ passed=$(parse_stat $stat passed)
+ deselected=$(parse_stat $stat deselected)
+ skipped=$(parse_stat $stat skipped)
+ warnings=$(parse_stat $stat warnings)
+ errors=$(parse_stat $stat errors)
+ echo "| $test_group | $errors | $failed | $deselected | $passed | $skipped |"
+ done
+ } >> $GITHUB_STEP_SUMMARY
+ - name: Print failure lines
+ if: ${{ ! cancelled() }}
+ run: |
+ cd transformers
+ {
+ echo "### Failure lines"
+ echo "| Test group |File | Error | Comment |"
+ echo "| --- | --- | --- | --- |"
+ rm -rf _failures.txt
+ for failure in $(find reports -name failures_line.txt); do
+            # Each failures_line.txt is located in: reports/$test_group/failures_line.txt
+ test_group=$(echo $failure | cut -f2 -d/)
+ tail -n +2 $failure | sed "s/^/$test_group /" >> _failures.txt
+ done
+ # failures_line.txt file does not have test case information,
+ # so we can just sort the output and report uniq values
+ sort _failures.txt | uniq > _failures_uniq.txt
+ while read line; do
+ test_group=$(echo $line | cut -f1 -d" ")
+ file=$(echo $line | cut -f2 -d" " | sed "s/\(.*\):$/\1/")
+ error=$(echo $line | cut -f3 -d" " | sed "s/\(.*\):$/\1/")
+ # Failure comments often contain special characters which complicate
+ # parsing failure lines. But fortunately we know for sure where comments
+ # start. So we just output all contents starting from this position and
+ # wrap everything in
to avoid collisions with Markdown formatting.
+ comment="$(echo $line | cut -f4- -d' ' | sed 's/\(.*\):$/\1/')
"
+ echo "| $test_group | $file | $error | $comment |"
+ done <_failures_uniq.txt
+ } >> $GITHUB_STEP_SUMMARY
+ - name: Print not implemented XPU backend ops
+ run: |
+ cd transformers
+ {
+ echo "### Not implemented ops"
+ echo "| Test group | Operator | Status |"
+ echo "| --- | --- | --- |"
+ rm -rf _ops.txt && touch _ops.txt
+ for log in $(find reports -name failures_line.txt); do
+            # Each failures_line.txt is located in: reports/$test_group/failures_line.txt
+ test_group=$(echo $log | cut -f2 -d/)
+ ops=$(grep NotImplementedError $log | grep "for the XPU device" | sed "s/.*The operator '\(.*\)' is not.*/\1/")
+ for op in $ops; do
+              echo "| $test_group | <pre>$op</pre> | not implemented |" >> _ops.txt
+ done
+ done
+ for log in $(find reports -name warnings.txt); do
+ # Each warnings.txt is located in: reports/$test_group/warnings.txt
+ test_group=$(echo $log | cut -f2 -d/)
+ ops=$(grep UserWarning $log | grep "on the XPU backend" | sed "s/.*The operator '\(.*\) on the XPU.*/\1/")
+ for op in $ops; do
+              echo "| $test_group | <pre>$op</pre> | fallback to CPU happens |" >> _ops.txt
+ done
+ done
+ sort _ops.txt | uniq
+ } >> $GITHUB_STEP_SUMMARY
+ - name: Print annotations
+ if: ${{ ! cancelled() }}
+ run: |
+ source activate huggingface_transformers_test
+ {
+ echo "### Annotations"
+ echo "| | |"
+ echo "| --- | --- |"
+ echo "| jobs.$GITHUB_JOB.versions.os | $(source /etc/os-release && echo $VERSION_ID) |"
+ echo "| jobs.$GITHUB_JOB.versions.linux-kernel | $(uname -r) |"
+ echo "| jobs.$GITHUB_JOB.versions.python | $(python --version | cut -f2 -d' ') |"
+ packages=" \
+ level-zero \
+ libigc1 \
+ libigc2 \
+ libze1 \
+ libze-intel-gpu1 \
+ intel-i915-dkms \
+ intel-level-zero-gpu \
+ intel-opencl-icd"
+ for package in $packages; do
+ package_version=$(dpkg -l | grep $package | grep ii | head -1 | sed "s/ */ /g" | cut -f3 -d" ")
+ echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |"
+ done
+ packages="accelerate \
+ numpy \
+ torch \
+ torchaudio \
+ torchvision \
+ transformers"
+ for package in $packages; do
+ package_version=$(python -c "import $package; print($package.__version__)" || true)
+ echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |"
+ done
+ # printing annotations for GPU cards
+ var="[$(cat /sys/class/drm/render*/device/vendor || true)]"
+ echo "| jobs.$GITHUB_JOB.drm.render_nodes_vendor_ids | $(echo $var | sed 's/ /,/g') |"
+ var="[$(cat /sys/class/drm/render*/device/device || true)]"
+ echo "| jobs.$GITHUB_JOB.drm.render_nodes_device_ids | $(echo $var | sed 's/ /,/g') |"
+ var=$(python -c "import torch; print(torch.version.xpu)" || true)
+ echo "| jobs.$GITHUB_JOB.torch.version.xpu | $var |"
+ var=$(python -c "import torch; print(torch.xpu.device_count())" || true)
+ echo "| jobs.$GITHUB_JOB.torch.xpu.device_count | $var |"
+ # printing annotations with key environment variables
+ echo "| jobs.$GITHUB_JOB.env.ZE_AFFINITY_MASK | $ZE_AFFINITY_MASK |"
+ echo "| jobs.$GITHUB_JOB.env.NEOReadDebugKeys | $NEOReadDebugKeys |"
+ echo "| jobs.$GITHUB_JOB.env.PYTORCH_ENABLE_XPU_FALLBACK | $PYTORCH_ENABLE_XPU_FALLBACK |"
+ echo "| jobs.$GITHUB_JOB.env.PYTORCH_DEBUG_XPU_FALLBACK | $PYTORCH_DEBUG_XPU_FALLBACK |"
+ } >> $GITHUB_STEP_SUMMARY
+ - name: Upload Test log
+ if: ${{ ! cancelled() }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: Torch-XPU-Transformers-Log-${{ github.event.pull_request.number || github.sha }}
+ path: |
+ ${{ github.workspace }}/transformers/reports
+ ${{ github.workspace }}/transformers/tests_log
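
The parse_stat helper in the "Print results table" step above pulls the count that precedes a keyword (failed, passed, ...) out of the pytest summary line stored in each reports/<test_group>/stats.txt. The same extraction written out in Python, for reference only:

    import re

    def parse_stat(line: str, keyword: str) -> int:
        """Return the number immediately preceding `keyword`, or 0 if it is absent."""
        match = re.search(r"(\d+) " + re.escape(keyword), line)
        return int(match.group(1)) if match else 0

    line = "=== 25 failed, 11 warnings, 0 errors ==="
    print(parse_stat(line, "failed"))   # 25
    print(parse_stat(line, "passed"))   # 0
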
diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml
index d2f717230..b724d4259 100644
--- a/.github/workflows/_linux_ut.yml
+++ b/.github/workflows/_linux_ut.yml
@@ -65,19 +65,21 @@ jobs:
conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y
source activate xpu_op_${ZE_AFFINITY_MASK}
cd ../ && rm -rf pytorch
- git clone https://github.com/pytorch/pytorch pytorch
- cd pytorch && git checkout $(echo ${{ inputs.pytorch }} |sed 's/^nightly_wheel$/nightly/')
- # apply PRs for stock pytorch
pip install requests
- python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
- git status && git show -s
- git submodule sync && git submodule update --init --recursive
- if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then
- echo "Don't replace torch-xpu-ops!"
- else
- rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
- # Workaround for torch-xpu-ops ci test
- sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
+ git clone https://github.com/pytorch/pytorch pytorch
+ if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
+ cd pytorch && git checkout $(echo ${{ inputs.pytorch }})
+ # apply PRs for stock pytorch
+ python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
+ git status && git show -s
+ git submodule sync && git submodule update --init --recursive
+ if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then
+ echo "Don't replace torch-xpu-ops!"
+ else
+ rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
+ # Workaround for torch-xpu-ops ci test
+ sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
+ fi
fi
- name: Triton Installation
run: |
@@ -96,15 +98,15 @@ jobs:
- name: Build Pytorch XPU
run: |
source activate xpu_op_${ZE_AFFINITY_MASK}
- source .github/scripts/env.sh
+ source .github/scripts/env.sh ${{ inputs.pytorch }}
pip install mkl-static==2025.0.1 mkl-include==2025.0.1
- cd ../pytorch
if [[ ${{ inputs.abi }} == '0' ]]; then
export _GLIBCXX_USE_CXX11_ABI=0
else
export _GLIBCXX_USE_CXX11_ABI=1
fi
if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
+ cd ../pytorch
export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
pip install -r requirements.txt
WERROR=1 python setup.py bdist_wheel
@@ -112,12 +114,21 @@ jobs:
git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd ..
else
pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
+ TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
+ cd ../pytorch
+ git reset --hard && git checkout ${TORCH_COMMIT_ID}
+            TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)
echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
- echo "TORCHBENCH_COMMIT_ID=$(> "${GITHUB_ENV}"
+ echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
diff --git a/.github/workflows/nightly_ondemand_rolling.yml b/.github/workflows/nightly_ondemand_rolling.yml
index 0a27b2b50..7515c5003 100644
--- a/.github/workflows/nightly_ondemand_rolling.yml
+++ b/.github/workflows/nightly_ondemand_rolling.yml
@@ -63,7 +63,7 @@ permissions: read-all
concurrency:
group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.keep_torch_xpu_ops }}-${{ inputs.ut }}-${{ inputs.triton }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }}
- cancel-in-progress: true
+ cancel-in-progress: ${{ github.event_name != 'schedule' }}
jobs:
Linux-Nightly-Ondemand-UT-Tests-Rolling:
@@ -158,7 +158,7 @@ jobs:
fi
echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
- echo "TORCHBENCH_COMMIT_ID=$(> "${GITHUB_ENV}"
+ echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
diff --git a/.github/workflows/nightly_ondemand_whl.yml b/.github/workflows/nightly_ondemand_whl.yml
index 6b8d0b58f..a742f2b2d 100644
--- a/.github/workflows/nightly_ondemand_whl.yml
+++ b/.github/workflows/nightly_ondemand_whl.yml
@@ -53,11 +53,11 @@ permissions: read-all
concurrency:
group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.ut }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }}
- cancel-in-progress: true
+ cancel-in-progress: ${{ github.event_name != 'schedule' }}
jobs:
Linux-Nightly-Ondemand-UT-WHL-Tests:
- if: github.event_name == 'schedule' || ${{ inputs.ut_suite }}
+ if: github.event_name == 'schedule' || ${{ inputs.ut }}
uses: ./.github/workflows/_linux_ut.yml
with:
ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu' || inputs.ut }}
@@ -75,19 +75,20 @@ jobs:
ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu' || inputs.ut }}
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
outputs:
- TORCH_BRANCH_ID: ${{ steps.pinned.outputs.TORCH_BRANCH_ID }}
- TORCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCH_COMMIT_ID }}
- DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }}
- KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }}
- BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }}
- OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }}
- GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }}
+ TORCH_BRANCH_ID: ${{ steps.installed.outputs.TORCH_BRANCH_ID }}
+ TORCH_COMMIT_ID: ${{ steps.installed.outputs.TORCH_COMMIT_ID }}
+ TORCH_XPU_OPS_COMMIT: ${{ steps.installed.outputs.TORCH_XPU_OPS_COMMIT }}
TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }}
TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }}
TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }}
TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }}
TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }}
TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }}
+ DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }}
+ KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }}
+ BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }}
+ OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }}
+ GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }}
TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }}
steps:
- name: Checkout torch-xpu-ops
@@ -101,38 +102,43 @@ jobs:
pip install mkl-static==2025.0.1 mkl-include==2025.0.1
pip install pandas scipy tqdm
- name: Prepare Stock Pytorch
+ id: installed
run: |
pwd
source activate e2e_ci
- source .github/scripts/env.sh
+ pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
+ echo "TORCH_BRANCH_ID=$(python -c 'import torch; print(torch.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
+ TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
+ echo "TORCH_COMMIT_ID=${TORCH_COMMIT_ID}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
cd ../ && rm -rf pytorch
git clone https://github.com/pytorch/pytorch pytorch
- cd pytorch && git checkout $(echo ${{ env.pytorch }} |sed 's/^nightly_wheel$/nightly/')
+ cd pytorch && git checkout ${TORCH_COMMIT_ID}
# apply PRs for stock pytorch
pip install requests
- python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
+ # python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
git status && git show -s
pip install -r requirements.txt
- cd ../
- pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
+          TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)
+          echo "TORCH_XPU_OPS_COMMIT=${TORCH_XPU_OPS_COMMIT}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
+ rm -rf third_party/torch-xpu-ops
+ git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops
+ cd third_party/torch-xpu-ops
+ git checkout ${TORCH_XPU_OPS_COMMIT}
- name: Identify pinned versions
id: pinned
run: |
source activate e2e_ci
- source .github/scripts/env.sh
+ echo "TORCHVISION_COMMIT_ID=$(python -c 'import torchvision; print(torchvision.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
+ echo "TORCHAUDIO_COMMIT_ID=$(python -c 'import torchaudio; print(torchaudio.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
+ echo "TRITON_COMMIT_ID=$(python -c 'import triton; print(triton.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
cd ../pytorch
- echo "TRITON_COMMIT_ID=$(pip list |grep -w pytorch-triton-xpu |awk '{print $2}')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
- echo "TORCH_BRANCH_ID=nightly" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
- echo "TORCH_COMMIT_ID=$(pip list |grep -w torch |awk '{print $2}')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
- echo "TORCHBENCH_COMMIT_ID=$(> "${GITHUB_ENV}"
- echo "TORCHVISION_COMMIT_ID=$(pip list |grep -w torchvision |awk '{print $2}')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
- echo "TORCHAUDIO_COMMIT_ID=$(pip list |grep -w torchaudio |awk '{print $2}')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
+ echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "DRIVER_VERSION=$(dkms status 2>&1 |grep 'intel-i915-dkms' |sed 's/.*\///;s/,.*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
- echo "BUNDLE_VERSION=$(dpcpp --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
+ echo "BUNDLE_VERSION=$(pip list |grep cmplr |head -n 1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
. /etc/os-release
echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
@@ -271,6 +277,7 @@ jobs:
repo="${{ github.repository }}"
TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_BRANCH_ID }}"
TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_COMMIT_ID }}"
+ TORCH_XPU_OPS_COMMIT="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_XPU_OPS_COMMIT }}"
DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.DRIVER_VERSION }}"
KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.KERNEL_VERSION }}"
BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.BUNDLE_VERSION }}"
@@ -307,7 +314,7 @@ jobs:
fi
# Test report
echo -e "**${test_status}** $test_type WHL Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt
- printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${GITHUB_WORKFLOW_SHA:0:7} on ${GITHUB_REF_NAME} | " >> ${{ github.workspace }}/report.txt
+ printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${TORCH_XPU_OPS_COMMIT:0:7} on pinned | " >> ${{ github.workspace }}/report.txt
printf "[${TORCH_COMMIT_ID:0:7}](https://github.com/pytorch/pytorch/commit/${TORCH_COMMIT_ID:0:7}) on $TORCH_BRANCH_ID | " >> ${{ github.workspace }}/report.txt
echo -e "[${TRITON_COMMIT_ID:0:7}](https://github.com/intel/intel-xpu-backend-for-triton/commit/${TRITON_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt
printf "Transformers | Timm | Torchbench | Torchvision | Torchaudio\n--- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 53f93e629..fe6e428f5 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -90,7 +90,7 @@ jobs:
cd ../pytorch
echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}"
echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}"
- echo "TORCHBENCH_COMMIT_ID=$(> "${GITHUB_ENV}"
+ echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}"
echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}"
echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}"
echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}"
@@ -144,9 +144,9 @@ jobs:
run: |
rm -rf ${{ github.workspace }}/upload_files
cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files
- failed_case=$(grep "Real failed: models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log |wc -l || true)
+ failed_case=$(grep "Real failed models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log |wc -l || true)
if [ ${failed_case} -ne 0 ];then
- grep -E "Real failed: models: [1-9]|Summary for" ${{ github.workspace }}/summary_accuracy.log
+ grep -E "Real failed models: [1-9]|Summary for" ${{ github.workspace }}/upload_files/summary_accuracy.log
exit 1
fi
- name: Upload Inductor XPU E2E Data
diff --git a/src/ATen/native/transformers/Attention.cpp b/src/ATen/native/transformers/Attention.cpp
index bb8b4602b..3090dfbee 100644
--- a/src/ATen/native/transformers/Attention.cpp
+++ b/src/ATen/native/transformers/Attention.cpp
@@ -93,36 +93,6 @@ static bool check_for_seq_len_1_nested_tensor(
return true;
}
-int64_t _fused_sdp_choice_xpu(
- const Tensor& query,
- const Tensor& key,
- const Tensor& value,
-    const std::optional<Tensor>& attn_mask_,
- double dropout_p,
- bool is_causal,
-    std::optional<double> scale,
- bool enable_gqa) {
- // We have implemented efficient_attention backend with xetla, flash_attention
- // backend is not supported now, which will be implemented in the future. So
- // we provide two backends here.
- sdp::sdp_params kernel_params{
- query, key, value, attn_mask_, dropout_p, is_causal, enable_gqa};
- // Because TORCHCHECK checks if condition is true we negate debug so that
- // The statements will be printed when debug is true
- bool print_debug = false;
- sdp::SDPBackend backend =
- sdp::can_use_mem_efficient_attention(kernel_params, print_debug)
- ? sdp::SDPBackend::efficient_attention
- : sdp::SDPBackend::math;
- if (backend == sdp::SDPBackend::error) {
- TORCH_CHECK(
- false,
- "No viable backend for scaled_dot_product_attention was found. ",
- "This is likely due to turning off both the math kernel and the fused kernels.");
- }
-  return static_cast<int64_t>(backend);
-}
-
std::tuple<Tensor, Tensor> native_multi_head_attention_xpu(
const Tensor& query,
const Tensor& key,
@@ -204,8 +174,12 @@ std::tuple<Tensor, Tensor> native_multi_head_attention_xpu(
value.view({value.size(0), -1, num_head, dim_per_head}).transpose(1, 2);
sdp::sdp_params kernel_params{q, k, v, mask, 0.0, false, false};
-  auto backend = static_cast<sdp::SDPBackend>(
- _fused_sdp_choice_xpu(q, k, v, mask, 0.0, false, {}, false));
+
+ sdp::SDPBackend backend = sdp::SDPBackend::math;
+ if (_fused_sdp_choice_stub.is_device_supported(q.device().type())) {
+    backend = static_cast<sdp::SDPBackend>(_fused_sdp_choice_stub(
+ q.device().type(), q, k, v, mask, 0.0, false, std::nullopt, false));
+ }
// strides from packed projection for nested tensors when seq_len is 1 will
// be and will trigger a contiguous call in the kernel, so we prevent this
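
The Attention.cpp change above removes the local _fused_sdp_choice_xpu and instead asks _fused_sdp_choice_stub for a backend, falling back to the math implementation when no chooser is registered for the device. A toy Python mirror of that selection logic (illustration only, not the ATen API):

    # Prefer a fused backend when a chooser is registered for the device type,
    # otherwise fall back to the math implementation.
    def choose_sdp_backend(device_type, registered_choosers, params):
        if device_type in registered_choosers:  # ~ _fused_sdp_choice_stub.is_device_supported()
            return registered_choosers[device_type](params)
        return "math"  # default when nothing is registered

    choosers = {"xpu": lambda params: "efficient_attention"}
    print(choose_sdp_backend("xpu", choosers, {}))  # efficient_attention
    print(choose_sdp_backend("cpu", choosers, {}))  # math
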
diff --git a/src/ATen/native/transformers/SDPUtils.cpp b/src/ATen/native/transformers/SDPUtils.cpp
index db4409493..eca5f9829 100644
--- a/src/ATen/native/transformers/SDPUtils.cpp
+++ b/src/ATen/native/transformers/SDPUtils.cpp
@@ -4,6 +4,8 @@
namespace sdp {
+using c10::array_of;
+
bool check_all_tensors_on_device(sdp_params const& params, bool debug) {
// Check that all tensors are on the GPU device
// This should be handled by the stub dispatch, but whe call
diff --git a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp
index f0620c530..4a34e70d1 100644
--- a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp
+++ b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp
@@ -29,7 +29,7 @@ Tensor adaptive_avg_pool2d_backward_xpu(
(input.ndimension() == 3 || input.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
- globalContext().alertNotDeterministic("_adaptive_avg_pool2d_backward");
+ globalContext().alertNotDeterministic("adaptive_avg_pool2d_backward_xpu");
Tensor grad_input;
if (input.numel() != 0) {
diff --git a/src/ATen/native/xpu/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/DilatedMaxPool2d.cpp
index 600d29e85..a08227b47 100644
--- a/src/ATen/native/xpu/DilatedMaxPool2d.cpp
+++ b/src/ATen/native/xpu/DilatedMaxPool2d.cpp
@@ -4,6 +4,7 @@
#include
#include
+#include
#include
#include
@@ -40,6 +41,62 @@ TORCH_IMPL_FUNC(max_pool2d_with_indices_out_xpu)
bool ceil_mode,
const Tensor& output,
const Tensor& indices) {
+  const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
+  const int kW = kernel_size.size() == 1
+      ? kH
+      : safe_downcast<int, int64_t>(kernel_size[1]);
+  const int padH = safe_downcast<int, int64_t>(padding[0]);
+  const int padW =
+      padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
+
+ const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1;
+ const int64_t nInputPlane = input.size(-3);
+ const int64_t inputHeight = input.size(-2);
+ const int64_t inputWidth = input.size(-1);
+
+ const int64_t outputHeight = output.size(-2);
+ const int64_t outputWidth = output.size(-1);
+ if (outputHeight == 1 && outputWidth == 1 && inputHeight <= kH &&
+ inputWidth <= kW && padH == 0 && padW == 0) {
+ auto smf = input.suggest_memory_format();
+ Tensor input_ = input.contiguous(smf);
+ bool is_3d = input.ndimension() == 3;
+ Tensor indices_, output_;
+ if (is_3d) {
+ indices_ = indices.contiguous();
+ output_ = output.contiguous();
+ } else {
+ indices_ = indices.contiguous(smf);
+ output_ = output.contiguous(smf);
+ }
+ if (!is_3d) {
+ input_.resize_({nbatch, nInputPlane, 1, inputHeight * inputWidth}, smf);
+ output_.resize_(
+ {nbatch, nInputPlane, 1, outputHeight * outputWidth}, smf);
+ indices_.resize_(
+ {nbatch, nInputPlane, 1, outputHeight * outputWidth}, smf);
+ at::max_outf(input_, 3, true, output_, indices_);
+ } else {
+ at::max_outf(input_, 2, true, output_, indices_);
+ }
+
+ if (!is_3d) {
+ input_.resize_({nbatch, nInputPlane, inputHeight, inputWidth}, smf);
+ output_.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, smf);
+ indices_.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, smf);
+ }
+
+ if ((is_3d && !indices.is_contiguous()) ||
+ (!is_3d && !indices.is_contiguous(smf))) {
+ indices.copy_(indices_);
+ }
+
+ if ((is_3d && !output.is_contiguous()) ||
+ (!is_3d && !output.is_contiguous(smf))) {
+ output.copy_(output_);
+ }
+ return;
+ }
xpu::max_pool2d_with_indices_kernel(
input,
kernel_size,
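
The new branch in max_pool2d_with_indices_out_xpu above short-circuits "global" pooling (1x1 output, zero padding, kernel covering the whole plane) into a plain max reduction over the flattened spatial dimension via at::max_outf. The equivalence it relies on, sketched with the Python API on CPU tensors (illustration only):

    import torch
    import torch.nn.functional as F

    x = torch.randn(2, 3, 5, 7)

    # Global max pooling: the kernel spans the whole spatial extent, output is 1x1.
    pooled = F.max_pool2d(x, kernel_size=(5, 7))

    # Same values as a max reduction over the flattened H*W dimension, which is
    # what the fast path computes (it also derives the indices the same way).
    flat_max = x.flatten(2).max(dim=2, keepdim=True).values
    assert torch.equal(pooled.flatten(2), flat_max)
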
diff --git a/src/ATen/native/xpu/RNN.cpp b/src/ATen/native/xpu/RNN.cpp
new file mode 100644
index 000000000..74152f293
--- /dev/null
+++ b/src/ATen/native/xpu/RNN.cpp
@@ -0,0 +1,46 @@
+#include
+#include
+
+namespace at::native {
+
+std::tuple<Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_xpu(
+ const Tensor& input_gates,
+ const Tensor& hidden_gates,
+ const Tensor& cx,
+    const std::optional<Tensor>& input_bias_opt,
+    const std::optional<Tensor>& hidden_bias_opt) {
+ return native::xpu::_thnn_fused_lstm_cell_kernel(
+ input_gates, hidden_gates, cx, input_bias_opt, hidden_bias_opt);
+}
+
+std::tuple<Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backward_xpu(
+    const std::optional<Tensor>& grad_hy_opt,
+    const std::optional<Tensor>& grad_cy_opt,
+ const Tensor& cx,
+ const Tensor& cy,
+ const Tensor& workspace,
+ bool has_bias) {
+ return native::xpu::_thnn_fused_lstm_cell_backward_kernel(
+ grad_hy_opt, grad_cy_opt, cx, cy, workspace, has_bias);
+}
+
+std::tuple<Tensor, Tensor> _thnn_fused_gru_cell_xpu(
+ const Tensor& input_gates,
+ const Tensor& hidden_gates,
+ const Tensor& hx,
+    const std::optional<Tensor>& input_bias,
+    const std::optional<Tensor>& hidden_bias) {
+ return native::xpu::_thnn_fused_gru_cell_kernel(
+ input_gates, hidden_gates, hx, input_bias, hidden_bias);
+}
+
+std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor>
+_thnn_fused_gru_cell_backward_xpu(
+ const Tensor& grad_hy,
+ const Tensor& workspace,
+ bool has_bias) {
+ return native::xpu::_thnn_fused_gru_cell_backward_kernel(
+ grad_hy, workspace, has_bias);
+}
+
+} // namespace at::native
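
For reference, the fused LSTM cell registered above computes the standard gate math that the SYCL kernels below implement, with gates laid out as i, f, g, o and the post-activation gates saved as the workspace for backward. A plain PyTorch sketch of one cell step; the function name and the returned workspace layout are illustrative only:

    import torch

    def lstm_cell_reference(input_gates, hidden_gates, cx, input_bias=None, hidden_bias=None):
        # input_gates / hidden_gates: (batch, 4 * hidden) pre-activation projections
        gates = input_gates + hidden_gates
        if input_bias is not None:
            gates = gates + input_bias + hidden_bias
        i, f, g, o = gates.chunk(4, dim=1)
        i, f, g, o = i.sigmoid(), f.sigmoid(), g.tanh(), o.sigmoid()
        cy = f * cx + i * g
        hy = o * cy.tanh()
        return hy, cy, torch.cat([i, f, g, o], dim=1)  # last value ~ the saved workspace

    B, H = 2, 4
    hy, cy, workspace = lstm_cell_reference(
        torch.randn(B, 4 * H), torch.randn(B, 4 * H), torch.randn(B, H))
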
diff --git a/src/ATen/native/xpu/RreluWithNoise.cpp b/src/ATen/native/xpu/RreluWithNoise.cpp
index f66833983..fb4e2c333 100644
--- a/src/ATen/native/xpu/RreluWithNoise.cpp
+++ b/src/ATen/native/xpu/RreluWithNoise.cpp
@@ -6,7 +6,7 @@ namespace native {
Tensor& rrelu_with_noise_out_xpu(
const Tensor& self,
- const Tensor& noise,
+ Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
@@ -18,7 +18,7 @@ Tensor& rrelu_with_noise_out_xpu(
Tensor rrelu_with_noise_xpu(
const Tensor& self,
- const Tensor& noise,
+ Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
@@ -30,7 +30,7 @@ Tensor rrelu_with_noise_xpu(
Tensor& rrelu_with_noise_xpu_(
Tensor& self,
- const Tensor& noise,
+ Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
diff --git a/src/ATen/native/xpu/SoftMax.cpp b/src/ATen/native/xpu/SoftMax.cpp
index e816d48c8..f155165ce 100644
--- a/src/ATen/native/xpu/SoftMax.cpp
+++ b/src/ATen/native/xpu/SoftMax.cpp
@@ -76,6 +76,17 @@ TORCH_IMPL_FUNC(log_softmax_xpu_out)
xpu::_log_softmax_kernel(input, dim, half_to_float, output);
}
+Tensor _safe_softmax_xpu(
+ const Tensor& self,
+ int64_t dim,
+    std::optional<ScalarType> dtype) {
+  // TODO: uncomment after XPU softmax supports half_to_float=true
+  // if (self.scalar_type() == ScalarType::Half && dtype == ScalarType::Float)
+  //   return xpu::_safe_softmax_kernel(self, dim, true);
+ Tensor converted = dtype.has_value() ? self.toType(dtype.value()) : self;
+ return xpu::_safe_softmax_kernel(converted, dim, false);
+}
+
Tensor masked_softmax_xpu(
const Tensor& input_,
const Tensor& mask_,
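
_safe_softmax_xpu above exists so that rows that are fully masked (all -inf logits, as scaled_dot_product_attention can produce) come out as zeros instead of NaN. A small sketch of the intended behaviour (illustration, not the kernel):

    import torch

    logits = torch.tensor([[1.0, 2.0, 3.0],
                           [float("-inf")] * 3])    # second row fully masked

    print(torch.softmax(logits, dim=-1)[1])         # tensor([nan, nan, nan])

    # "Safe" variant: zero out rows where every entry is -inf.
    out = torch.softmax(logits, dim=-1)
    fully_masked = torch.isneginf(logits).all(dim=-1, keepdim=True)
    print(out.masked_fill(fully_masked, 0.0)[1])    # tensor([0., 0., 0.])
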
diff --git a/src/ATen/native/xpu/UpSampleBilinear2d.cpp b/src/ATen/native/xpu/UpSampleBilinear2d.cpp
index ee8c37ac0..aec707193 100644
--- a/src/ATen/native/xpu/UpSampleBilinear2d.cpp
+++ b/src/ATen/native/xpu/UpSampleBilinear2d.cpp
@@ -30,6 +30,7 @@ TORCH_IMPL_FUNC(upsample_bilinear2d_backward_out_xpu)
std::optional scales_h,
std::optional scales_w,
const Tensor& grad_input) {
+ globalContext().alertNotDeterministic("upsample_bilinear2d_backward_out_xpu");
xpu::upsample_bilinear2d_backward_out_kernel(
grad_input,
grad_output,
diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template
index 8492a98be..72f2aacdd 100644
--- a/src/ATen/native/xpu/XPUFallback.template
+++ b/src/ATen/native/xpu/XPUFallback.template
@@ -184,9 +184,7 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
"_linalg_svd.U",
"lu_unpack.out",
"ormqr",
- "_scaled_dot_product_efficient_attention",
"_scaled_mm",
- "_thnn_fused_gru_cell",
"_to_sparse_csr",
"triangular_solve.X",
"_validate_compressed_sparse_indices",
diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp
index e21c0160c..d94db11c9 100644
--- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp
+++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp
@@ -5,9 +5,9 @@
#pragma GCC diagnostic ignored "-Wreturn-type"
#include
+#include
#include
#include
-#include
#include
#include
diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h
index d530560e6..b07041fcb 100644
--- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h
+++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h
@@ -1,6 +1,6 @@
#pragma once
-#include
+#include
namespace at::native::xpu {
diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp
index 57ac0d114..fb034f988 100644
--- a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp
+++ b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp
@@ -531,6 +531,8 @@ Tensor embedding_bag_backward_xpu_max(
const Tensor& max_indices_t,
int64_t num_weights,
int64_t padding_idx) {
+ globalContext().alertNotDeterministic("embedding_bag_backward_xpu_max");
+
auto max_indices = max_indices_t.contiguous();
auto grad_weight = at::zeros({num_weights, grad.size(1)}, grad.options());
int64_t stride = grad_weight.stride(0);
diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp
index d429ecfbe..bcbd50c42 100644
--- a/src/ATen/native/xpu/sycl/Indexing.cpp
+++ b/src/ATen/native/xpu/sycl/Indexing.cpp
@@ -207,7 +207,7 @@ void index_select_kernel(
}),
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
- AT_EXPAND(AT_FLOAT8_TYPES),
+ AT_EXPAND(AT_FLOAT8_TYPES),
kComplexHalf,
kHalf,
kBool,
@@ -1081,7 +1081,8 @@ void take_kernel(TensorIterator& iter, const TensorBase& input) {
canUse32BitIndexMath(input) ? ScalarType::Int : ScalarType::Long,
"take_xpu_index",
[&] {
-        const scalar_t* indexed_ptr = input.template const_data_ptr<scalar_t>();
+        const scalar_t* indexed_ptr =
+            input.template const_data_ptr<scalar_t>();
TakeFunctor f(indexed_ptr);
take_put_kernel_template(iter, input, f);
});
@@ -1114,6 +1115,14 @@ void put_kernel(
TensorIterator& iter,
const TensorBase& output,
const bool accumulate) {
+  // Nondeterministic when index contains duplicate entries and we do not
+  // accumulate. If we accumulate on GPU, we use atomicGPUAdd, which is
+  // non-deterministic.
+ if (!accumulate ||
+ (accumulate && iter.tensor(1).device().type() == DeviceType::XPU)) {
+ at::globalContext().alertNotDeterministic("put_");
+ }
+
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
at::ScalarType::BFloat16,
at::ScalarType::Half,
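
With the alertNotDeterministic call added to put_kernel above, a non-accumulating put_ on an XPU tensor now raises under torch.use_deterministic_algorithms(True) instead of silently varying between runs when indices repeat, matching CUDA. Roughly (assumes an XPU build; the same pattern applies on 'cuda'):

    import torch

    torch.use_deterministic_algorithms(True)

    x = torch.zeros(4, device="xpu")
    idx = torch.tensor([0, 0, 1], device="xpu")    # duplicate index 0
    src = torch.tensor([1.0, 2.0, 3.0], device="xpu")

    # accumulate=False with duplicate indices is nondeterministic on the device,
    # so this is expected to raise a RuntimeError in deterministic mode.
    x.put_(idx, src, accumulate=False)
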
diff --git a/src/ATen/native/xpu/sycl/LerpKernels.cpp b/src/ATen/native/xpu/sycl/LerpKernels.cpp
index 1648f193b..9d7551290 100644
--- a/src/ATen/native/xpu/sycl/LerpKernels.cpp
+++ b/src/ATen/native/xpu/sycl/LerpKernels.cpp
@@ -57,15 +57,29 @@ struct LerpScalarFunctor {
opmath_t weight_val_;
};
+void lerp_scalar_kernel(
+ at::TensorIteratorBase& iter,
+ const c10::Scalar& weight);
+
void lerp_tensor_kernel(at::TensorIteratorBase& iter) {
auto dtype = iter.common_dtype();
if (at::isComplexType(dtype)) {
AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "lerp_xpu", [&] {
+ if (iter.is_cpu_scalar(3)) {
+        auto weight_val = iter.scalar_value<scalar_t>(3);
+ iter.remove_operand(3);
+ return lerp_scalar_kernel(iter, weight_val);
+ }
      gpu_kernel(iter, LerpTensorComplexFunctor<scalar_t>());
});
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(
at::ScalarType::Half, at::ScalarType::BFloat16, dtype, "lerp_xpu", [&] {
+ if (iter.is_cpu_scalar(3)) {
+          auto weight_val = iter.scalar_value<scalar_t>(3);
+ iter.remove_operand(3);
+ return lerp_scalar_kernel(iter, weight_val);
+ }
          gpu_kernel(iter, LerpTensorFunctor<scalar_t>());
});
}
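
The lerp change above covers the case where the weight arrives as a 0-dim CPU tensor (a "CPU scalar" operand of the TensorIterator): it is rerouted to the scalar kernel rather than being treated as a device tensor. The user-visible case, roughly (assumes an 'xpu' device is available):

    import torch

    start = torch.randn(4, device="xpu")
    end = torch.randn(4, device="xpu")
    weight = torch.tensor(0.25)   # 0-dim CPU tensor, i.e. a "CPU scalar" operand

    out = torch.lerp(start, end, weight)   # now handled via the scalar kernel path
    assert torch.allclose(out, start + 0.25 * (end - start))
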
diff --git a/src/ATen/native/xpu/sycl/LossCTCKernels.cpp b/src/ATen/native/xpu/sycl/LossCTCKernels.cpp
index 9d26a48c7..3dd44968d 100644
--- a/src/ATen/native/xpu/sycl/LossCTCKernels.cpp
+++ b/src/ATen/native/xpu/sycl/LossCTCKernels.cpp
@@ -1248,7 +1248,7 @@ Tensor ctc_loss_backward_kernel(
bool zero_infinity) {
// See Note [Writing Nondeterministic Operations]
// Nondeterministic because of atomicAdd usage
- globalContext().alertNotDeterministic("ctc_loss_backward_kernel");
+ globalContext().alertNotDeterministic("ctc_loss_backward_xpu");
return AT_DISPATCH_FLOATING_TYPES(
log_probs.scalar_type(), "ctc_loss_backward_xpu", [&] {
if (targets.scalar_type() == kLong) {
diff --git a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp
index 4b93cb3c3..8b018de6b 100644
--- a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp
+++ b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp
@@ -186,7 +186,7 @@ void nll_loss2d_forward_kernel(
int64_t reduction,
int64_t ignore_index) {
if (reduction != at::Reduction::None) {
- at::globalContext().alertNotDeterministic("nll_loss2d_forward_kernel");
+ at::globalContext().alertNotDeterministic("nll_loss2d_forward_xpu");
}
total_weight.resize_({});
diff --git a/src/ATen/native/xpu/sycl/RNNKernels.cpp b/src/ATen/native/xpu/sycl/RNNKernels.cpp
new file mode 100644
index 000000000..bad6bdf69
--- /dev/null
+++ b/src/ATen/native/xpu/sycl/RNNKernels.cpp
@@ -0,0 +1,968 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+namespace at::native::xpu {
+
+using at::native::canUse32BitIndexMath;
+using at::xpu::detail::getTensorInfo;
+using at::xpu::detail::IndexToOffset;
+using at::xpu::detail::TensorInfo;
+
+std::tuple<int64_t, int64_t> rnn_get_launch_config(
+ int64_t max_threads_per_group,
+ int64_t numel) {
+ int64_t num_groups =
+ (numel + max_threads_per_group - 1) / max_threads_per_group;
+ auto hw_max_groups = syclMaxWorkItemsPerTile() / max_threads_per_group;
+ num_groups = num_groups > hw_max_groups ? hw_max_groups : num_groups;
+ return std::make_tuple(num_groups, max_threads_per_group);
+}
+
+// Factor will be 3 for GRU and 4 for LSTM
+void checkSizes(
+ CheckedFrom c,
+ const TensorArg& input_gates,
+ const TensorArg& hidden_gates,
+ const TensorArg& input_bias,
+ const TensorArg& hidden_bias,
+ int64_t factor,
+ const TensorArg& prev_hidden) {
+ checkDim(c, input_gates, 2);
+ checkSameSize(c, input_gates, hidden_gates);
+ int64_t gates_size = input_gates->size(1);
+
+ if (input_bias->defined()) {
+ checkDim(c, input_bias, 1);
+ checkNumel(c, input_bias, gates_size);
+ checkSameSize(c, input_bias, hidden_bias);
+ }
+
+ checkDim(c, prev_hidden, 2);
+ checkNumel(c, prev_hidden, input_gates->size(0) * gates_size / factor);
+
+ checkAllSameGPU(
+ c, {input_gates, hidden_gates, input_bias, hidden_bias, prev_hidden});
+}
+
+bool allContiguous(at::TensorList tensors) {
+ return std::all_of(tensors.begin(), tensors.end(), [](const at::Tensor& t) {
+ return !t.defined() || t.is_contiguous();
+ });
+}
+
+template <typename T, typename I>
+TensorInfo<T, I> tryGetTensorInfo(const at::Tensor& t) {
+  return t.defined() ? getTensorInfo<T, I>(t) : TensorInfo<T, I>{};
+}
+
+void collapseDims(){};
+template <typename T, typename I, typename... Args>
+void collapseDims(TensorInfo<T, I>& info, Args&... infos) {
+ info.collapseDims();
+ collapseDims(infos...);
+}
+
+#define DEVICE_LINEAR_GET(D_TENSOR, INDEX) \
+  D_TENSOR.data[IndexToOffset<scalar_t, index_type>::get(INDEX, D_TENSOR)]
+
+// Biases are always 1D
+#define DEVICE_BIAS_GET(D_TENSOR, INDEX) \
+  D_TENSOR.data[IndexToOffset<scalar_t, index_type>::get(INDEX, D_TENSOR)]
+
+#define H2F(input) static_cast<accscalar_t>(input)
+#define F2H(input) static_cast<scalar_t>(input)
+
+template <typename T>
+inline T sigmoid(T in) {
+ T one = static_cast(1.0);
+ return one / (one + std::exp(-in));
+}
+
+template <typename scalar_t, typename accscalar_t, typename index_type>
+struct LstmCellForwardFunctor {
+ void operator()(sycl::nd_item<1> item) const {
+ bool has_bias = bias1_.data != nullptr;
+
+ for (index_type linearIndex = item.get_global_id(0);
+ linearIndex < totalElements_;
+ linearIndex += item.get_group_range(0) * item.get_local_range(0)) {
+ index_type offset = (linearIndex / hsz_) * 4 * hsz_ + linearIndex % hsz_;
+
+ scalar_t iig = DEVICE_LINEAR_GET(input_, offset + 0 * hsz_);
+ scalar_t ifg = DEVICE_LINEAR_GET(input_, offset + 1 * hsz_);
+ scalar_t icg = DEVICE_LINEAR_GET(input_, offset + 2 * hsz_);
+ scalar_t iog = DEVICE_LINEAR_GET(input_, offset + 3 * hsz_);
+
+ scalar_t hig = DEVICE_LINEAR_GET(hidden_, offset + 0 * hsz_);
+ scalar_t hfg = DEVICE_LINEAR_GET(hidden_, offset + 1 * hsz_);
+ scalar_t hcg = DEVICE_LINEAR_GET(hidden_, offset + 2 * hsz_);
+ scalar_t hog = DEVICE_LINEAR_GET(hidden_, offset + 3 * hsz_);
+
+ scalar_t* wig = &DEVICE_LINEAR_GET(workspace_, offset + 0 * hsz_);
+ scalar_t* wfg = &DEVICE_LINEAR_GET(workspace_, offset + 1 * hsz_);
+ scalar_t* wcg = &DEVICE_LINEAR_GET(workspace_, offset + 2 * hsz_);
+ scalar_t* wog = &DEVICE_LINEAR_GET(workspace_, offset + 3 * hsz_);
+
+ scalar_t cx = DEVICE_LINEAR_GET(_cx_, linearIndex);
+
+ scalar_t* hy = &DEVICE_LINEAR_GET(_hy_, linearIndex);
+ scalar_t* cy = &DEVICE_LINEAR_GET(_cy_, linearIndex);
+
+ scalar_t b1i, b1f, b1c, b1o;
+ scalar_t b2i, b2f, b2c, b2o;
+
+ if (has_bias) {
+ b1i = DEVICE_BIAS_GET(bias1_, linearIndex % hsz_ + 0 * hsz_);
+ b1f = DEVICE_BIAS_GET(bias1_, linearIndex % hsz_ + 1 * hsz_);
+ b1c = DEVICE_BIAS_GET(bias1_, linearIndex % hsz_ + 2 * hsz_);
+ b1o = DEVICE_BIAS_GET(bias1_, linearIndex % hsz_ + 3 * hsz_);
+
+ b2i = DEVICE_BIAS_GET(bias2_, linearIndex % hsz_ + 0 * hsz_);
+ b2f = DEVICE_BIAS_GET(bias2_, linearIndex % hsz_ + 1 * hsz_);
+ b2c = DEVICE_BIAS_GET(bias2_, linearIndex % hsz_ + 2 * hsz_);
+ b2o = DEVICE_BIAS_GET(bias2_, linearIndex % hsz_ + 3 * hsz_);
+ } else {
+ b1i = F2H(0.0);
+ b1f = F2H(0.0);
+ b1c = F2H(0.0);
+ b1o = F2H(0.0);
+ b2i = F2H(0.0);
+ b2f = F2H(0.0);
+ b2c = F2H(0.0);
+ b2o = F2H(0.0);
+ }
+
+ accscalar_t ig, fg, cg, og;
+ accscalar_t f_hy, f_cy;
+
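+ // Standard LSTM cell math: sigmoid on the input/forget/output gates and
+ // tanh on the candidate; then cy = fg * cx + ig * cg, hy = og * tanh(cy).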
+ ig = sigmoid(H2F(iig) + H2F(hig) + H2F(b1i) + H2F(b2i));
+ fg = sigmoid(H2F(ifg) + H2F(hfg) + H2F(b1f) + H2F(b2f));
+ cg = std::tanh(H2F(icg) + H2F(hcg) + H2F(b1c) + H2F(b2c));
+ og = sigmoid(H2F(iog) + H2F(hog) + H2F(b1o) + H2F(b2o));
+
+ f_cy = (fg * H2F(cx)) + (ig * cg);
+ f_hy = og * std::tanh(f_cy);
+
+ *hy = F2H(f_hy);
+ *cy = F2H(f_cy);
+
+ // SAVE FOR BACKWARDS
+ // Also need cy and cx but can be saved easily in python
+ *wig = F2H(ig);
+ *wfg = F2H(fg);
+ *wcg = F2H(cg);
+ *wog = F2H(og);
+ }
+ }
+
+ LstmCellForwardFunctor(
+ TensorInfo<scalar_t, index_type> input,
+ TensorInfo<scalar_t, index_type> hidden,
+ TensorInfo<scalar_t, index_type> bias1,
+ TensorInfo<scalar_t, index_type> bias2,
+ TensorInfo<scalar_t, index_type> _cx,
+ TensorInfo<scalar_t, index_type> _hy,
+ TensorInfo<scalar_t, index_type> _cy,
+ TensorInfo<scalar_t, index_type> workspace,
+ index_type hsz,
+ index_type totalElements)
+ : input_(input),
+ hidden_(hidden),
+ bias1_(bias1),
+ bias2_(bias2),
+ _cx_(_cx),
+ _hy_(_hy),
+ _cy_(_cy),
+ workspace_(workspace),
+ hsz_(hsz),
+ totalElements_(totalElements) {}
+
+ private:
+ TensorInfo<scalar_t, index_type> input_;
+ TensorInfo<scalar_t, index_type> hidden_;
+ TensorInfo<scalar_t, index_type> bias1_;
+ TensorInfo<scalar_t, index_type> bias2_;
+ TensorInfo<scalar_t, index_type> _cx_;
+ TensorInfo<scalar_t, index_type> _hy_;
+ TensorInfo<scalar_t, index_type> _cy_;
+ TensorInfo<scalar_t, index_type> workspace_;
+ index_type hsz_;
+ index_type totalElements_;
+};
+
+template <typename scalar_t, typename accscalar_t, typename index_type>
+struct LstmCellBackwardFunctor {
+ void operator()(sycl::nd_item<1> item) const {
+ bool has_gradoutput = gradoutput_.data != nullptr;
+ bool has_gradoutputcell = gradoutputcell_.data != nullptr;
+
+ for (index_type linearIndex = item.get_global_id(0);
+ linearIndex < totalElements_;
+ linearIndex += item.get_group_range(0) * item.get_local_range(0)) {
+ index_type offset = (linearIndex / hsz_) * 4 * hsz_ + linearIndex % hsz_;
+
+ scalar_t ig = DEVICE_LINEAR_GET(storage_, offset + 0 * hsz_);
+ scalar_t fg = DEVICE_LINEAR_GET(storage_, offset + 1 * hsz_);
+ scalar_t cg = DEVICE_LINEAR_GET(storage_, offset + 2 * hsz_);
+ scalar_t og = DEVICE_LINEAR_GET(storage_, offset + 3 * hsz_);
+
+ scalar_t* ih = &DEVICE_LINEAR_GET(gradInGates_, offset + 0 * hsz_);
+ scalar_t* fh = &DEVICE_LINEAR_GET(gradInGates_, offset + 1 * hsz_);
+ scalar_t* ch = &DEVICE_LINEAR_GET(gradInGates_, offset + 2 * hsz_);
+ scalar_t* oh = &DEVICE_LINEAR_GET(gradInGates_, offset + 3 * hsz_);
+
+ // will return hidden grads here
+ scalar_t cx = DEVICE_LINEAR_GET(_cx_, linearIndex);
+ scalar_t cy = DEVICE_LINEAR_GET(_cy_, linearIndex);
+
+ scalar_t* gi = &DEVICE_LINEAR_GET(gradInputCx_, linearIndex);
+
+ accscalar_t go = has_gradoutput
+ ? H2F(DEVICE_LINEAR_GET(gradoutput_, linearIndex))
+ : 0.f;
+ accscalar_t goc = has_gradoutputcell
+ ? H2F(DEVICE_LINEAR_GET(gradoutputcell_, linearIndex))
+ : 0.f;
+
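+ // Backward through hy = og * tanh(cy) and cy = fg * cx + ig * cg, using
+ // the gate activations saved in the forward workspace.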
+ accscalar_t gcx = std::tanh(H2F(cy));
+
+ accscalar_t gog = go * gcx;
+ gcx = go * H2F(og) * (1 - gcx * gcx) + goc;
+
+ accscalar_t gig = gcx * H2F(cg);
+ accscalar_t gfg = gcx * H2F(cx);
+ accscalar_t gcg = gcx * H2F(ig);
+
+ gcx = gcx * H2F(fg);
+
+ gig = gig * (1 - H2F(ig)) * H2F(ig);
+ gfg = gfg * (1 - H2F(fg)) * H2F(fg);
+ gcg = gcg * (1 - H2F(cg) * H2F(cg));
+ gog = gog * (1 - H2F(og)) * H2F(og);
+
+ *ih = F2H(gig);
+ *fh = F2H(gfg);
+ *ch = F2H(gcg);
+ *oh = F2H(gog);
+
+ *gi = F2H(gcx);
+ }
+ }
+
+ LstmCellBackwardFunctor(
+ TensorInfo<scalar_t, index_type> storage,
+ TensorInfo<scalar_t, index_type> gradInGates,
+ TensorInfo<scalar_t, index_type> _cx,
+ TensorInfo<scalar_t, index_type> _cy,
+ TensorInfo<scalar_t, index_type> gradoutput,
+ TensorInfo<scalar_t, index_type> gradoutputcell,
+ TensorInfo<scalar_t, index_type> gradInputCx,
+ index_type hsz,
+ index_type totalElements)
+ : storage_(storage),
+ gradInGates_(gradInGates),
+ _cx_(_cx),
+ _cy_(_cy),
+ gradoutput_(gradoutput),
+ gradoutputcell_(gradoutputcell),
+ gradInputCx_(gradInputCx),
+ hsz_(hsz),
+ totalElements_(totalElements) {}
+
+ private:
+ TensorInfo<scalar_t, index_type> storage_;
+ TensorInfo<scalar_t, index_type> gradInGates_;
+ TensorInfo<scalar_t, index_type> _cx_;
+ TensorInfo<scalar_t, index_type> _cy_;
+ TensorInfo<scalar_t, index_type> gradoutput_;
+ TensorInfo<scalar_t, index_type> gradoutputcell_;
+ TensorInfo<scalar_t, index_type> gradInputCx_;
+ index_type hsz_;
+ index_type totalElements_;
+};
+
+template <typename scalar_t, typename accscalar_t, typename index_type>
+struct GruCellForwardFunctor {
+ void operator()(sycl::nd_item<1> item) const {
+ bool has_bias = Bias1_.data != nullptr;
+
+ for (index_type linearIndex = item.get_global_id(0);
+ linearIndex < totalElements_;
+ linearIndex += item.get_group_range(0) * item.get_local_range(0)) {
+ index_type offset = (linearIndex / hsz_) * 3 * hsz_ + linearIndex % hsz_;
+
+ scalar_t ir = DEVICE_LINEAR_GET(Input_, offset + 0 * hsz_);
+ scalar_t ii = DEVICE_LINEAR_GET(Input_, offset + 1 * hsz_);
+ scalar_t in = DEVICE_LINEAR_GET(Input_, offset + 2 * hsz_);
+ scalar_t hr = DEVICE_LINEAR_GET(Hidden_, offset + 0 * hsz_);
+ scalar_t hi = DEVICE_LINEAR_GET(Hidden_, offset + 1 * hsz_);
+ scalar_t hn = DEVICE_LINEAR_GET(Hidden_, offset + 2 * hsz_);
+
+ scalar_t hx = DEVICE_LINEAR_GET(_hx_, linearIndex);
+ scalar_t* hy = &DEVICE_LINEAR_GET(_hy_, linearIndex);
+
+ scalar_t b1r, b1i, b1n, b2r, b2i, b2n;
+
+ if (has_bias) {
+ b1r = DEVICE_BIAS_GET(Bias1_, linearIndex % hsz_ + 0 * hsz_);
+ b1i = DEVICE_BIAS_GET(Bias1_, linearIndex % hsz_ + 1 * hsz_);
+ b1n = DEVICE_BIAS_GET(Bias1_, linearIndex % hsz_ + 2 * hsz_);
+
+ b2r = DEVICE_BIAS_GET(Bias2_, linearIndex % hsz_ + 0 * hsz_);
+ b2i = DEVICE_BIAS_GET(Bias2_, linearIndex % hsz_ + 1 * hsz_);
+ b2n = DEVICE_BIAS_GET(Bias2_, linearIndex % hsz_ + 2 * hsz_);
+ } else {
+ b1r = F2H(0.0);
+ b1i = F2H(0.0);
+ b1n = F2H(0.0);
+ b2r = F2H(0.0);
+ b2i = F2H(0.0);
+ b2n = F2H(0.0);
+ }
+
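+ // The workspace keeps 5 values per hidden unit (rg, ig, ng, hx, hn + b2n)
+ // so the backward kernel can recompute gradients without the inputs.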
+ offset = (linearIndex / hsz_) * 5 * hsz_ + linearIndex % hsz_;
+
+ accscalar_t rg, ig, ng;
+
+ rg = sigmoid(H2F(ir) + H2F(hr) + H2F(b1r) + H2F(b2r));
+ ig = sigmoid(H2F(ii) + H2F(hi) + H2F(b1i) + H2F(b2i));
+
+ ng = H2F(in) + H2F(b1n) + rg * (H2F(hn) + H2F(b2n));
+ ng = std::tanh(ng);
+ *hy = F2H(ng + ig * (H2F(hx) - ng));
+
+ // SAVE FOR BACKWARDS
+ DEVICE_LINEAR_GET(storage_, offset + 0 * hsz_) = F2H(rg);
+ DEVICE_LINEAR_GET(storage_, offset + 1 * hsz_) = F2H(ig);
+ DEVICE_LINEAR_GET(storage_, offset + 2 * hsz_) = F2H(ng);
+ DEVICE_LINEAR_GET(storage_, offset + 3 * hsz_) = hx;
+ DEVICE_LINEAR_GET(storage_, offset + 4 * hsz_) = F2H(H2F(hn) + H2F(b2n));
+ }
+ }
+
+ GruCellForwardFunctor(
+ TensorInfo<scalar_t, index_type> Input,
+ const TensorInfo<scalar_t, index_type> Hidden,
+ const TensorInfo<scalar_t, index_type> Bias1,
+ const TensorInfo<scalar_t, index_type> Bias2,
+ const TensorInfo<scalar_t, index_type> _hx,
+ const TensorInfo<scalar_t, index_type> _hy,
+ const TensorInfo<scalar_t, index_type> storage,
+ const index_type hsz,
+ const index_type totalElements)
+ : Input_(Input),
+ Hidden_(Hidden),
+ Bias1_(Bias1),
+ Bias2_(Bias2),
+ _hx_(_hx),
+ _hy_(_hy),
+ storage_(storage),
+ hsz_(hsz),
+ totalElements_(totalElements) {}
+
+ private:
+ TensorInfo<scalar_t, index_type> Input_;
+ const TensorInfo<scalar_t, index_type> Hidden_;
+ const TensorInfo<scalar_t, index_type> Bias1_;
+ const TensorInfo<scalar_t, index_type> Bias2_;
+ const TensorInfo<scalar_t, index_type> _hx_;
+ const TensorInfo<scalar_t, index_type> _hy_;
+ const TensorInfo<scalar_t, index_type> storage_;
+ const index_type hsz_;
+ const index_type totalElements_;
+};
+
+template <typename scalar_t, typename accscalar_t, typename index_type>
+struct GruCellBackwardFunctor {
+ void operator()(sycl::nd_item<1> item) const {
+ for (index_type linearIndex = item.get_global_id(0);
+ linearIndex < totalElements_;
+ linearIndex += item.get_group_range(0) * item.get_local_range(0)) {
+ index_type offset = (linearIndex / hsz_) * 5 * hsz_ + linearIndex % hsz_;
+
+ scalar_t rg = DEVICE_LINEAR_GET(storage_, offset + 0 * hsz_);
+ scalar_t ig = DEVICE_LINEAR_GET(storage_, offset + 1 * hsz_);
+ scalar_t ng = DEVICE_LINEAR_GET(storage_, offset + 2 * hsz_);
+ scalar_t hx = DEVICE_LINEAR_GET(storage_, offset + 3 * hsz_);
+ scalar_t hn = DEVICE_LINEAR_GET(storage_, offset + 4 * hsz_);
+
+ scalar_t go = DEVICE_LINEAR_GET(gradOutput_, linearIndex);
+
+ offset = (linearIndex / hsz_) * 3 * hsz_ + linearIndex % hsz_;
+
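+ // Backward through hy = ng + ig * (hx - ng), with rg, ig, ng, hx and
+ // (hn + b2n) restored from the forward workspace.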
+ accscalar_t gig = H2F(go) * (H2F(hx) - H2F(ng)) * (1 - H2F(ig)) * H2F(ig);
+ accscalar_t ghx = H2F(go) * H2F(ig);
+ accscalar_t gin = H2F(go) * (1 - H2F(ig)) * (1 - H2F(ng) * H2F(ng));
+ accscalar_t ghn = gin * H2F(rg);
+ accscalar_t grg = gin * H2F(hn) * (1 - H2F(rg)) * H2F(rg);
+
+ DEVICE_LINEAR_GET(gradInInput_, offset + 0 * hsz_) = F2H(grg);
+ DEVICE_LINEAR_GET(gradInInput_, offset + 1 * hsz_) = F2H(gig);
+ DEVICE_LINEAR_GET(gradInInput_, offset + 2 * hsz_) = F2H(gin);
+
+ DEVICE_LINEAR_GET(gradInHidden_, offset + 0 * hsz_) = F2H(grg);
+ DEVICE_LINEAR_GET(gradInHidden_, offset + 1 * hsz_) = F2H(gig);
+ DEVICE_LINEAR_GET(gradInHidden_, offset + 2 * hsz_) = F2H(ghn);
+ DEVICE_LINEAR_GET(gradInputHx_, linearIndex) = F2H(ghx);
+ }
+ }
+
+ GruCellBackwardFunctor(
+ TensorInfo<scalar_t, index_type> gradInInput,
+ TensorInfo<scalar_t, index_type> gradInHidden,
+ TensorInfo<scalar_t, index_type> gradOutput,
+ TensorInfo<scalar_t, index_type> gradInputHx,
+ TensorInfo<scalar_t, index_type> storage,
+ index_type hsz,
+ index_type totalElements)
+ : gradInInput_(gradInInput),
+ gradInHidden_(gradInHidden),
+ gradOutput_(gradOutput),
+ gradInputHx_(gradInputHx),
+ storage_(storage),
+ hsz_(hsz),
+ totalElements_(totalElements) {}
+
+ private:
+ TensorInfo<scalar_t, index_type> gradInInput_;
+ TensorInfo<scalar_t, index_type> gradInHidden_;
+ TensorInfo<scalar_t, index_type> gradOutput_;
+ TensorInfo<scalar_t, index_type> gradInputHx_;
+ TensorInfo<scalar_t, index_type> storage_;
+ index_type hsz_;
+ index_type totalElements_;
+};
+
+#undef DEVICE_LINEAR_GET
+#undef DEVICE_BIAS_GET
+#undef H2F
+#undef F2H
+
+template <typename scalar_t, typename index_type>
+void lstm_forward_impl(
+ const Tensor& input_gates,
+ const Tensor& hidden_gates,
+ const Tensor& input_bias,
+ const Tensor& hidden_bias,
+ const Tensor& cx,
+ const Tensor& hy,
+ const Tensor& cy,
+ const Tensor& workspace) {
+ using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
+
+ int64_t numel = cx.numel();
+ if (numel == 0)
+ return;
+
+ using KernelT = LstmCellForwardFunctor<scalar_t, accscalar_t, index_type>;
+ auto max_wg_size = syclMaxWorkGroupSize<KernelT>();
+ auto config = rnn_get_launch_config(max_wg_size, numel);
+ auto nwg = std::get<0>(config);
+ auto local_range = std::get<1>(config);
+
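+ // rnn_get_launch_config picks the number of work-groups and the work-group
+ // size used for the grid-stride loop over numel elements.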
+ auto input_gatesI = getTensorInfo<scalar_t, index_type>(input_gates);
+ auto hidden_gatesI = getTensorInfo<scalar_t, index_type>(hidden_gates);
+ auto input_biasI = tryGetTensorInfo<scalar_t, index_type>(input_bias);
+ auto hidden_biasI = tryGetTensorInfo<scalar_t, index_type>(hidden_bias);
+ auto cxI = getTensorInfo<scalar_t, index_type>(cx);
+ auto hyI = getTensorInfo<scalar_t, index_type>(hy);
+ auto cyI = getTensorInfo<scalar_t, index_type>(cy);
+ auto workspaceI = getTensorInfo<scalar_t, index_type>(workspace);
+ index_type hidden_size = cxI.sizes[cxI.dims - 1];
+
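+ // When every tensor is contiguous, collapse the TensorInfos to 1D so the
+ // kernel can use plain linear indexing; otherwise keep strided indexing.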
+ if (allContiguous(
+ {input_gates,
+ hidden_gates,
+ input_bias,
+ hidden_bias,
+ cx,
+ hy,
+ cy,
+ workspace})) {
+ collapseDims(
+ input_gatesI,
+ hidden_gatesI,
+ input_biasI,
+ hidden_biasI,
+ cxI,
+ hyI,
+ cyI,
+ workspaceI);
+ KernelT kfn(
+ input_gatesI,
+ hidden_gatesI,
+ input_biasI,
+ hidden_biasI,
+ cxI,
+ hyI,
+ cyI,
+ workspaceI,
+ hidden_size,
+ numel);
+ sycl_kernel_submit(
+ nwg * local_range, local_range, getCurrentSYCLQueue(), kfn);
+ } else {
+ KernelT kfn(
+ input_gatesI,
+ hidden_gatesI,
+ input_biasI,
+ hidden_biasI,
+ cxI,
+ hyI,
+ cyI,
+ workspaceI,
+ hidden_size,
+ numel);
+ sycl_kernel_submit(
+ nwg * local_range, local_range, getCurrentSYCLQueue(), kfn);
+ }
+}
+
+template <typename scalar_t, typename index_type>
+void lstm_backward_impl(
+ const Tensor& grad_hy,
+ const Tensor& grad_cy,
+ const Tensor& cx,
+ const Tensor& cy,
+ const Tensor& workspace,
+ const Tensor& grad_gates,
+ const Tensor& grad_cx) {
+ using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
+
+ int64_t numel = cx.numel();
+ if (numel == 0)
+ return;
+
+ using KernelT = LstmCellBackwardFunctor<scalar_t, accscalar_t, index_type>;
+ auto max_wg_size = syclMaxWorkGroupSize<KernelT>();
+ auto config = rnn_get_launch_config(max_wg_size, numel);
+ auto nwg = std::get<0>(config);
+ auto local_range = std::get<1>(config);
+
+ auto grad_hyI = tryGetTensorInfo<scalar_t, index_type>(grad_hy);
+ auto grad_cyI = tryGetTensorInfo<scalar_t, index_type>(grad_cy);
+ auto cxI = getTensorInfo<scalar_t, index_type>(cx);
+ auto cyI = getTensorInfo<scalar_t, index_type>(cy);
+ auto workspaceI = getTensorInfo<scalar_t, index_type>(workspace);
+ auto grad_gatesI = getTensorInfo<scalar_t, index_type>(grad_gates);
+ auto grad_cxI = getTensorInfo<scalar_t, index_type>(grad_cx);
+ index_type hidden_size = cxI.sizes[cxI.dims - 1];
+
+ if (allContiguous(
+ {grad_hy, grad_cy, cx, cy, workspace, grad_gates, grad_cx})) {
+ collapseDims(
+ grad_hyI, grad_cyI, cxI, cyI, workspaceI, grad_gatesI, grad_cxI);
+ KernelT kfn(
+ workspaceI,
+ grad_gatesI,
+ cxI,
+ cyI,
+ grad_hyI,
+ grad_cyI,
+ grad_cxI,
+ hidden_size,
+ numel);
+ sycl_kernel_submit(
+ nwg * local_range, local_range, getCurrentSYCLQueue(), kfn);
+ } else {
+ KernelT kfn(
+ workspaceI,
+ grad_gatesI,
+ cxI,
+ cyI,
+ grad_hyI,
+ grad_cyI,
+ grad_cxI,
+ hidden_size,
+ numel);
+ sycl_kernel_submit(
+ nwg * local_range, local_range, getCurrentSYCLQueue(), kfn);
+ }
+}
+
+template <typename scalar_t, typename index_type>
+void gru_forward_impl(
+ const Tensor& input_gates,
+ const Tensor& hidden_gates,
+ const Tensor& input_bias,
+ const Tensor& hidden_bias,
+ const Tensor& hx,
+ const Tensor& hy,
+ const Tensor& workspace) {
+ using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
+
+ int64_t numel = hx.numel();
+ if (numel == 0)
+ return;
+
+ using KernelT = GruCellForwardFunctor<scalar_t, accscalar_t, index_type>;
+ auto max_wg_size = syclMaxWorkGroupSize<KernelT>();
+ auto config = rnn_get_launch_config(max_wg_size, numel);
+ auto nwg = std::get<0>(config);
+ auto local_range = std::get<1>(config);
+
+ auto input_gatesI = getTensorInfo<scalar_t, index_type>(input_gates);
+ auto hidden_gatesI = getTensorInfo<scalar_t, index_type>(hidden_gates);
+ auto input_biasI = tryGetTensorInfo<scalar_t, index_type>(input_bias);
+ auto hidden_biasI = tryGetTensorInfo<scalar_t, index_type>(hidden_bias);
+ auto hxI = getTensorInfo<scalar_t, index_type>(hx);
+ auto hyI = getTensorInfo<scalar_t, index_type>(hy);
+ auto workspaceI = getTensorInfo<scalar_t, index_type>(workspace);
+ index_type hidden_size = hxI.sizes[hxI.dims - 1];
+
+ if (allContiguous(
+ {input_gates,
+ hidden_gates,
+ input_bias,
+ hidden_bias,
+ hx,
+ hy,
+ workspace})) {
+ collapseDims(
+ input_gatesI,
+ hidden_gatesI,
+ input_biasI,
+ hidden_biasI,
+ hxI,
+ hyI,
+ workspaceI);
+ KernelT kfn(
+ input_gatesI,
+ hidden_gatesI,
+ input_biasI,
+ hidden_biasI,
+ hxI,
+ hyI,
+ workspaceI,
+ hidden_size,
+ numel);
+ sycl_kernel_submit(
+ nwg * local_range, local_range, getCurrentSYCLQueue(), kfn);
+ } else {
+ KernelT kfn(
+ input_gatesI,
+ hidden_gatesI,
+ input_biasI,
+ hidden_biasI,
+ hxI,
+ hyI,
+ workspaceI,
+ hidden_size,
+ numel);
+ sycl_kernel_submit(
+ nwg * local_range, local_range, getCurrentSYCLQueue(), kfn);
+ }
+}
+
+template <typename scalar_t, typename index_type>
+void gru_backward_impl(
+ const Tensor& grad_hy,
+ const Tensor& workspace,
+ const Tensor& grad_input_gates,
+ const Tensor& grad_hidden_gates,
+ const Tensor& grad_hx) {
+ using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
+
+ int64_t numel = grad_hy.numel();
+ if (numel == 0)
+ return;
+
+ using KernelT = GruCellBackwardFunctor<scalar_t, accscalar_t, index_type>;
+ auto max_wg_size = syclMaxWorkGroupSize<KernelT>();
+ auto config = rnn_get_launch_config(max_wg_size, numel);
+ auto nwg = std::get<0>(config);
+ auto local_range = std::get<1>(config);
+
+ auto grad_hyI = getTensorInfo<scalar_t, index_type>(grad_hy);
+ auto workspaceI = getTensorInfo<scalar_t, index_type>(workspace);
+ auto grad_input_gatesI =
+ getTensorInfo<scalar_t, index_type>(grad_input_gates);
+ auto grad_hidden_gatesI =
+ getTensorInfo<scalar_t, index_type>(grad_hidden_gates);
+ auto grad_hxI = getTensorInfo<scalar_t, index_type>(grad_hx);
+ index_type hidden_size = grad_hyI.sizes[grad_hyI.dims - 1];
+
+ if (allContiguous(
+ {grad_hy, workspace, grad_input_gates, grad_hidden_gates, grad_hx})) {
+ collapseDims(
+ grad_hyI, workspaceI, grad_input_gatesI, grad_hidden_gatesI, grad_hxI);
+ KernelT kfn(
+ grad_input_gatesI,
+ grad_hidden_gatesI,
+ grad_hyI,
+ grad_hxI,
+ workspaceI,
+ hidden_size,
+ numel);
+ sycl_kernel_submit(
+ nwg * local_range, local_range, getCurrentSYCLQueue(), kfn);
+ } else {
+ KernelT kfn(
+ grad_input_gatesI,
+ grad_hidden_gatesI,
+ grad_hyI,
+ grad_hxI,
+ workspaceI,
+ hidden_size,
+ numel);
+ sycl_kernel_submit(
+ nwg * local_range, local_range, getCurrentSYCLQueue(), kfn);
+ }
+}
+
+// Note [64-bit index math check elision]
+// It's enough to perform the check for 64-bit math on the largest tensor only.
+// If 32-bit is enough for it, it will suffice for all other tensors too, and we
+// can save some work using this trick.
+
+std::tuple<Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_kernel(
+ const Tensor& input_gates,
+ const Tensor& hidden_gates,
+ const Tensor& cx,
+ const std::optional<Tensor>& input_bias_opt,
+ const std::optional<Tensor>& hidden_bias_opt) {
+ // See [Note: hacky wrapper removal for optional tensor]
+ c10::MaybeOwned<Tensor> input_bias_maybe_owned =
+ at::borrow_from_optional_tensor(input_bias_opt);
+ const Tensor& input_bias = *input_bias_maybe_owned;
+ const Tensor& hidden_bias = hidden_bias_opt.value_or(Tensor());
+
+ checkSizes(
+ "_thnn_fused_lstm_cell_xpu",
+ {input_gates, "input_gates", 1},
+ {hidden_gates, "hidden_gates", 2},
+ {input_bias, "input_bias", 3},
+ {hidden_bias, "hidden_bias", 4},
+ /*factor=*/4,
+ {cx, "prev_hidden", 5});
+
+ auto workspace = at::empty_like(input_gates, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+ auto hy = at::empty_like(cx, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+ auto cy = at::empty_like(cx, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+ AT_DISPATCH_FLOATING_TYPES_AND2(
+ at::ScalarType::Half,
+ at::ScalarType::BFloat16,
+ input_gates.scalar_type(),
+ "_thnn_fused_lstm_cell_xpu",
+ [&] {
+ if (canUse32BitIndexMath(
+ workspace)) { // See Note [64-bit index math check elision]
+ lstm_forward_impl<scalar_t, int32_t>(
+ input_gates,
+ hidden_gates,
+ input_bias,
+ hidden_bias,
+ cx,
+ hy,
+ cy,
+ workspace);
+ } else {
+ lstm_forward_impl<scalar_t, int64_t>(
+ input_gates,
+ hidden_gates,
+ input_bias,
+ hidden_bias,
+ cx,
+ hy,
+ cy,
+ workspace);
+ }
+ });
+ return std::make_tuple(std::move(hy), std::move(cy), std::move(workspace));
+}
+
+void checkLSTMBackwardSizes(
+ const TensorArg& grad_hy,
+ const TensorArg& grad_cy,
+ const TensorArg& cx,
+ const TensorArg& cy,
+ const TensorArg& workspace) {
+ CheckedFrom c = "fused_lstm_cell_backward";
+ const TensorArg& defined_grad = grad_hy->defined() ? grad_hy : grad_cy;
+ checkDim(c, defined_grad, 2);
+ auto exp_size = defined_grad->sizes();
+ if (grad_hy->defined()) {
+ checkSize(c, grad_hy, exp_size);
+ }
+ if (grad_cy->defined()) {
+ checkSize(c, grad_cy, exp_size);
+ }
+ checkSize(c, cx, exp_size);
+ checkSize(c, cy, exp_size);
+ checkDim(c, workspace, 2);
+ checkNumel(c, workspace, exp_size[0] * exp_size[1] * 4);
+}
+
+std::tuple<Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_backward_kernel(
+ const std::optional<Tensor>& grad_hy_opt,
+ const std::optional<Tensor>& grad_cy_opt,
+ const Tensor& cx,
+ const Tensor& cy,
+ const Tensor& workspace,
+ bool has_bias) {
+ // See [Note: hacky wrapper removal for optional tensor]
+ c10::MaybeOwned<Tensor> grad_hy_maybe_owned =
+ at::borrow_from_optional_tensor(grad_hy_opt);
+ const Tensor& grad_hy = *grad_hy_maybe_owned;
+ const Tensor& grad_cy = grad_cy_opt.value_or(Tensor());
+
+ if (!grad_hy.defined() && !grad_cy.defined()) {
+ return std::tuple<Tensor, Tensor, Tensor>();
+ }
+ checkLSTMBackwardSizes(
+ {grad_hy, "grad_hy", 1},
+ {grad_cy, "grad_cy", 2},
+ {cx, "cx", 3},
+ {cy, "cy", 4},
+ {workspace, "workspace", 5});
+
+ auto grad_gates = at::empty_like(workspace, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+ auto grad_cx = at::empty_like(cx, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+ AT_DISPATCH_FLOATING_TYPES_AND2(
+ at::ScalarType::Half,
+ at::ScalarType::BFloat16,
+ workspace.scalar_type(),
+ "_thnn_fused_lstm_cell_backward_xpu",
+ [&] {
+ if (canUse32BitIndexMath(
+ workspace)) { // See Note [64-bit index math check elision]
+ lstm_backward_impl<scalar_t, int32_t>(
+ grad_hy, grad_cy, cx, cy, workspace, grad_gates, grad_cx);
+ } else {
+ lstm_backward_impl<scalar_t, int64_t>(
+ grad_hy, grad_cy, cx, cy, workspace, grad_gates, grad_cx);
+ }
+ });
+
+ auto grad_bias =
+ has_bias ? grad_gates.sum(0, /*keepdim=*/false) : at::Tensor{};
+ return std::make_tuple(
+ std::move(grad_gates), std::move(grad_cx), std::move(grad_bias));
+}
+
+static constexpr int64_t GRU_WORKSPACE_MULTIPLIER = 5;
+
+std::tuple<Tensor, Tensor> _thnn_fused_gru_cell_kernel(
+ const Tensor& input_gates,
+ const Tensor& hidden_gates,
+ const Tensor& hx,
+ const std::optional<Tensor>& input_bias_opt,
+ const std::optional<Tensor>& hidden_bias_opt) {
+ // See [Note: hacky wrapper removal for optional tensor]
+ c10::MaybeOwned<Tensor> input_bias_maybe_owned =
+ at::borrow_from_optional_tensor(input_bias_opt);
+ const Tensor& input_bias = *input_bias_maybe_owned;
+ const Tensor& hidden_bias = hidden_bias_opt.value_or(Tensor());
+
+ checkSizes(
+ "_thnn_fused_gru_cell_xpu",
+ {input_gates, "input_gates", 1},
+ {hidden_gates, "hidden_gates", 2},
+ {input_bias, "input_bias", 3},
+ {hidden_bias, "hidden_bias", 4},
+ /*factor=*/3,
+ {hx, "prev_hidden", 5});
+
+ auto workspace = at::empty(
+ {hx.size(0), hx.size(1) * GRU_WORKSPACE_MULTIPLIER}, hx.options());
+ auto hy = at::empty_like(hx, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+ AT_DISPATCH_FLOATING_TYPES_AND2(
+ at::ScalarType::Half,
+ at::ScalarType::BFloat16,
+ input_gates.scalar_type(),
+ "_thnn_fused_gru_cell_xpu",
+ [&] {
+ if (canUse32BitIndexMath(
+ workspace)) { // See Note [64-bit index math check elision]
+ gru_forward_impl<scalar_t, int32_t>(
+ input_gates,
+ hidden_gates,
+ input_bias,
+ hidden_bias,
+ hx,
+ hy,
+ workspace);
+ } else {
+ gru_forward_impl<scalar_t, int64_t>(
+ input_gates,
+ hidden_gates,
+ input_bias,
+ hidden_bias,
+ hx,
+ hy,
+ workspace);
+ }
+ });
+ return std::make_tuple(std::move(hy), std::move(workspace));
+}
+
+void checkGRUBackwardSizes(
+ const TensorArg& grad_hy,
+ const TensorArg& workspace) {
+ CheckedFrom c = "fused_gru_cell_backward";
+ checkDim(c, grad_hy, 2);
+ checkSize(
+ c,
+ workspace,
+ {grad_hy->size(0), grad_hy->size(1) * GRU_WORKSPACE_MULTIPLIER});
+}
+
+std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor>
+_thnn_fused_gru_cell_backward_kernel(
+ const Tensor& grad_hy,
+ const Tensor& workspace,
+ bool has_bias) {
+ checkGRUBackwardSizes({grad_hy, "grad_hy", 1}, {workspace, "workspace", 2});
+
+ int64_t hidden_size = workspace.size(1) / GRU_WORKSPACE_MULTIPLIER;
+ auto grad_input_gates =
+ at::empty({workspace.size(0), hidden_size * 3}, workspace.options());
+ auto grad_hidden_gates =
+ at::empty({workspace.size(0), hidden_size * 3}, workspace.options());
+ auto grad_hx = at::empty_like(grad_hy, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+ AT_DISPATCH_FLOATING_TYPES_AND2(
+ at::ScalarType::Half,
+ at::ScalarType::BFloat16,
+ grad_hy.scalar_type(),
+ "_thnn_fused_gru_cell_backward_xpu",
+ [&] {
+ if (canUse32BitIndexMath(
+ workspace)) { // See Note [64-bit index math check elision]
+ gru_backward_impl<scalar_t, int32_t>(
+ grad_hy, workspace, grad_input_gates, grad_hidden_gates, grad_hx);
+ } else {
+ gru_backward_impl<scalar_t, int64_t>(
+ grad_hy, workspace, grad_input_gates, grad_hidden_gates, grad_hx);
+ }
+ });
+
+ at::Tensor grad_input_bias, grad_hidden_bias;
+ if (has_bias) {
+ grad_input_bias = grad_input_gates.sum(0, /*keepdim=*/false);
+ grad_hidden_bias = grad_hidden_gates.sum(0, /*keepdim=*/false);
+ }
+
+ return std::make_tuple(
+ std::move(grad_input_gates),
+ std::move(grad_hidden_gates),
+ std::move(grad_hx),
+ std::move(grad_input_bias),
+ std::move(grad_hidden_bias));
+}
+
+} // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/RNNKernels.h b/src/ATen/native/xpu/sycl/RNNKernels.h
new file mode 100644
index 000000000..07f0e3f78
--- /dev/null
+++ b/src/ATen/native/xpu/sycl/RNNKernels.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+
+namespace at::native::xpu {
+
+TORCH_XPU_API std::tuple<Tensor, Tensor, Tensor> _thnn_fused_lstm_cell_kernel(
+ const Tensor& input_gates,
+ const Tensor& hidden_gates,
+ const Tensor& cx,
+ const std::optional<Tensor>& input_bias_opt,
+ const std::optional<Tensor>& hidden_bias_opt);
+
+TORCH_XPU_API std::tuple<Tensor, Tensor, Tensor>
+_thnn_fused_lstm_cell_backward_kernel(
+ const std::optional<Tensor>& grad_hy_opt,
+ const std::optional<Tensor>& grad_cy_opt,
+ const Tensor& cx,
+ const Tensor& cy,
+ const Tensor& workspace,
+ bool has_bias);
+
+TORCH_XPU_API std::tuple<Tensor, Tensor> _thnn_fused_gru_cell_kernel(
+ const Tensor& input_gates,
+ const Tensor& hidden_gates,
+ const Tensor& hx,
+ const std::optional<Tensor>& input_bias_opt,
+ const std::optional<Tensor>& hidden_bias_opt);
+
+TORCH_XPU_API std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor>
+_thnn_fused_gru_cell_backward_kernel(
+ const Tensor& grad_hy,
+ const Tensor& workspace,
+ bool has_bias);
+
+} // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/ResizeKernel.cpp b/src/ATen/native/xpu/sycl/ResizeKernel.cpp
index 237a1c213..f1ee7f944 100644
--- a/src/ATen/native/xpu/sycl/ResizeKernel.cpp
+++ b/src/ATen/native/xpu/sycl/ResizeKernel.cpp
@@ -25,8 +25,9 @@ void resize_bytes_xpu(StorageImpl* storage, size_t size_bytes) {
c10::xpu::XPUGuard guard(device.index());
at::DataPtr data = allocator->allocate(size_bytes);
if (storage->data_ptr()) {
- auto q = at::xpu::getCurrentSYCLQueue();
+ at::globalContext().lazyInitDevice(c10::DeviceType::XPU);
+ auto q = at::xpu::getCurrentSYCLQueue();
q.memcpy(
data.get(), storage->data(), std::min(storage->nbytes(), size_bytes));
}
diff --git a/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.cpp b/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.cpp
index 533630175..7f6f33805 100644
--- a/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.cpp
+++ b/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.cpp
@@ -86,7 +86,7 @@ template
inline void _rrelu_with_noise_xpu_train(
Tensor& output,
const Tensor& input_,
- const Tensor& noise_,
+ Tensor& noise_,
const Scalar& lower_,
const Scalar& upper_,
std::optional<Generator> generator) {
@@ -153,7 +153,7 @@ inline void _rrelu_with_noise_xpu_train(
Tensor& rrelu_with_noise_kernel(
const Tensor& self,
- const Tensor& noise,
+ Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
diff --git a/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.h b/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.h
index 8371c38ab..fa7e568ea 100644
--- a/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.h
+++ b/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.h
@@ -7,7 +7,7 @@ namespace at::native::xpu {
TORCH_XPU_API Tensor& rrelu_with_noise_kernel(
const Tensor& self,
- const Tensor& noise,
+ Tensor& noise,
const Scalar& lower,
const Scalar& upper,
bool training,
diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp
index 28d812f2c..0a0c7e718 100644
--- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp
+++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp
@@ -210,7 +210,8 @@ template <
int outer_loop,
bool is_masked,
typename calc_t,
- typename vec_t>
+ typename vec_t,
+ bool is_safe_softmax>
struct DispatchSoftmaxForwardKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
@@ -240,7 +241,8 @@ struct DispatchSoftmaxForwardKernelFunctor
if (index >= dim_size_)
break;
- reg_in[i] = *(reinterpret_cast<const vec_t*>(in_data_ + group_offset + index));
+ reg_in[i] =
+ *(reinterpret_cast<const vec_t*>(in_data_ + group_offset + index));
if constexpr (is_masked) {
auto vec_offset = group_offset + index;
#pragma unroll(vec_size)
@@ -309,6 +311,10 @@ struct DispatchSoftmaxForwardKernelFunctor
if constexpr (LogSoftMax) {
reg_in[i][j] =
static_cast<scalar_t>(reg_in[i][j] - max_value - sum_value);
+ } else if (
+ is_safe_softmax &&
+ max_value == std::numeric_limits<accscalar_t>::lowest()) {
+ reg_in[i][j] = static_cast<scalar_t>(0);
} else if (sum_value == 0) {
reg_in[i][j] = nan_;
} else {
@@ -386,7 +392,8 @@ template <
bool LogSoftMax,
int outer_loop,
bool is_masked = false,
- typename calc_t = decltype(nullptr)>
+ typename calc_t = decltype(nullptr),
+ bool is_safe_softmax = false>
bool dispatch_softmax_forward_kernel(
const scalar_t* in_data,
scalar_t* out_data,
@@ -412,7 +419,8 @@ bool dispatch_softmax_forward_kernel(
outer_loop,
is_masked,
calc_t,
- vec_t>;
+ vec_t,
+ /*is_safe_softmax = */ false>;
int sub_group_num, global_size_row, local_size_row, range, local_size;
int max_group_size =
@@ -460,8 +468,8 @@ bool dispatch_softmax_forward_kernel(
outer_loop,
is_masked,
DummyFunctor,
- vec_t>;
-
+ vec_t,
+ is_safe_softmax>;
int sub_group_num, global_size_row, local_size_row, range, local_size;
int max_group_size =
get_wgroup_size(
@@ -506,7 +514,8 @@ template <
typename IndexType,
bool LogSoftMax,
typename vec_t,
- int align_bytes>
+ int align_bytes,
+ bool is_safe_softmax>
struct SoftmaxForwardKernelFunctor {
void operator()(sycl::nd_item<1> item) const {
IndexType local_id = item.get_local_id(0);
@@ -562,6 +571,10 @@ struct SoftmaxForwardKernelFunctor {
if (LogSoftMax)
out_data_[group_offset + linear_idx] = static_cast<scalar_t>(
in_data_[group_offset + linear_idx] - max_value - sum_value);
+ else if (
+ is_safe_softmax &&
+ max_value == std::numeric_limits<accscalar_t>::lowest())
+ out_data_[group_offset + linear_idx] = static_cast<scalar_t>(0);
else
out_data_[group_offset + linear_idx] = static_cast<scalar_t>(
std::exp(in_data_[group_offset + linear_idx] - max_value) *
@@ -576,6 +589,10 @@ struct SoftmaxForwardKernelFunctor {
if (LogSoftMax)
in_val[j] =
static_cast<scalar_t>(in_val[j] - max_value - sum_value);
+ else if (
+ is_safe_softmax &&
+ max_value == std::numeric_limits<accscalar_t>::lowest())
+ in_val[j] = static_cast<scalar_t>(0);
else
in_val[j] = static_cast<scalar_t>(
std::exp(in_val[j] - max_value) * sum_value);
@@ -610,7 +627,8 @@ template <
typename scalar_t,
typename accscalar_t,
typename IndexType,
- bool LogSoftMax>
+ bool LogSoftMax,
+ bool is_safe_softmax>
void softmax_forward_kernel(
const scalar_t* in_data,
scalar_t* out_data,
@@ -625,7 +643,8 @@ void softmax_forward_kernel(
IndexType,
LogSoftMax,
vec_t,
- align_bytes>;
+ align_bytes,
+ is_safe_softmax>;
int local_size = std::min(
(dim_size + vec_size - 1) / vec_size,
@@ -645,7 +664,8 @@ template <
typename accscalar_t,
typename IndexType,
bool LogSoftMax,
- typename vec_t>
+ typename vec_t,
+ bool is_safe_softmax>
struct SpatialSoftmaxForwardKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
void operator()(sycl::nd_item<3> item) const {
@@ -658,14 +678,16 @@ struct SpatialSoftmaxForwardKernelFunctor
// get max value
accscalar_t max_value[vec_size];
auto offset = local_row_id * inner_size_ + global_col * vec_size;
- vec_t value = *(reinterpret_cast<const vec_t*>(in_data_ + group_offset + offset));
+ vec_t value =
+ *(reinterpret_cast<const vec_t*>(in_data_ + group_offset + offset));
#pragma unroll(vec_size)
for (int j = 0; j < vec_size; ++j) {
max_value[j] = accscalar_t(value[j]);
}
for (int i = local_row_id + block_row_; i < dim_size_; i += block_row_) {
offset = i * inner_size_ + global_col * vec_size;
- value = *(reinterpret_cast<const vec_t*>(in_data_ + group_offset + offset));
+ value =
+ *(reinterpret_cast<const vec_t*>(in_data_ + group_offset + offset));
#pragma unroll(vec_size)
for (int j = 0; j < vec_size; ++j) {
max_value[j] = std::max(max_value[j], accscalar_t(value[j]));
@@ -695,7 +717,8 @@ struct SpatialSoftmaxForwardKernelFunctor
}
for (int i = local_row_id + block_row_; i < dim_size_; i += block_row_) {
offset = i * inner_size_ + global_col * vec_size;
- value = *(reinterpret_cast<const vec_t*>(in_data_ + group_offset + offset));
+ value =
+ *(reinterpret_cast<const vec_t*>(in_data_ + group_offset + offset));
#pragma unroll(vec_size)
for (int j = 0; j < vec_size; ++j) {
sum_value[j] += std::exp(value[j] - max_value[j]);
@@ -736,6 +759,10 @@ struct SpatialSoftmaxForwardKernelFunctor
if (LogSoftMax)
in_val[j] =
static_cast<scalar_t>(in_val[j] - max_value[j] - sum_value[j]);
+ else if (
+ is_safe_softmax &&
+ max_value[j] == -std::numeric_limits<accscalar_t>::infinity())
+ in_val[j] = static_cast<scalar_t>(0);
else
in_val[j] = static_cast<scalar_t>(
std::exp(in_val[j] - max_value[j]) * sum_value[j]);
@@ -787,7 +814,8 @@ template <
typename scalar_t,
typename accscalar_t,
typename IndexType,
- bool LogSoftMax>
+ bool LogSoftMax,
+ bool is_safe_softmax>
void spatial_softmax_forward(
const scalar_t* in_data,
scalar_t* out_data,
@@ -801,7 +829,8 @@ void spatial_softmax_forward(
accscalar_t,
IndexType,
LogSoftMax,
- vec_t>;
+ vec_t,
+ is_safe_softmax>;
int local_size, block_row;
get_wgroup_size_spatial(
@@ -818,7 +847,8 @@ void spatial_softmax_forward(
accscalar_t,
IndexType,
LogSoftMax,
- vec_t>(
+ vec_t,
+ is_safe_softmax>(
in_data,
out_data,
dim_size,
@@ -827,7 +857,6 @@ void spatial_softmax_forward(
local_size,
block_row,
group_num);
-
auto& queue = getCurrentSYCLQueue();
sycl_kernel_submit(global_range, local_range, queue, kfn);
}
@@ -1387,7 +1416,11 @@ void spatial_softmax_backward_kernel(
sycl_kernel_submit(global_range, local_range, queue, kfn);
}
-template <typename scalar_t, typename accscalar_t, bool LogSoftMax>
+template <
+ typename scalar_t,
+ typename accscalar_t,
+ bool LogSoftMax,
+ bool is_safe_softmax>
void spatial_softmax_forward(
const Tensor& output,
const Tensor& input,
@@ -1432,7 +1465,10 @@ void spatial_softmax_forward(
accscalar_t, \
uint32_t, \
LogSoftMax, \
- outer_loop>( \
+ outer_loop, \
+ /*is_masked = */ false, \
+ /*calc_t = */ decltype(nullptr), \
+ /*is_safe_softmax = */ is_safe_softmax>( \
input.const_data_ptr<scalar_t>(), \
output.mutable_data_ptr<scalar_t>(), \
dim_size, \
@@ -1446,7 +1482,8 @@ void spatial_softmax_forward(
scalar_t, \
accscalar_t, \
IndexType, \
- LogSoftMax>( \
+ LogSoftMax, \
+ is_safe_softmax>( \
input.const_data_ptr<scalar_t>(), \
output.mutable_data_ptr<scalar_t>(), \
dim_size, \
@@ -1460,7 +1497,8 @@ void spatial_softmax_forward(
scalar_t, \
accscalar_t, \
IndexType, \
- LogSoftMax>( \
+ LogSoftMax, \
+ is_safe_softmax>( \
input.const_data_ptr<scalar_t>(), \
output.mutable_data_ptr<scalar_t>(), \
dim_size, \
@@ -1749,7 +1787,8 @@ Tensor& masked_softmax_forward(
LogSoftMax, \
outer_loop, \
true, \
- decltype(input_calc)>( \
+ decltype(input_calc), \
+ /*is_safe_softmax = */ false>( \
input.const_data_ptr<scalar_t>(), \
output.mutable_data_ptr