diff --git a/.github/actions/inductor-xpu-e2e-test/action.yml b/.github/actions/inductor-xpu-e2e-test/action.yml index 4f81534fd..4e1fce079 100644 --- a/.github/actions/inductor-xpu-e2e-test/action.yml +++ b/.github/actions/inductor-xpu-e2e-test/action.yml @@ -51,7 +51,7 @@ runs: shell: bash run: | source activate e2e_ci - source .github/scripts/env.sh + source .github/scripts/env.sh ${{ inputs.pytorch }} if [[ ${{ inputs.suite }} == *"torchbench"* ]]; then if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then cd ../ && rm -rf audio && git clone --single-branch -b main https://github.com/pytorch/audio.git @@ -94,7 +94,7 @@ runs: shell: bash run: | source activate e2e_ci - source .github/scripts/env.sh + source .github/scripts/env.sh ${{ inputs.pytorch }} cp .github/scripts/inductor_xpu_test.sh ../pytorch cd ../pytorch diff --git a/.github/ci_expected_accuracy/check_expected.py b/.github/ci_expected_accuracy/check_expected.py index 48543c930..6d321e34b 100644 --- a/.github/ci_expected_accuracy/check_expected.py +++ b/.github/ci_expected_accuracy/check_expected.py @@ -16,14 +16,14 @@ # load csv files -test_data= pd.read_csv(args.csv_file) +test_data= pd.read_csv(args.csv_file, comment='#') # test_data = test_data.reset_index() # make sure indexes pair with number of rows # test_data = test_data.sort_values(by=["name"], ascending=True) test_names = [row["name"] for index, row in test_data.iterrows()] current_path = pathlib.Path(__file__).parent.resolve() refer_file = str(current_path) + "/" + args.category + "_" + args.suite + "_" + args.mode + ".csv" -refer_data= pd.read_csv(refer_file) +refer_data= pd.read_csv(refer_file, comment='#') # refer_data = refer_data.reset_index() # make sure indexes pair with number of rows # refer_data = refer_data.sort_values(by=["name"], ascending=True) refer_names = [row["name"] for index, row in refer_data.iterrows()] diff --git a/.github/ci_expected_accuracy/inductor_huggingface_training.csv b/.github/ci_expected_accuracy/inductor_huggingface_training.csv index a75d3d225..e2d5645e2 100644 --- a/.github/ci_expected_accuracy/inductor_huggingface_training.csv +++ b/.github/ci_expected_accuracy/inductor_huggingface_training.csv @@ -13,7 +13,8 @@ CamemBert,pass,pass,pass,pass,pass DebertaForMaskedLM,pass,pass,pass,pass,pass DebertaForQuestionAnswering,pass,pass,pass,pass,pass DebertaV2ForMaskedLM,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip -DebertaV2ForQuestionAnswering,pass,pass,pass,pass,pass +# Skip DebertaV2ForQuestionAnswering issue: https://github.com/intel/torch-xpu-ops/issues/1216 +DebertaV2ForQuestionAnswering,fail_accuracy,fail_accuracy,fail_accuracy,pass,pass DistilBertForMaskedLM,pass,pass,pass,pass,pass DistilBertForQuestionAnswering,pass,pass,pass,pass,pass DistillGPT2,pass,pass,pass,pass,pass diff --git a/.github/ci_expected_accuracy/inductor_torchbench_inference.csv b/.github/ci_expected_accuracy/inductor_torchbench_inference.csv index 4825aa41f..832923854 100644 --- a/.github/ci_expected_accuracy/inductor_torchbench_inference.csv +++ b/.github/ci_expected_accuracy/inductor_torchbench_inference.csv @@ -102,5 +102,6 @@ torch_multimodal_clip,pass,pass,pass,eager_fail_to_run,eager_fail_to_run tts_angular,pass,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run vgg16,pass,pass,pass,pass,pass vision_maskrcnn,pass,pass,pass,eager_fail_to_run,eager_fail_to_run -yolov3,pass,pass,pass,pass,pass +# Skip yolov3 for known torchbench issue: https://github.com/intel/torch-xpu-ops/issues/1229 
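For context, the `comment='#'` argument added to `pd.read_csv` in check_expected.py is what allows annotation rows like the one above to live inside the expected-accuracy CSVs without breaking parsing. A minimal sketch follows; the column set and file contents are illustrative, not the real files:

    import io
    import pandas as pd

    # Hypothetical miniature of an expected-accuracy CSV with an annotation row.
    csv_text = """name,float32,bfloat16
    # Skip yolov3 for a known torchbench issue
    yolov3,eager_fail_to_run,eager_fail_to_run
    vgg16,pass,pass
    """

    # Everything after '#' is ignored, so a row that begins with '#' is dropped.
    data = pd.read_csv(io.StringIO(csv_text), comment='#')
    print(data["name"].tolist())  # ['yolov3', 'vgg16']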
+yolov3,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run hf_Roberta_base,pass,pass,pass,pass,pass diff --git a/.github/ci_expected_accuracy/inductor_torchbench_training.csv b/.github/ci_expected_accuracy/inductor_torchbench_training.csv index dc766eac0..36a646a14 100644 --- a/.github/ci_expected_accuracy/inductor_torchbench_training.csv +++ b/.github/ci_expected_accuracy/inductor_torchbench_training.csv @@ -102,5 +102,6 @@ torch_multimodal_clip,pass,pass,pass,eager_fail_to_run,eager_fail_to_run tts_angular,pass,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run vgg16,pass,pass,pass,pass,pass vision_maskrcnn,pass,pass,pass,eager_fail_to_run,eager_fail_to_run -yolov3,pass,pass,pass,pass,pass +# Skip yolov3 for known torchbench issue: https://github.com/intel/torch-xpu-ops/issues/1229 +yolov3,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run,eager_fail_to_run hf_Roberta_base,pass,pass,pass,pass,pass diff --git a/.github/scripts/apply_torch_pr.py b/.github/scripts/apply_torch_pr.py index b4b441263..bbe89ed7d 100644 --- a/.github/scripts/apply_torch_pr.py +++ b/.github/scripts/apply_torch_pr.py @@ -12,9 +12,7 @@ # Fallback to CPU for XPU FP64 "https://github.com/pytorch/pytorch/pull/126516", # Modify the tolerance level in TIMM benchmark - "https://github.com/pytorch/pytorch/pull/129735", - # [XPU] Update XPU C Shim Header - "https://github.com/pytorch/pytorch/pull/141086", + "https://github.com/pytorch/pytorch/pull/143739", ] ) parser.add_argument('--extra-pr-list', '-e', nargs='+',default=[]) @@ -59,7 +57,7 @@ def appyly_pr(pr_info, re_apply_msg): pr_file = pr_info["diff_url"].split("/")[-1] urllib.request.urlretrieve(pr_info["diff_url"], pr_file) # apply diff - apply_cmd = "git apply --3way " + pr_file + " && rm -f " + pr_file + apply_cmd = "git apply --3way " + pr_file apply_info = subprocess.Popen(apply_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) apply_message = apply_info.communicate()[0].decode("utf-8") apply_status = apply_info.returncode diff --git a/.github/scripts/env.sh b/.github/scripts/env.sh index 56d8e3930..9cfd67477 100644 --- a/.github/scripts/env.sh +++ b/.github/scripts/env.sh @@ -1,4 +1,11 @@ #!/bin/bash -source /opt/intel/oneapi/compiler/latest/env/vars.sh -source /opt/intel/oneapi/umf/latest/env/vars.sh -source /opt/intel/oneapi/pti/latest/env/vars.sh + +if [ "$1" != "nightly_wheel" ];then + source /opt/intel/oneapi/compiler/latest/env/vars.sh + source /opt/intel/oneapi/umf/latest/env/vars.sh + source /opt/intel/oneapi/pti/latest/env/vars.sh + source /opt/intel/oneapi/ccl/latest/env/vars.sh + source /opt/intel/oneapi/mpi/latest/env/vars.sh +else + echo "Don't need to source DL-Essential for nightly wheel" +fi diff --git a/.github/scripts/inductor_summary.py b/.github/scripts/inductor_summary.py index e11147664..8edd8970c 100644 --- a/.github/scripts/inductor_summary.py +++ b/.github/scripts/inductor_summary.py @@ -3,6 +3,8 @@ import pandas as pd from scipy.stats import gmean from styleframe import StyleFrame, Styler, utils +import numpy as np +from openpyxl import Workbook parser = argparse.ArgumentParser(description="Generate report") parser.add_argument('-s', '--suite', default=["huggingface"], nargs='*', type=str, help='model suite name') @@ -665,6 +667,73 @@ def update_summary(excel, scenario, suite): sf.set_row_height(j, 30) sf.to_excel(sheet_name=suite + '_' + scenario + '_Summary', excel_writer=excel) +def summary_conclusion(scenario, excel): + 
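+    # Collect every per-suite "*_Summary" sheet for the given scenario, stack
+    # them side by side with np.hstack, copy the comparison columns from the
+    # extra suites back into the first block, and write the merged table to a
+    # single "Perf_Summary" / "Acc_Summary" sheet.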
excel.book.save(excel) + df = pd.read_excel(excel, sheet_name = None, header = None) + #df = pd.DataFrame(excel) + if scenario == 'performance': + sheet_names = list(df.keys()) + sheet_names = [s for s in sheet_names if 'Summary' in s and 'performance' in s] + sheet_names.sort() + print(f"Merge excel as below:\n{sheet_names}") + print("\n") + features = [[]] * 21 + for sheet_name in sheet_names: + df_sheet = df[sheet_name] + df_sheet = df_sheet.values + features = np.hstack((features, df_sheet)) + + if len(sheet_names) == 1: + print("sheet not merge") + elif len(sheet_names) == 2: + print("2 sheets merge") + if 'huggingface' in sheet_names[0]: + features[:, 4:5] = features[:, 14:15] + features[:, 6:7] = features[:, 16:17] + else: + features[:, 4:5] = features[:, 14:15] + else: + print("3 sheets merge") + features[:, 4:5] = features[:, 24:25] + features[:, 6:7] = features[:, 16:17] + + df_concat = StyleFrame(pd.DataFrame(features).iloc[:,:10]) + for i in range(10): + df_concat.set_column_width(i, 22) + for j in range(1, 23): + df_concat.set_row_height(j, 30) + df_concat.to_excel(sheet_name='Perf_Summary', excel_writer=excel, index=False) + else: + sheet_names = list(df.keys()) + sheet_names = [s for s in sheet_names if 'Summary' in s and 'accuracy' in s] + sheet_names.sort() + print(f"Merge excel as below:\n{sheet_names}") + print("\n") + features = [[]] * 11 + for sheet_name in sheet_names: + df_sheet = df[sheet_name] + df_sheet = df_sheet.values + features = np.hstack((features, df_sheet)) + if len(sheet_names) == 1: + print("sheet not merge") + elif len(sheet_names) == 2: + print("2 sheets merge") + if 'huggingface' in sheet_names[0]: + features[:, 3:4] = features[:, 12:13] + features[:, 5:6] = features[:, 14:15] + else: + features[:, 3:4] = features[:, 12:13] + else: + print("3 sheets merge") + features[:, 3:4] = features[:, 21:22] + features[:, 5:6] = features[:, 14:15] + + df_concat = StyleFrame(pd.DataFrame(features).iloc[:,:9]) + for i in range(10): + df_concat.set_column_width(i, 22) + for j in range(1, 13): + df_concat.set_row_height(j, 30) + df_concat.to_excel(sheet_name='Acc_Summary', excel_writer=excel, index=False) def generate_report(excel, scenario_list, precision_list, mode_list, suite_list): for sc in scenario_list: @@ -693,8 +762,19 @@ def excel_postprocess(file, scenario, precison, mode, suite): wdt.merge_cells(start_row=1, end_row=1, start_column=13, end_column=16) wb.save(file) + if len(scenario) == 2: + wb.move_sheet("Perf_Summary", -(len(wb.worksheets)-1)) + wb.move_sheet("Acc_Summary", -(len(wb.worksheets)-1)) + elif len(scenario) == 1 and sc == 'accuracy': + wb.move_sheet("Acc_Summary", -(len(wb.worksheets)-1)) + else: + wb.move_sheet("Perf_Summary", -(len(wb.worksheets)-1)) + if __name__ == '__main__': excel = StyleFrame.ExcelWriter('inductor_log/Inductor_E2E_Test_Report.xlsx') generate_report(excel, args.scenario, args.precision, args.mode, args.suite) + for sc in args.scenario: + summary_conclusion(sc, excel) excel_postprocess(excel, args.scenario, args.precision, args.mode, args.suite) + excel.close() diff --git a/.github/scripts/spec.py b/.github/scripts/spec.py new file mode 100644 index 000000000..b8bf6d59a --- /dev/null +++ b/.github/scripts/spec.py @@ -0,0 +1,7 @@ +import torch + +DEVICE_NAME = 'xpu' + +MANUAL_SEED_FN = torch.xpu.manual_seed +EMPTY_CACHE_FN = torch.xpu.empty_cache +DEVICE_COUNT_FN = torch.xpu.device_count diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml new file mode 100644 index 
000000000..b21864e9b --- /dev/null +++ b/.github/workflows/_linux_transformers.yml @@ -0,0 +1,355 @@ +name: Linux Transformers Test + +on: + pull_request: + branches: + - main + paths: + - '.github/scripts/spec.py' + - '.github/workflows/_linux_transformers.yml' + workflow_dispatch: + inputs: + pytorch: + required: false + type: string + default: 'nightly' + description: Pytorch branch/commit + python: + required: false + type: string + default: '3.10' + description: Python version + runner: + required: true + type: string + default: 'linux.idc.xpu' + description: Runner label + driver: + required: false + type: string + default: 'lts' + description: Driver lts/rolling + nightly_whl: + required: false + type: string + default: '' + description: Pytorch nightly wheel version + transformers: + required: false + type: string + default: 'v4.47.0' + description: Transformers version + +permissions: read-all + +jobs: + Torch-XPU-Transformers-Tests: + runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }} + env: + NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} + DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} + python: ${{ inputs.python != '' && inputs.python || '3.10' }} + pytorch: ${{ inputs.pytorch != '' && inputs.pytorch || 'nightly' }} + transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.47.0' }} + PYTORCH_DEBUG_XPU_FALLBACK: '1' + TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py' + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + with: + path: torch-xpu-ops + - name: Checkout Transformers + uses: actions/checkout@v4 + with: + repository: huggingface/transformers + ref: ${{ env.transformers }} + path: transformers + - name: Prepare OS environment + run: | + sudo apt-get update + sudo apt-get install -y \ + espeak-ng \ + git-lfs \ + pkg-config \ + libavcodec-dev \ + libavdevice-dev \ + libavfilter-dev \ + libavformat-dev \ + libavutil-dev \ + libswresample-dev \ + libswscale-dev + git lfs install + - name: Prepare Conda ENV + run: | + which conda && conda clean -ay + conda remove --all -y -n huggingface_transformers_test || rm -rf $(dirname ${CONDA_EXE})/../envs/huggingface_transformers_test + conda create -y -n huggingface_transformers_test python=${{ env.python }} + source activate huggingface_transformers_test + - name: Prepare Stock XPU Pytorch + run: | + pwd + source activate huggingface_transformers_test + if [ -z "${{ inputs.nightly_whl }}" ]; then + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + else + pip install torch==$(echo ${{ inputs.nightly_whl }}) torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + fi + - name: Prepare Transformers + run: | + pwd + source activate huggingface_transformers_test + cd transformers + pip install -e . 
+ pip install -e ".[dev-torch,testing,video]" + rm -rf tests_log && mkdir -p tests_log + rm -rf reports + cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./ + - name: Report installed versions + run: | + source activate huggingface_transformers_test + echo "pip installed packages:" + pip list | tee ${{ github.workspace }}/transformers/tests_log/pip_list.txt + echo "lspci gpu devices:" + lspci -d ::0380 | tee ${{ github.workspace }}/transformers/tests_log/lspci_0380.txt + echo "GPU render nodes:" + cat /sys/class/drm/render*/device/device | tee ${{ github.workspace }}/transformers/tests_log/device_IDs.txt + echo "xpu-smi output:" + xpu-smi discovery -y --json --dump -1 + - name: Sanitry check installed packages + run: | + source activate huggingface_transformers_test + # These checks are to exit earlier if for any reason Transformers + # reinstalled torch packages back to CUDA versions (not expected). + pip show torch | grep Version | grep xpu + pip show torchaudio | grep Version | grep xpu + pip show torchvision | grep Version | grep xpu + python -c 'import torch; exit(not torch.xpu.is_available())' + - name: Run -k backbone tests + env: + TEST_CASE: 'tests_backbone' + run: | + source activate huggingface_transformers_test + cd transformers + python3 -m pytest -rsf --make-reports=$TEST_CASE -k backbone tests || \ + (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV) + - name: Run tests/*.py + env: + TEST_CASE: 'tests_py' + run: | + source activate huggingface_transformers_test + cd transformers + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/*.py || true + - name: Run tests/benchmark + env: + TEST_CASE: 'tests_benchmark' + run: | + source activate huggingface_transformers_test + cd transformers + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/benchmark || true + - name: Run tests/generation + env: + TEST_CASE: 'tests_generation' + run: | + source activate huggingface_transformers_test + cd transformers + # Excluding tests due to: + # * torch.distributed.* not yet supported by XPU + pattern="not TestFSDPGeneration" + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/generation -k "$pattern" || true + - name: Run tests/models + env: + TEST_CASE: 'tests_models' + run: | + source activate huggingface_transformers_test + cd transformers + # Excluding tests due to: + # * https://github.com/huggingface/transformers/issues/35252 (CUDA specific tests) + # * https://github.com/pytorch/pytorch/issues/140965 (aten::_linalg_eigvals) + pattern=" \ + not test_model_parallelization and \ + not test_model_parallel_equal_results and \ + not test_resize_embeddings_untied and \ + not test_resize_tokens_embeddings" + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/models -k "$pattern" || true + - name: Run tests/pipelines + env: + TEST_CASE: 'tests_pipelines' + run: | + source activate huggingface_transformers_test + cd transformers + # Some tests are known to fail w/o clear pattern + # TODO: drop ||true after triage and fixes + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/pipelines || true + - name: Run tests/trainer + env: + TEST_CASE: 'tests_trainer' + run: | + source activate huggingface_transformers_test + cd transformers + # Excluding tests due to: + # * Some ray tests hang, reason unknown + # * torch.distributed.* not yet supported by XPU + pattern=" \ + not ray and \ + not TestTrainerDistributed and \ + not TestTrainerDistributedXPU and \ + not TestFSDPTrainer" + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/trainer -k 
"$pattern" || \ + (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV) + - name: Run tests/utils + env: + TEST_CASE: 'tests_utils' + run: | + source activate huggingface_transformers_test + cd transformers + # Excluding tests due to: + # * Network proxy connection issue, reason unknown + pattern="not test_load_img_url_timeout" + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/utils -k "$pattern" || \ + (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV) + - name: Check for errors in tests + run: | + FAILED_CASES=$(echo $FAILED_CASES | sed 's/^,//') + echo "Failed cases: [$(echo $FAILED_CASES | sed 's/,/, /g')]" + test -z "$FAILED_CASES" + - name: Print results table + if: ${{ ! cancelled() }} + run: | + # Helper function to return number preceeding given pattern, i.e: + # === 25 failed, 11 warnings, 0 errors === + # Call as follows: + # parse_stat $line "failed" + function parse_stat() { + stat=$(cat $1 | grep $2 | sed "s/.* \([0-9]*\) $2.*/\1/") + if [ -n "$stat" ]; then echo $stat; else echo "0"; fi + } + cd transformers + { + echo "### Results" + echo "| Test group | Errors | Failed | Deselected | Passed | Skipped |" + echo "| --- | --- | --- | --- | --- | --- |" + for stat in $(find reports -name stats.txt); do + # Each stat.txt is located in: reports/$test_group/stats.txt + test_group=$(echo $stat | cut -f 2 -d/) + # Get failed, passed, skipped, etc. counters + failed=$(parse_stat $stat failed) + passed=$(parse_stat $stat passed) + deselected=$(parse_stat $stat deselected) + skipped=$(parse_stat $stat skipped) + warnings=$(parse_stat $stat warnings) + errors=$(parse_stat $stat errors) + echo "| $test_group | $errors | $failed | $deselected | $passed | $skipped |" + done + } >> $GITHUB_STEP_SUMMARY + - name: Print failure lines + if: ${{ ! cancelled() }} + run: | + cd transformers + { + echo "### Failure lines" + echo "| Test group |File | Error | Comment |" + echo "| --- | --- | --- | --- |" + rm -rf _failures.txt + for failure in $(find reports -name failures_line.txt); do + # Each failure_line.txt is located in: reports/$test_group/failure_line.txt + test_group=$(echo $failure | cut -f2 -d/) + tail -n +2 $failure | sed "s/^/$test_group /" >> _failures.txt + done + # failures_line.txt file does not have test case information, + # so we can just sort the output and report uniq values + sort _failures.txt | uniq > _failures_uniq.txt + while read line; do + test_group=$(echo $line | cut -f1 -d" ") + file=$(echo $line | cut -f2 -d" " | sed "s/\(.*\):$/\1/") + error=$(echo $line | cut -f3 -d" " | sed "s/\(.*\):$/\1/") + # Failure comments often contain special characters which complicate + # parsing failure lines. But fortunately we know for sure where comments + # start. So we just output all contents starting from this position and + # wrap everything in
<pre> to avoid collisions with Markdown formatting.
+              comment="<pre>$(echo $line | cut -f4- -d' ' | sed 's/\(.*\):$/\1/')</pre>
" + echo "| $test_group | $file | $error | $comment |" + done <_failures_uniq.txt + } >> $GITHUB_STEP_SUMMARY + - name: Print not implemented XPU backend ops + run: | + cd transformers + { + echo "### Not implemented ops" + echo "| Test group | Operator | Status |" + echo "| --- | --- | --- |" + rm -rf _ops.txt && touch _ops.txt + for log in $(find reports -name failures_line.txt); do + # Each failure_line.txt is located in: reports/$test_group/failure_line.txt + test_group=$(echo $log | cut -f2 -d/) + ops=$(grep NotImplementedError $log | grep "for the XPU device" | sed "s/.*The operator '\(.*\)' is not.*/\1/") + for op in $ops; do + echo "| $test_group |
<pre>$op</pre>
| not implemented |" >> _ops.txt + done + done + for log in $(find reports -name warnings.txt); do + # Each warnings.txt is located in: reports/$test_group/warnings.txt + test_group=$(echo $log | cut -f2 -d/) + ops=$(grep UserWarning $log | grep "on the XPU backend" | sed "s/.*The operator '\(.*\) on the XPU.*/\1/") + for op in $ops; do + echo "| $test_group |
<pre>$op</pre>
| fallback to CPU happens |" >> _ops.txt + done + done + sort _ops.txt | uniq + } >> $GITHUB_STEP_SUMMARY + - name: Print annotations + if: ${{ ! cancelled() }} + run: | + source activate huggingface_transformers_test + { + echo "### Annotations" + echo "| | |" + echo "| --- | --- |" + echo "| jobs.$GITHUB_JOB.versions.os | $(source /etc/os-release && echo $VERSION_ID) |" + echo "| jobs.$GITHUB_JOB.versions.linux-kernel | $(uname -r) |" + echo "| jobs.$GITHUB_JOB.versions.python | $(python --version | cut -f2 -d' ') |" + packages=" \ + level-zero \ + libigc1 \ + libigc2 \ + libze1 \ + libze-intel-gpu1 \ + intel-i915-dkms \ + intel-level-zero-gpu \ + intel-opencl-icd" + for package in $packages; do + package_version=$(dpkg -l | grep $package | grep ii | head -1 | sed "s/ */ /g" | cut -f3 -d" ") + echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |" + done + packages="accelerate \ + numpy \ + torch \ + torchaudio \ + torchvision \ + transformers" + for package in $packages; do + package_version=$(python -c "import $package; print($package.__version__)" || true) + echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |" + done + # printing annotations for GPU cards + var="[$(cat /sys/class/drm/render*/device/vendor || true)]" + echo "| jobs.$GITHUB_JOB.drm.render_nodes_vendor_ids | $(echo $var | sed 's/ /,/g') |" + var="[$(cat /sys/class/drm/render*/device/device || true)]" + echo "| jobs.$GITHUB_JOB.drm.render_nodes_device_ids | $(echo $var | sed 's/ /,/g') |" + var=$(python -c "import torch; print(torch.version.xpu)" || true) + echo "| jobs.$GITHUB_JOB.torch.version.xpu | $var |" + var=$(python -c "import torch; print(torch.xpu.device_count())" || true) + echo "| jobs.$GITHUB_JOB.torch.xpu.device_count | $var |" + # printing annotations with key environment variables + echo "| jobs.$GITHUB_JOB.env.ZE_AFFINITY_MASK | $ZE_AFFINITY_MASK |" + echo "| jobs.$GITHUB_JOB.env.NEOReadDebugKeys | $NEOReadDebugKeys |" + echo "| jobs.$GITHUB_JOB.env.PYTORCH_ENABLE_XPU_FALLBACK | $PYTORCH_ENABLE_XPU_FALLBACK |" + echo "| jobs.$GITHUB_JOB.env.PYTORCH_DEBUG_XPU_FALLBACK | $PYTORCH_DEBUG_XPU_FALLBACK |" + } >> $GITHUB_STEP_SUMMARY + - name: Upload Test log + if: ${{ ! cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: Torch-XPU-Transformers-Log-${{ github.event.pull_request.number || github.sha }} + path: | + ${{ github.workspace }}/transformers/reports + ${{ github.workspace }}/transformers/tests_log diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index d2f717230..b724d4259 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -65,19 +65,21 @@ jobs: conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y source activate xpu_op_${ZE_AFFINITY_MASK} cd ../ && rm -rf pytorch - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout $(echo ${{ inputs.pytorch }} |sed 's/^nightly_wheel$/nightly/') - # apply PRs for stock pytorch pip install requests - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py - git status && git show -s - git submodule sync && git submodule update --init --recursive - if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then - echo "Don't replace torch-xpu-ops!" 
- else - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + git clone https://github.com/pytorch/pytorch pytorch + if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd pytorch && git checkout $(echo ${{ inputs.pytorch }}) + # apply PRs for stock pytorch + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" + else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi fi - name: Triton Installation run: | @@ -96,15 +98,15 @@ jobs: - name: Build Pytorch XPU run: | source activate xpu_op_${ZE_AFFINITY_MASK} - source .github/scripts/env.sh + source .github/scripts/env.sh ${{ inputs.pytorch }} pip install mkl-static==2025.0.1 mkl-include==2025.0.1 - cd ../pytorch if [[ ${{ inputs.abi }} == '0' ]]; then export _GLIBCXX_USE_CXX11_ABI=0 else export _GLIBCXX_USE_CXX11_ABI=1 fi if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then + cd ../pytorch export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"} pip install -r requirements.txt WERROR=1 python setup.py bdist_wheel @@ -112,12 +114,21 @@ jobs: git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. else pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') + cd ../pytorch + git reset --hard && git checkout ${TORCH_COMMIT_ID} + TORCH_XPU_OPS_COMMIT=$(> "${GITHUB_ENV}" echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(> "${GITHUB_ENV}" + echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" diff --git a/.github/workflows/nightly_ondemand_rolling.yml b/.github/workflows/nightly_ondemand_rolling.yml index 0a27b2b50..7515c5003 100644 --- a/.github/workflows/nightly_ondemand_rolling.yml +++ b/.github/workflows/nightly_ondemand_rolling.yml @@ -63,7 +63,7 @@ permissions: read-all concurrency: group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.keep_torch_xpu_ops }}-${{ inputs.ut }}-${{ inputs.triton }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }} - cancel-in-progress: true + cancel-in-progress: ${{ github.event_name != 'schedule' }} jobs: Linux-Nightly-Ondemand-UT-Tests-Rolling: @@ -158,7 +158,7 @@ jobs: fi echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(> "${GITHUB_ENV}" + echo 
"TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" diff --git a/.github/workflows/nightly_ondemand_whl.yml b/.github/workflows/nightly_ondemand_whl.yml index 6b8d0b58f..a742f2b2d 100644 --- a/.github/workflows/nightly_ondemand_whl.yml +++ b/.github/workflows/nightly_ondemand_whl.yml @@ -53,11 +53,11 @@ permissions: read-all concurrency: group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.ut }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }} - cancel-in-progress: true + cancel-in-progress: ${{ github.event_name != 'schedule' }} jobs: Linux-Nightly-Ondemand-UT-WHL-Tests: - if: github.event_name == 'schedule' || ${{ inputs.ut_suite }} + if: github.event_name == 'schedule' || ${{ inputs.ut }} uses: ./.github/workflows/_linux_ut.yml with: ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu' || inputs.ut }} @@ -75,19 +75,20 @@ jobs: ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu' || inputs.ut }} python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} outputs: - TORCH_BRANCH_ID: ${{ steps.pinned.outputs.TORCH_BRANCH_ID }} - TORCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCH_COMMIT_ID }} - DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }} - KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }} - BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }} - OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }} - GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }} + TORCH_BRANCH_ID: ${{ steps.installed.outputs.TORCH_BRANCH_ID }} + TORCH_COMMIT_ID: ${{ steps.installed.outputs.TORCH_COMMIT_ID }} + TORCH_XPU_OPS_COMMIT: ${{ steps.installed.outputs.TORCH_XPU_OPS_COMMIT }} TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }} TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }} TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }} TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }} TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }} TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }} + DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }} + KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }} + BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }} + OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }} + GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }} TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }} steps: - name: Checkout torch-xpu-ops @@ -101,38 +102,43 @@ jobs: pip install mkl-static==2025.0.1 mkl-include==2025.0.1 pip install pandas scipy tqdm - name: Prepare Stock Pytorch + id: installed run: | pwd source activate e2e_ci - source .github/scripts/env.sh + pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + echo "TORCH_BRANCH_ID=$(python -c 'import torch; print(torch.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + TORCH_COMMIT_ID=$(python -c 
'import torch; print(torch.version.git_version)') + echo "TORCH_COMMIT_ID=${TORCH_COMMIT_ID}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" cd ../ && rm -rf pytorch git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout $(echo ${{ env.pytorch }} |sed 's/^nightly_wheel$/nightly/') + cd pytorch && git checkout ${TORCH_COMMIT_ID} # apply PRs for stock pytorch pip install requests - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + # python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py git status && git show -s pip install -r requirements.txt - cd ../ - pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu + TORCH_XPU_OPS_COMMIT=$(> "${GITHUB_ENV}" + rm -rf third_party/torch-xpu-ops + git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops + cd third_party/torch-xpu-ops + git checkout ${TORCH_XPU_OPS_COMMIT} - name: Identify pinned versions id: pinned run: | source activate e2e_ci - source .github/scripts/env.sh + echo "TORCHVISION_COMMIT_ID=$(python -c 'import torchvision; print(torchvision.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + echo "TORCHAUDIO_COMMIT_ID=$(python -c 'import torchaudio; print(torchaudio.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + echo "TRITON_COMMIT_ID=$(python -c 'import triton; print(triton.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" cd ../pytorch - echo "TRITON_COMMIT_ID=$(pip list |grep -w pytorch-triton-xpu |awk '{print $2}')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_BRANCH_ID=nightly" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=$(pip list |grep -w torch |awk '{print $2}')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(> "${GITHUB_ENV}" - echo "TORCHVISION_COMMIT_ID=$(pip list |grep -w torchvision |awk '{print $2}')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCHAUDIO_COMMIT_ID=$(pip list |grep -w torchaudio |awk '{print $2}')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "DRIVER_VERSION=$(dkms status 2>&1 |grep 'intel-i915-dkms' |sed 's/.*\///;s/,.*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "BUNDLE_VERSION=$(dpcpp --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + echo "BUNDLE_VERSION=$(pip list |grep cmplr |head -n 1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" . 
/etc/os-release echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" @@ -271,6 +277,7 @@ jobs: repo="${{ github.repository }}" TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_BRANCH_ID }}" TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_COMMIT_ID }}" + TORCH_XPU_OPS_COMMIT="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_XPU_OPS_COMMIT }}" DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.DRIVER_VERSION }}" KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.KERNEL_VERSION }}" BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.BUNDLE_VERSION }}" @@ -307,7 +314,7 @@ jobs: fi # Test report echo -e "**${test_status}** $test_type WHL Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt - printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${GITHUB_WORKFLOW_SHA:0:7} on ${GITHUB_REF_NAME} | " >> ${{ github.workspace }}/report.txt + printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${TORCH_XPU_OPS_COMMIT:0:7} on pinned | " >> ${{ github.workspace }}/report.txt printf "[${TORCH_COMMIT_ID:0:7}](https://github.com/pytorch/pytorch/commit/${TORCH_COMMIT_ID:0:7}) on $TORCH_BRANCH_ID | " >> ${{ github.workspace }}/report.txt echo -e "[${TRITON_COMMIT_ID:0:7}](https://github.com/intel/intel-xpu-backend-for-triton/commit/${TRITON_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt printf "Transformers | Timm | Torchbench | Torchvision | Torchaudio\n--- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 53f93e629..fe6e428f5 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -90,7 +90,7 @@ jobs: cd ../pytorch echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}" echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(> "${GITHUB_ENV}" + echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}" echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}" echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}" echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}" @@ -144,9 +144,9 @@ jobs: run: | rm -rf ${{ github.workspace }}/upload_files cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files - failed_case=$(grep "Real failed: models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log |wc -l || true) + failed_case=$(grep "Real failed models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log |wc -l || true) if [ ${failed_case} -ne 0 ];then - grep -E "Real failed: models: [1-9]|Summary for" ${{ github.workspace }}/summary_accuracy.log + grep -E "Real failed models: [1-9]|Summary for" ${{ github.workspace }}/upload_files/summary_accuracy.log exit 1 fi - name: Upload Inductor XPU E2E Data diff --git a/src/ATen/native/transformers/Attention.cpp b/src/ATen/native/transformers/Attention.cpp index bb8b4602b..3090dfbee 100644 --- a/src/ATen/native/transformers/Attention.cpp +++ b/src/ATen/native/transformers/Attention.cpp @@ -93,36 +93,6 @@ static bool check_for_seq_len_1_nested_tensor( return true; } -int64_t _fused_sdp_choice_xpu( - const 
Tensor& query, - const Tensor& key, - const Tensor& value, - const std::optional& attn_mask_, - double dropout_p, - bool is_causal, - std::optional scale, - bool enable_gqa) { - // We have implemented efficient_attention backend with xetla, flash_attention - // backend is not supported now, which will be implemented in the future. So - // we provide two backends here. - sdp::sdp_params kernel_params{ - query, key, value, attn_mask_, dropout_p, is_causal, enable_gqa}; - // Because TORCHCHECK checks if condition is true we negate debug so that - // The statements will be printed when debug is true - bool print_debug = false; - sdp::SDPBackend backend = - sdp::can_use_mem_efficient_attention(kernel_params, print_debug) - ? sdp::SDPBackend::efficient_attention - : sdp::SDPBackend::math; - if (backend == sdp::SDPBackend::error) { - TORCH_CHECK( - false, - "No viable backend for scaled_dot_product_attention was found. ", - "This is likely due to turning off both the math kernel and the fused kernels."); - } - return static_cast(backend); -} - std::tuple native_multi_head_attention_xpu( const Tensor& query, const Tensor& key, @@ -204,8 +174,12 @@ std::tuple native_multi_head_attention_xpu( value.view({value.size(0), -1, num_head, dim_per_head}).transpose(1, 2); sdp::sdp_params kernel_params{q, k, v, mask, 0.0, false, false}; - auto backend = static_cast( - _fused_sdp_choice_xpu(q, k, v, mask, 0.0, false, {}, false)); + + sdp::SDPBackend backend = sdp::SDPBackend::math; + if (_fused_sdp_choice_stub.is_device_supported(q.device().type())) { + backend = static_cast(_fused_sdp_choice_stub( + q.device().type(), q, k, v, mask, 0.0, false, std::nullopt, false)); + } // strides from packed projection for nested tensors when seq_len is 1 will // be and will trigger a contiguous call in the kernel, so we prevent this diff --git a/src/ATen/native/transformers/SDPUtils.cpp b/src/ATen/native/transformers/SDPUtils.cpp index db4409493..eca5f9829 100644 --- a/src/ATen/native/transformers/SDPUtils.cpp +++ b/src/ATen/native/transformers/SDPUtils.cpp @@ -4,6 +4,8 @@ namespace sdp { +using c10::array_of; + bool check_all_tensors_on_device(sdp_params const& params, bool debug) { // Check that all tensors are on the GPU device // This should be handled by the stub dispatch, but whe call diff --git a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp index f0620c530..4a34e70d1 100644 --- a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp +++ b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp @@ -29,7 +29,7 @@ Tensor adaptive_avg_pool2d_backward_xpu( (input.ndimension() == 3 || input.ndimension() == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); - globalContext().alertNotDeterministic("_adaptive_avg_pool2d_backward"); + globalContext().alertNotDeterministic("adaptive_avg_pool2d_backward_xpu"); Tensor grad_input; if (input.numel() != 0) { diff --git a/src/ATen/native/xpu/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/DilatedMaxPool2d.cpp index 600d29e85..a08227b47 100644 --- a/src/ATen/native/xpu/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/DilatedMaxPool2d.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -40,6 +41,62 @@ TORCH_IMPL_FUNC(max_pool2d_with_indices_out_xpu) bool ceil_mode, const Tensor& output, const Tensor& indices) { + const int kH = safe_downcast(kernel_size[0]); + const int kW = kernel_size.size() == 1 + ? 
kH + : safe_downcast(kernel_size[1]); + const int padH = safe_downcast(padding[0]); + const int padW = + padding.size() == 1 ? padH : safe_downcast(padding[1]); + + const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1; + const int64_t nInputPlane = input.size(-3); + const int64_t inputHeight = input.size(-2); + const int64_t inputWidth = input.size(-1); + + const int64_t outputHeight = output.size(-2); + const int64_t outputWidth = output.size(-1); + if (outputHeight == 1 && outputWidth == 1 && inputHeight <= kH && + inputWidth <= kW && padH == 0 && padW == 0) { + auto smf = input.suggest_memory_format(); + Tensor input_ = input.contiguous(smf); + bool is_3d = input.ndimension() == 3; + Tensor indices_, output_; + if (is_3d) { + indices_ = indices.contiguous(); + output_ = output.contiguous(); + } else { + indices_ = indices.contiguous(smf); + output_ = output.contiguous(smf); + } + if (!is_3d) { + input_.resize_({nbatch, nInputPlane, 1, inputHeight * inputWidth}, smf); + output_.resize_( + {nbatch, nInputPlane, 1, outputHeight * outputWidth}, smf); + indices_.resize_( + {nbatch, nInputPlane, 1, outputHeight * outputWidth}, smf); + at::max_outf(input_, 3, true, output_, indices_); + } else { + at::max_outf(input_, 2, true, output_, indices_); + } + + if (!is_3d) { + input_.resize_({nbatch, nInputPlane, inputHeight, inputWidth}, smf); + output_.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, smf); + indices_.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, smf); + } + + if ((is_3d && !indices.is_contiguous()) || + (!is_3d && !indices.is_contiguous(smf))) { + indices.copy_(indices_); + } + + if ((is_3d && !output.is_contiguous()) || + (!is_3d && !output.is_contiguous(smf))) { + output.copy_(output_); + } + return; + } xpu::max_pool2d_with_indices_kernel( input, kernel_size, diff --git a/src/ATen/native/xpu/RNN.cpp b/src/ATen/native/xpu/RNN.cpp new file mode 100644 index 000000000..74152f293 --- /dev/null +++ b/src/ATen/native/xpu/RNN.cpp @@ -0,0 +1,46 @@ +#include +#include + +namespace at::native { + +std::tuple _thnn_fused_lstm_cell_xpu( + const Tensor& input_gates, + const Tensor& hidden_gates, + const Tensor& cx, + const std::optional& input_bias_opt, + const std::optional& hidden_bias_opt) { + return native::xpu::_thnn_fused_lstm_cell_kernel( + input_gates, hidden_gates, cx, input_bias_opt, hidden_bias_opt); +} + +std::tuple _thnn_fused_lstm_cell_backward_xpu( + const std::optional& grad_hy_opt, + const std::optional& grad_cy_opt, + const Tensor& cx, + const Tensor& cy, + const Tensor& workspace, + bool has_bias) { + return native::xpu::_thnn_fused_lstm_cell_backward_kernel( + grad_hy_opt, grad_cy_opt, cx, cy, workspace, has_bias); +} + +std::tuple _thnn_fused_gru_cell_xpu( + const Tensor& input_gates, + const Tensor& hidden_gates, + const Tensor& hx, + const std::optional& input_bias, + const std::optional& hidden_bias) { + return native::xpu::_thnn_fused_gru_cell_kernel( + input_gates, hidden_gates, hx, input_bias, hidden_bias); +} + +std::tuple +_thnn_fused_gru_cell_backward_xpu( + const Tensor& grad_hy, + const Tensor& workspace, + bool has_bias) { + return native::xpu::_thnn_fused_gru_cell_backward_kernel( + grad_hy, workspace, has_bias); +} + +} // namespace at::native diff --git a/src/ATen/native/xpu/RreluWithNoise.cpp b/src/ATen/native/xpu/RreluWithNoise.cpp index f66833983..fb4e2c333 100644 --- a/src/ATen/native/xpu/RreluWithNoise.cpp +++ b/src/ATen/native/xpu/RreluWithNoise.cpp @@ -6,7 +6,7 @@ namespace native { Tensor& 
rrelu_with_noise_out_xpu( const Tensor& self, - const Tensor& noise, + Tensor& noise, const Scalar& lower, const Scalar& upper, bool training, @@ -18,7 +18,7 @@ Tensor& rrelu_with_noise_out_xpu( Tensor rrelu_with_noise_xpu( const Tensor& self, - const Tensor& noise, + Tensor& noise, const Scalar& lower, const Scalar& upper, bool training, @@ -30,7 +30,7 @@ Tensor rrelu_with_noise_xpu( Tensor& rrelu_with_noise_xpu_( Tensor& self, - const Tensor& noise, + Tensor& noise, const Scalar& lower, const Scalar& upper, bool training, diff --git a/src/ATen/native/xpu/SoftMax.cpp b/src/ATen/native/xpu/SoftMax.cpp index e816d48c8..f155165ce 100644 --- a/src/ATen/native/xpu/SoftMax.cpp +++ b/src/ATen/native/xpu/SoftMax.cpp @@ -76,6 +76,17 @@ TORCH_IMPL_FUNC(log_softmax_xpu_out) xpu::_log_softmax_kernel(input, dim, half_to_float, output); } +Tensor _safe_softmax_xpu( + const Tensor& self, + int64_t dim, + std::optional dtype) { + // TODO: uncomment after XPU softmax support half_to_float=true + // if (self.scalar_type() == ScalarType::Half && dtype == ScalarType::Float) + // return xpu::_safe_softmax_kernel(self, dim_, true); + Tensor converted = dtype.has_value() ? self.toType(dtype.value()) : self; + return xpu::_safe_softmax_kernel(converted, dim, false); +} + Tensor masked_softmax_xpu( const Tensor& input_, const Tensor& mask_, diff --git a/src/ATen/native/xpu/UpSampleBilinear2d.cpp b/src/ATen/native/xpu/UpSampleBilinear2d.cpp index ee8c37ac0..aec707193 100644 --- a/src/ATen/native/xpu/UpSampleBilinear2d.cpp +++ b/src/ATen/native/xpu/UpSampleBilinear2d.cpp @@ -30,6 +30,7 @@ TORCH_IMPL_FUNC(upsample_bilinear2d_backward_out_xpu) std::optional scales_h, std::optional scales_w, const Tensor& grad_input) { + globalContext().alertNotDeterministic("upsample_bilinear2d_backward_out_xpu"); xpu::upsample_bilinear2d_backward_out_kernel( grad_input, grad_output, diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 8492a98be..72f2aacdd 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -184,9 +184,7 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "_linalg_svd.U", "lu_unpack.out", "ormqr", - "_scaled_dot_product_efficient_attention", "_scaled_mm", - "_thnn_fused_gru_cell", "_to_sparse_csr", "triangular_solve.X", "_validate_compressed_sparse_indices", diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index e21c0160c..d94db11c9 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -5,9 +5,9 @@ #pragma GCC diagnostic ignored "-Wreturn-type" #include +#include #include #include -#include #include #include diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h index d530560e6..b07041fcb 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp index 57ac0d114..fb034f988 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp +++ b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp @@ -531,6 +531,8 @@ Tensor embedding_bag_backward_xpu_max( const Tensor& max_indices_t, int64_t num_weights, int64_t padding_idx) { + globalContext().alertNotDeterministic("embedding_bag_backward_xpu_max"); + auto max_indices = max_indices_t.contiguous(); auto grad_weight = 
at::zeros({num_weights, grad.size(1)}, grad.options()); int64_t stride = grad_weight.stride(0); diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp index d429ecfbe..bcbd50c42 100644 --- a/src/ATen/native/xpu/sycl/Indexing.cpp +++ b/src/ATen/native/xpu/sycl/Indexing.cpp @@ -207,7 +207,7 @@ void index_select_kernel( }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), - AT_EXPAND(AT_FLOAT8_TYPES), + AT_EXPAND(AT_FLOAT8_TYPES), kComplexHalf, kHalf, kBool, @@ -1081,7 +1081,8 @@ void take_kernel(TensorIterator& iter, const TensorBase& input) { canUse32BitIndexMath(input) ? ScalarType::Int : ScalarType::Long, "take_xpu_index", [&] { - const scalar_t* indexed_ptr = input.template const_data_ptr(); + const scalar_t* indexed_ptr = + input.template const_data_ptr(); TakeFunctor f(indexed_ptr); take_put_kernel_template(iter, input, f); }); @@ -1114,6 +1115,14 @@ void put_kernel( TensorIterator& iter, const TensorBase& output, const bool accumulate) { + // Nondeterministic when index contains duplicate entries and we do not + // accumulate If we accumulate on GPU, we use atomicGPUAdd, which is + // non-deterministic + if (!accumulate || + (accumulate && iter.tensor(1).device().type() == DeviceType::XPU)) { + at::globalContext().alertNotDeterministic("put_"); + } + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( at::ScalarType::BFloat16, at::ScalarType::Half, diff --git a/src/ATen/native/xpu/sycl/LerpKernels.cpp b/src/ATen/native/xpu/sycl/LerpKernels.cpp index 1648f193b..9d7551290 100644 --- a/src/ATen/native/xpu/sycl/LerpKernels.cpp +++ b/src/ATen/native/xpu/sycl/LerpKernels.cpp @@ -57,15 +57,29 @@ struct LerpScalarFunctor { opmath_t weight_val_; }; +void lerp_scalar_kernel( + at::TensorIteratorBase& iter, + const c10::Scalar& weight); + void lerp_tensor_kernel(at::TensorIteratorBase& iter) { auto dtype = iter.common_dtype(); if (at::isComplexType(dtype)) { AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "lerp_xpu", [&] { + if (iter.is_cpu_scalar(3)) { + auto weight_val = iter.scalar_value(3); + iter.remove_operand(3); + return lerp_scalar_kernel(iter, weight_val); + } gpu_kernel(iter, LerpTensorComplexFunctor()); }); } else { AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, dtype, "lerp_xpu", [&] { + if (iter.is_cpu_scalar(3)) { + auto weight_val = iter.scalar_value(3); + iter.remove_operand(3); + return lerp_scalar_kernel(iter, weight_val); + } gpu_kernel(iter, LerpTensorFunctor()); }); } diff --git a/src/ATen/native/xpu/sycl/LossCTCKernels.cpp b/src/ATen/native/xpu/sycl/LossCTCKernels.cpp index 9d26a48c7..3dd44968d 100644 --- a/src/ATen/native/xpu/sycl/LossCTCKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossCTCKernels.cpp @@ -1248,7 +1248,7 @@ Tensor ctc_loss_backward_kernel( bool zero_infinity) { // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage - globalContext().alertNotDeterministic("ctc_loss_backward_kernel"); + globalContext().alertNotDeterministic("ctc_loss_backward_xpu"); return AT_DISPATCH_FLOATING_TYPES( log_probs.scalar_type(), "ctc_loss_backward_xpu", [&] { if (targets.scalar_type() == kLong) { diff --git a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp index 4b93cb3c3..8b018de6b 100644 --- a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp @@ -186,7 +186,7 @@ void nll_loss2d_forward_kernel( int64_t reduction, int64_t ignore_index) { if (reduction != 
at::Reduction::None) { - at::globalContext().alertNotDeterministic("nll_loss2d_forward_kernel"); + at::globalContext().alertNotDeterministic("nll_loss2d_forward_xpu"); } total_weight.resize_({}); diff --git a/src/ATen/native/xpu/sycl/RNNKernels.cpp b/src/ATen/native/xpu/sycl/RNNKernels.cpp new file mode 100644 index 000000000..bad6bdf69 --- /dev/null +++ b/src/ATen/native/xpu/sycl/RNNKernels.cpp @@ -0,0 +1,968 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace at::native::xpu { + +using at::native::canUse32BitIndexMath; +using at::xpu::detail::getTensorInfo; +using at::xpu::detail::IndexToOffset; +using at::xpu::detail::TensorInfo; + +std::tuple rnn_get_launch_config( + int64_t max_threads_per_group, + int64_t numel) { + int64_t num_groups = + (numel + max_threads_per_group - 1) / max_threads_per_group; + auto hw_max_groups = syclMaxWorkItemsPerTile() / max_threads_per_group; + num_groups = num_groups > hw_max_groups ? hw_max_groups : num_groups; + return std::make_tuple(num_groups, max_threads_per_group); +} + +// Factor will be 3 for GRU and 4 for LSTM +void checkSizes( + CheckedFrom c, + const TensorArg& input_gates, + const TensorArg& hidden_gates, + const TensorArg& input_bias, + const TensorArg& hidden_bias, + int64_t factor, + const TensorArg& prev_hidden) { + checkDim(c, input_gates, 2); + checkSameSize(c, input_gates, hidden_gates); + int64_t gates_size = input_gates->size(1); + + if (input_bias->defined()) { + checkDim(c, input_bias, 1); + checkNumel(c, input_bias, gates_size); + checkSameSize(c, input_bias, hidden_bias); + } + + checkDim(c, prev_hidden, 2); + checkNumel(c, prev_hidden, input_gates->size(0) * gates_size / factor); + + checkAllSameGPU( + c, {input_gates, hidden_gates, input_bias, hidden_bias, prev_hidden}); +} + +bool allContiguous(at::TensorList tensors) { + return std::all_of(tensors.begin(), tensors.end(), [](const at::Tensor& t) { + return !t.defined() || t.is_contiguous(); + }); +} + +template +TensorInfo tryGetTensorInfo(const at::Tensor& t) { + return t.defined() ? getTensorInfo(t) : TensorInfo{}; +} + +void collapseDims(){}; +template +void collapseDims(TensorInfo& info, Args&... 
infos) { + info.collapseDims(); + collapseDims(infos...); +} + +#define DEVICE_LINEAR_GET(D_TENSOR, INDEX) \ + D_TENSOR.data[IndexToOffset::get(INDEX, D_TENSOR)] + +// Biases are always 1D +#define DEVICE_BIAS_GET(D_TENSOR, INDEX) \ + D_TENSOR.data[IndexToOffset::get(INDEX, D_TENSOR)] + +#define H2F(input) static_cast(input) +#define F2H(input) static_cast(input) + +template +inline T sigmoid(T in) { + T one = static_cast(1.0); + return one / (one + std::exp(-in)); +} + +template +struct LstmCellForwardFunctor { + void operator()(sycl::nd_item<1> item) const { + bool has_bias = bias1_.data != nullptr; + + for (index_type linearIndex = item.get_global_id(0); + linearIndex < totalElements_; + linearIndex += item.get_group_range(0) * item.get_local_range(0)) { + index_type offset = (linearIndex / hsz_) * 4 * hsz_ + linearIndex % hsz_; + + scalar_t iig = DEVICE_LINEAR_GET(input_, offset + 0 * hsz_); + scalar_t ifg = DEVICE_LINEAR_GET(input_, offset + 1 * hsz_); + scalar_t icg = DEVICE_LINEAR_GET(input_, offset + 2 * hsz_); + scalar_t iog = DEVICE_LINEAR_GET(input_, offset + 3 * hsz_); + + scalar_t hig = DEVICE_LINEAR_GET(hidden_, offset + 0 * hsz_); + scalar_t hfg = DEVICE_LINEAR_GET(hidden_, offset + 1 * hsz_); + scalar_t hcg = DEVICE_LINEAR_GET(hidden_, offset + 2 * hsz_); + scalar_t hog = DEVICE_LINEAR_GET(hidden_, offset + 3 * hsz_); + + scalar_t* wig = &DEVICE_LINEAR_GET(workspace_, offset + 0 * hsz_); + scalar_t* wfg = &DEVICE_LINEAR_GET(workspace_, offset + 1 * hsz_); + scalar_t* wcg = &DEVICE_LINEAR_GET(workspace_, offset + 2 * hsz_); + scalar_t* wog = &DEVICE_LINEAR_GET(workspace_, offset + 3 * hsz_); + + scalar_t cx = DEVICE_LINEAR_GET(_cx_, linearIndex); + + scalar_t* hy = &DEVICE_LINEAR_GET(_hy_, linearIndex); + scalar_t* cy = &DEVICE_LINEAR_GET(_cy_, linearIndex); + + scalar_t b1i, b1f, b1c, b1o; + scalar_t b2i, b2f, b2c, b2o; + + if (has_bias) { + b1i = DEVICE_BIAS_GET(bias1_, linearIndex % hsz_ + 0 * hsz_); + b1f = DEVICE_BIAS_GET(bias1_, linearIndex % hsz_ + 1 * hsz_); + b1c = DEVICE_BIAS_GET(bias1_, linearIndex % hsz_ + 2 * hsz_); + b1o = DEVICE_BIAS_GET(bias1_, linearIndex % hsz_ + 3 * hsz_); + + b2i = DEVICE_BIAS_GET(bias2_, linearIndex % hsz_ + 0 * hsz_); + b2f = DEVICE_BIAS_GET(bias2_, linearIndex % hsz_ + 1 * hsz_); + b2c = DEVICE_BIAS_GET(bias2_, linearIndex % hsz_ + 2 * hsz_); + b2o = DEVICE_BIAS_GET(bias2_, linearIndex % hsz_ + 3 * hsz_); + } else { + b1i = F2H(0.0); + b1f = F2H(0.0); + b1c = F2H(0.0); + b1o = F2H(0.0); + b2i = F2H(0.0); + b2f = F2H(0.0); + b2c = F2H(0.0); + b2o = F2H(0.0); + } + + accscalar_t ig, fg, cg, og; + accscalar_t f_hy, f_cy; + + ig = sigmoid(H2F(iig) + H2F(hig) + H2F(b1i) + H2F(b2i)); + fg = sigmoid(H2F(ifg) + H2F(hfg) + H2F(b1f) + H2F(b2f)); + cg = std::tanh(H2F(icg) + H2F(hcg) + H2F(b1c) + H2F(b2c)); + og = sigmoid(H2F(iog) + H2F(hog) + H2F(b1o) + H2F(b2o)); + + f_cy = (fg * H2F(cx)) + (ig * cg); + f_hy = og * std::tanh(f_cy); + + *hy = F2H(f_hy); + *cy = F2H(f_cy); + + // SAVE FOR BACKWARDS + // Also need cy and cx but can be saved easily in python + *wig = F2H(ig); + *wfg = F2H(fg); + *wcg = F2H(cg); + *wog = F2H(og); + } + } + + LstmCellForwardFunctor( + TensorInfo input, + TensorInfo hidden, + TensorInfo bias1, + TensorInfo bias2, + TensorInfo _cx, + TensorInfo _hy, + TensorInfo _cy, + TensorInfo workspace, + index_type hsz, + index_type totalElements) + : input_(input), + hidden_(hidden), + bias1_(bias1), + bias2_(bias2), + _cx_(_cx), + _hy_(_hy), + _cy_(_cy), + workspace_(workspace), + hsz_(hsz), + totalElements_(totalElements) {} + + 
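+  // operator() above computes the standard LSTM cell update per element:
+  //   i = sigmoid(i_in + i_h + b1i + b2i)    f = sigmoid(f_in + f_h + b1f + b2f)
+  //   g = tanh(c_in + c_h + b1c + b2c)       o = sigmoid(o_in + o_h + b1o + b2o)
+  //   cy = f * cx + i * g                    hy = o * tanh(cy)
+  // The gate activations are written to workspace_ for reuse in the backward pass.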
private: + TensorInfo input_; + TensorInfo hidden_; + TensorInfo bias1_; + TensorInfo bias2_; + TensorInfo _cx_; + TensorInfo _hy_; + TensorInfo _cy_; + TensorInfo workspace_; + index_type hsz_; + index_type totalElements_; +}; + +template +struct LstmCellBackwardFunctor { + void operator()(sycl::nd_item<1> item) const { + bool has_gradoutput = gradoutput_.data != nullptr; + bool has_gradoutputcell = gradoutputcell_.data != nullptr; + + for (index_type linearIndex = item.get_global_id(0); + linearIndex < totalElements_; + linearIndex += item.get_group_range(0) * item.get_local_range(0)) { + index_type offset = (linearIndex / hsz_) * 4 * hsz_ + linearIndex % hsz_; + + scalar_t ig = DEVICE_LINEAR_GET(storage_, offset + 0 * hsz_); + scalar_t fg = DEVICE_LINEAR_GET(storage_, offset + 1 * hsz_); + scalar_t cg = DEVICE_LINEAR_GET(storage_, offset + 2 * hsz_); + scalar_t og = DEVICE_LINEAR_GET(storage_, offset + 3 * hsz_); + + scalar_t* ih = &DEVICE_LINEAR_GET(gradInGates_, offset + 0 * hsz_); + scalar_t* fh = &DEVICE_LINEAR_GET(gradInGates_, offset + 1 * hsz_); + scalar_t* ch = &DEVICE_LINEAR_GET(gradInGates_, offset + 2 * hsz_); + scalar_t* oh = &DEVICE_LINEAR_GET(gradInGates_, offset + 3 * hsz_); + + // will return hidden grads here + scalar_t cx = DEVICE_LINEAR_GET(_cx_, linearIndex); + scalar_t cy = DEVICE_LINEAR_GET(_cy_, linearIndex); + + scalar_t* gi = &DEVICE_LINEAR_GET(gradInputCx_, linearIndex); + + accscalar_t go = has_gradoutput + ? H2F(DEVICE_LINEAR_GET(gradoutput_, linearIndex)) + : 0.f; + accscalar_t goc = has_gradoutputcell + ? H2F(DEVICE_LINEAR_GET(gradoutputcell_, linearIndex)) + : 0.f; + + accscalar_t gcx = std::tanh(H2F(cy)); + + accscalar_t gog = go * gcx; + gcx = go * H2F(og) * (1 - gcx * gcx) + goc; + + accscalar_t gig = gcx * H2F(cg); + accscalar_t gfg = gcx * H2F(cx); + accscalar_t gcg = gcx * H2F(ig); + + gcx = gcx * H2F(fg); + + gig = gig * (1 - H2F(ig)) * H2F(ig); + gfg = gfg * (1 - H2F(fg)) * H2F(fg); + gcg = gcg * (1 - H2F(cg) * H2F(cg)); + gog = gog * (1 - H2F(og)) * H2F(og); + + *ih = F2H(gig); + *fh = F2H(gfg); + *ch = F2H(gcg); + *oh = F2H(gog); + + *gi = F2H(gcx); + } + } + + LstmCellBackwardFunctor( + TensorInfo storage, + TensorInfo gradInGates, + TensorInfo _cx, + TensorInfo _cy, + TensorInfo gradoutput, + TensorInfo gradoutputcell, + TensorInfo gradInputCx, + index_type hsz, + index_type totalElements) + : storage_(storage), + gradInGates_(gradInGates), + _cx_(_cx), + _cy_(_cy), + gradoutput_(gradoutput), + gradoutputcell_(gradoutputcell), + gradInputCx_(gradInputCx), + hsz_(hsz), + totalElements_(totalElements) {} + + private: + TensorInfo storage_; + TensorInfo gradInGates_; + TensorInfo _cx_; + TensorInfo _cy_; + TensorInfo gradoutput_; + TensorInfo gradoutputcell_; + TensorInfo gradInputCx_; + index_type hsz_; + index_type totalElements_; +}; + +template +struct GruCellForwardFunctor { + void operator()(sycl::nd_item<1> item) const { + bool has_bias = Bias1_.data != nullptr; + + for (index_type linearIndex = item.get_global_id(0); + linearIndex < totalElements_; + linearIndex += item.get_group_range(0) * item.get_local_range(0)) { + index_type offset = (linearIndex / hsz_) * 3 * hsz_ + linearIndex % hsz_; + + scalar_t ir = DEVICE_LINEAR_GET(Input_, offset + 0 * hsz_); + scalar_t ii = DEVICE_LINEAR_GET(Input_, offset + 1 * hsz_); + scalar_t in = DEVICE_LINEAR_GET(Input_, offset + 2 * hsz_); + scalar_t hr = DEVICE_LINEAR_GET(Hidden_, offset + 0 * hsz_); + scalar_t hi = DEVICE_LINEAR_GET(Hidden_, offset + 1 * hsz_); + scalar_t hn = 
DEVICE_LINEAR_GET(Hidden_, offset + 2 * hsz_); + + scalar_t hx = DEVICE_LINEAR_GET(_hx_, linearIndex); + scalar_t* hy = &DEVICE_LINEAR_GET(_hy_, linearIndex); + + scalar_t b1r, b1i, b1n, b2r, b2i, b2n; + + if (has_bias) { + b1r = DEVICE_BIAS_GET(Bias1_, linearIndex % hsz_ + 0 * hsz_); + b1i = DEVICE_BIAS_GET(Bias1_, linearIndex % hsz_ + 1 * hsz_); + b1n = DEVICE_BIAS_GET(Bias1_, linearIndex % hsz_ + 2 * hsz_); + + b2r = DEVICE_BIAS_GET(Bias2_, linearIndex % hsz_ + 0 * hsz_); + b2i = DEVICE_BIAS_GET(Bias2_, linearIndex % hsz_ + 1 * hsz_); + b2n = DEVICE_BIAS_GET(Bias2_, linearIndex % hsz_ + 2 * hsz_); + } else { + b1r = F2H(0.0); + b1i = F2H(0.0); + b1n = F2H(0.0); + b2r = F2H(0.0); + b2i = F2H(0.0); + b2n = F2H(0.0); + } + + offset = (linearIndex / hsz_) * 5 * hsz_ + linearIndex % hsz_; + + accscalar_t rg, ig, ng; + + rg = sigmoid(H2F(ir) + H2F(hr) + H2F(b1r) + H2F(b2r)); + ig = sigmoid(H2F(ii) + H2F(hi) + H2F(b1i) + H2F(b2i)); + + ng = H2F(in) + H2F(b1n) + rg * (H2F(hn) + H2F(b2n)); + ng = std::tanh(ng); + *hy = F2H(ng + ig * (H2F(hx) - ng)); + + // SAVE FOR BACKWARDS + DEVICE_LINEAR_GET(storage_, offset + 0 * hsz_) = F2H(rg); + DEVICE_LINEAR_GET(storage_, offset + 1 * hsz_) = F2H(ig); + DEVICE_LINEAR_GET(storage_, offset + 2 * hsz_) = F2H(ng); + DEVICE_LINEAR_GET(storage_, offset + 3 * hsz_) = hx; + DEVICE_LINEAR_GET(storage_, offset + 4 * hsz_) = F2H(H2F(hn) + H2F(b2n)); + } + } + + GruCellForwardFunctor( + TensorInfo Input, + const TensorInfo Hidden, + const TensorInfo Bias1, + const TensorInfo Bias2, + const TensorInfo _hx, + const TensorInfo _hy, + const TensorInfo storage, + const index_type hsz, + const index_type totalElements) + : Input_(Input), + Hidden_(Hidden), + Bias1_(Bias1), + Bias2_(Bias2), + _hx_(_hx), + _hy_(_hy), + storage_(storage), + hsz_(hsz), + totalElements_(totalElements) {} + + private: + TensorInfo Input_; + const TensorInfo Hidden_; + const TensorInfo Bias1_; + const TensorInfo Bias2_; + const TensorInfo _hx_; + const TensorInfo _hy_; + const TensorInfo storage_; + const index_type hsz_; + const index_type totalElements_; +}; + +template +struct GruCellBackwardFunctor { + void operator()(sycl::nd_item<1> item) const { + for (index_type linearIndex = item.get_global_id(0); + linearIndex < totalElements_; + linearIndex += item.get_group_range(0) * item.get_local_range(0)) { + index_type offset = (linearIndex / hsz_) * 5 * hsz_ + linearIndex % hsz_; + + scalar_t rg = DEVICE_LINEAR_GET(storage_, offset + 0 * hsz_); + scalar_t ig = DEVICE_LINEAR_GET(storage_, offset + 1 * hsz_); + scalar_t ng = DEVICE_LINEAR_GET(storage_, offset + 2 * hsz_); + scalar_t hx = DEVICE_LINEAR_GET(storage_, offset + 3 * hsz_); + scalar_t hn = DEVICE_LINEAR_GET(storage_, offset + 4 * hsz_); + + scalar_t go = DEVICE_LINEAR_GET(gradOutput_, linearIndex); + + offset = (linearIndex / hsz_) * 3 * hsz_ + linearIndex % hsz_; + + accscalar_t gig = H2F(go) * (H2F(hx) - H2F(ng)) * (1 - H2F(ig)) * H2F(ig); + accscalar_t ghx = H2F(go) * H2F(ig); + accscalar_t gin = H2F(go) * (1 - H2F(ig)) * (1 - H2F(ng) * H2F(ng)); + accscalar_t ghn = gin * H2F(rg); + accscalar_t grg = gin * H2F(hn) * (1 - H2F(rg)) * H2F(rg); + + DEVICE_LINEAR_GET(gradInInput_, offset + 0 * hsz_) = F2H(grg); + DEVICE_LINEAR_GET(gradInInput_, offset + 1 * hsz_) = F2H(gig); + DEVICE_LINEAR_GET(gradInInput_, offset + 2 * hsz_) = F2H(gin); + + DEVICE_LINEAR_GET(gradInHidden_, offset + 0 * hsz_) = F2H(grg); + DEVICE_LINEAR_GET(gradInHidden_, offset + 1 * hsz_) = F2H(gig); + DEVICE_LINEAR_GET(gradInHidden_, offset + 2 * hsz_) = F2H(ghn); + 
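The GRU functors follow the same structure with the textbook GRU update: rg and ig are the sigmoid reset and update gates, ng = tanh(in + b1n + rg * (hn + b2n)) is the candidate state, and hy = ng + ig * (hx - ng). Five per-element values (rg, ig, ng, hx, and hn + b2n) are stashed per output element, which is what the 5 * hsz workspace stride above corresponds to and what the backward functor reads back. A rough NumPy restatement of the forward step, as a reference sketch only with illustrative names:

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def gru_cell_forward_ref(input_gates, hidden_gates, hx, b_ih=None, b_hh=None):
        """input_gates/hidden_gates: (batch, 3*hidden); hx: (batch, hidden); biases: (3*hidden,)."""
        hsz = hx.shape[-1]
        b_ih = np.zeros(3 * hsz) if b_ih is None else b_ih
        b_hh = np.zeros(3 * hsz) if b_hh is None else b_hh
        ir, ii, i_n = (input_gates[:, k * hsz:(k + 1) * hsz] + b_ih[k * hsz:(k + 1) * hsz] for k in range(3))
        hr, hi, h_n = (hidden_gates[:, k * hsz:(k + 1) * hsz] + b_hh[k * hsz:(k + 1) * hsz] for k in range(3))
        rg = sigmoid(ir + hr)         # reset gate
        ig = sigmoid(ii + hi)         # update gate
        ng = np.tanh(i_n + rg * h_n)  # candidate hidden state
        hy = ng + ig * (hx - ng)
        # workspace per row: [rg, ig, ng, hx, hn + b_hh_n], i.e. 5 * hidden values
        workspace = np.concatenate([rg, ig, ng, hx, h_n], axis=1)
        return hy, workspace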
DEVICE_LINEAR_GET(gradInputHx_, linearIndex) = F2H(ghx); + } + } + + GruCellBackwardFunctor( + TensorInfo gradInInput, + TensorInfo gradInHidden, + TensorInfo gradOutput, + TensorInfo gradInputHx, + TensorInfo storage, + index_type hsz, + index_type totalElements) + : gradInInput_(gradInInput), + gradInHidden_(gradInHidden), + gradOutput_(gradOutput), + gradInputHx_(gradInputHx), + storage_(storage), + hsz_(hsz), + totalElements_(totalElements) {} + + private: + TensorInfo gradInInput_; + TensorInfo gradInHidden_; + TensorInfo gradOutput_; + TensorInfo gradInputHx_; + TensorInfo storage_; + index_type hsz_; + index_type totalElements_; +}; + +#undef DEVICE_LINEAR_GET +#undef DEVICE_BIAS_GET +#undef H2F +#undef F2H + +template +void lstm_forward_impl( + const Tensor& input_gates, + const Tensor& hidden_gates, + const Tensor& input_bias, + const Tensor& hidden_bias, + const Tensor& cx, + const Tensor& hy, + const Tensor& cy, + const Tensor& workspace) { + using accscalar_t = at::acc_type_device; + + int64_t numel = cx.numel(); + if (numel == 0) + return; + + using KernelT = LstmCellForwardFunctor; + auto max_wg_size = syclMaxWorkGroupSize(); + auto config = rnn_get_launch_config(max_wg_size, numel); + auto nwg = std::get<0>(config); + auto local_range = std::get<1>(config); + + auto input_gatesI = getTensorInfo(input_gates); + auto hidden_gatesI = getTensorInfo(hidden_gates); + auto input_biasI = tryGetTensorInfo(input_bias); + auto hidden_biasI = tryGetTensorInfo(hidden_bias); + auto cxI = getTensorInfo(cx); + auto hyI = getTensorInfo(hy); + auto cyI = getTensorInfo(cy); + auto workspaceI = getTensorInfo(workspace); + index_type hidden_size = cxI.sizes[cxI.dims - 1]; + + if (allContiguous( + {input_gates, + hidden_gates, + input_bias, + hidden_bias, + cx, + hy, + cy, + workspace})) { + collapseDims( + input_gatesI, + hidden_gatesI, + input_biasI, + hidden_biasI, + cxI, + hyI, + cyI, + workspaceI); + KernelT kfn( + input_gatesI, + hidden_gatesI, + input_biasI, + hidden_biasI, + cxI, + hyI, + cyI, + workspaceI, + hidden_size, + numel); + sycl_kernel_submit( + nwg * local_range, local_range, getCurrentSYCLQueue(), kfn); + } else { + KernelT kfn( + input_gatesI, + hidden_gatesI, + input_biasI, + hidden_biasI, + cxI, + hyI, + cyI, + workspaceI, + hidden_size, + numel); + sycl_kernel_submit( + nwg * local_range, local_range, getCurrentSYCLQueue(), kfn); + } +} + +template +void lstm_backward_impl( + const Tensor& grad_hy, + const Tensor& grad_cy, + const Tensor& cx, + const Tensor& cy, + const Tensor& workspace, + const Tensor& grad_gates, + const Tensor& grad_cx) { + using accscalar_t = at::acc_type_device; + + int64_t numel = cx.numel(); + if (numel == 0) + return; + + using KernelT = LstmCellBackwardFunctor; + auto max_wg_size = syclMaxWorkGroupSize(); + auto config = rnn_get_launch_config(max_wg_size, numel); + auto nwg = std::get<0>(config); + auto local_range = std::get<1>(config); + + auto grad_hyI = tryGetTensorInfo(grad_hy); + auto grad_cyI = tryGetTensorInfo(grad_cy); + auto cxI = getTensorInfo(cx); + auto cyI = getTensorInfo(cy); + auto workspaceI = getTensorInfo(workspace); + auto grad_gatesI = getTensorInfo(grad_gates); + auto grad_cxI = getTensorInfo(grad_cx); + index_type hidden_size = cxI.sizes[cxI.dims - 1]; + + if (allContiguous( + {grad_hy, grad_cy, cx, cy, workspace, grad_gates, grad_cx})) { + collapseDims( + grad_hyI, grad_cyI, cxI, cyI, workspaceI, grad_gatesI, grad_cxI); + KernelT kfn( + workspaceI, + grad_gatesI, + cxI, + cyI, + grad_hyI, + grad_cyI, + grad_cxI, + 
hidden_size, + numel); + sycl_kernel_submit( + nwg * local_range, local_range, getCurrentSYCLQueue(), kfn); + } else { + KernelT kfn( + workspaceI, + grad_gatesI, + cxI, + cyI, + grad_hyI, + grad_cyI, + grad_cxI, + hidden_size, + numel); + sycl_kernel_submit( + nwg * local_range, local_range, getCurrentSYCLQueue(), kfn); + } +} + +template +void gru_forward_impl( + const Tensor& input_gates, + const Tensor& hidden_gates, + const Tensor& input_bias, + const Tensor& hidden_bias, + const Tensor& hx, + const Tensor& hy, + const Tensor& workspace) { + using accscalar_t = at::acc_type_device; + + int64_t numel = hx.numel(); + if (numel == 0) + return; + + using KernelT = GruCellForwardFunctor; + auto max_wg_size = syclMaxWorkGroupSize(); + auto config = rnn_get_launch_config(max_wg_size, numel); + auto nwg = std::get<0>(config); + auto local_range = std::get<1>(config); + + auto input_gatesI = getTensorInfo(input_gates); + auto hidden_gatesI = getTensorInfo(hidden_gates); + auto input_biasI = tryGetTensorInfo(input_bias); + auto hidden_biasI = tryGetTensorInfo(hidden_bias); + auto hxI = getTensorInfo(hx); + auto hyI = getTensorInfo(hy); + auto workspaceI = getTensorInfo(workspace); + index_type hidden_size = hxI.sizes[hxI.dims - 1]; + + if (allContiguous( + {input_gates, + hidden_gates, + input_bias, + hidden_bias, + hx, + hy, + workspace})) { + collapseDims( + input_gatesI, + hidden_gatesI, + input_biasI, + hidden_biasI, + hxI, + hyI, + workspaceI); + KernelT kfn( + input_gatesI, + hidden_gatesI, + input_biasI, + hidden_biasI, + hxI, + hyI, + workspaceI, + hidden_size, + numel); + sycl_kernel_submit( + nwg * local_range, local_range, getCurrentSYCLQueue(), kfn); + } else { + KernelT kfn( + input_gatesI, + hidden_gatesI, + input_biasI, + hidden_biasI, + hxI, + hyI, + workspaceI, + hidden_size, + numel); + sycl_kernel_submit( + nwg * local_range, local_range, getCurrentSYCLQueue(), kfn); + } +} + +template +void gru_backward_impl( + const Tensor& grad_hy, + const Tensor& workspace, + const Tensor& grad_input_gates, + const Tensor& grad_hidden_gates, + const Tensor& grad_hx) { + using accscalar_t = at::acc_type_device; + + int64_t numel = grad_hy.numel(); + if (numel == 0) + return; + + using KernelT = GruCellBackwardFunctor; + auto max_wg_size = syclMaxWorkGroupSize(); + auto config = rnn_get_launch_config(max_wg_size, numel); + auto nwg = std::get<0>(config); + auto local_range = std::get<1>(config); + + auto grad_hyI = getTensorInfo(grad_hy); + auto workspaceI = getTensorInfo(workspace); + auto grad_input_gatesI = + getTensorInfo(grad_input_gates); + auto grad_hidden_gatesI = + getTensorInfo(grad_hidden_gates); + auto grad_hxI = getTensorInfo(grad_hx); + index_type hidden_size = grad_hyI.sizes[grad_hyI.dims - 1]; + + if (allContiguous( + {grad_hy, workspace, grad_input_gates, grad_hidden_gates, grad_hx})) { + collapseDims( + grad_hyI, workspaceI, grad_input_gatesI, grad_hidden_gatesI, grad_hxI); + KernelT kfn( + grad_input_gatesI, + grad_hidden_gatesI, + grad_hyI, + grad_hxI, + workspaceI, + hidden_size, + numel); + sycl_kernel_submit( + nwg * local_range, local_range, getCurrentSYCLQueue(), kfn); + } else { + KernelT kfn( + grad_input_gatesI, + grad_hidden_gatesI, + grad_hyI, + grad_hxI, + workspaceI, + hidden_size, + numel); + sycl_kernel_submit( + nwg * local_range, local_range, getCurrentSYCLQueue(), kfn); + } +} + +// Note [64-bit index math check elision] +// It's enough to perform the check for 64-bit math on the largest tensor only. 
+// If 32-bit is enough for it, it will suffice for all other tensors too, and we +// can save some work using this trick. + +std::tuple _thnn_fused_lstm_cell_kernel( + const Tensor& input_gates, + const Tensor& hidden_gates, + const Tensor& cx, + const std::optional& input_bias_opt, + const std::optional& hidden_bias_opt) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned input_bias_maybe_owned = + at::borrow_from_optional_tensor(input_bias_opt); + const Tensor& input_bias = *input_bias_maybe_owned; + const Tensor& hidden_bias = hidden_bias_opt.value_or(Tensor()); + + checkSizes( + "_thnn_fused_lstm_cell_xpu", + {input_gates, "input_gates", 1}, + {hidden_gates, "hidden_gates", 2}, + {input_bias, "input_bias", 3}, + {hidden_bias, "hidden_bias", 4}, + /*factor=*/4, + {cx, "prev_hidden", 5}); + + auto workspace = at::empty_like(input_gates, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto hy = at::empty_like(cx, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto cy = at::empty_like(cx, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + input_gates.scalar_type(), + "_thnn_fused_lstm_cell_xpu", + [&] { + if (canUse32BitIndexMath( + workspace)) { // See Note [64-bit index math check elision] + lstm_forward_impl( + input_gates, + hidden_gates, + input_bias, + hidden_bias, + cx, + hy, + cy, + workspace); + } else { + lstm_forward_impl( + input_gates, + hidden_gates, + input_bias, + hidden_bias, + cx, + hy, + cy, + workspace); + } + }); + return std::make_tuple(std::move(hy), std::move(cy), std::move(workspace)); +} + +void checkLSTMBackwardSizes( + const TensorArg& grad_hy, + const TensorArg& grad_cy, + const TensorArg& cx, + const TensorArg& cy, + const TensorArg& workspace) { + CheckedFrom c = "fused_lstm_cell_backward"; + const TensorArg& defined_grad = grad_hy->defined() ? 
grad_hy : grad_cy; + checkDim(c, defined_grad, 2); + auto exp_size = defined_grad->sizes(); + if (grad_hy->defined()) { + checkSize(c, grad_hy, exp_size); + } + if (grad_cy->defined()) { + checkSize(c, grad_cy, exp_size); + } + checkSize(c, cx, exp_size); + checkSize(c, cy, exp_size); + checkDim(c, workspace, 2); + checkNumel(c, workspace, exp_size[0] * exp_size[1] * 4); +} + +std::tuple _thnn_fused_lstm_cell_backward_kernel( + const std::optional& grad_hy_opt, + const std::optional& grad_cy_opt, + const Tensor& cx, + const Tensor& cy, + const Tensor& workspace, + bool has_bias) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned grad_hy_maybe_owned = + at::borrow_from_optional_tensor(grad_hy_opt); + const Tensor& grad_hy = *grad_hy_maybe_owned; + const Tensor& grad_cy = grad_cy_opt.value_or(Tensor()); + + if (!grad_hy.defined() && !grad_cy.defined()) { + return std::tuple(); + } + checkLSTMBackwardSizes( + {grad_hy, "grad_hy", 1}, + {grad_cy, "grad_cy", 2}, + {cx, "cx", 3}, + {cy, "cy", 4}, + {workspace, "workspace", 5}); + + auto grad_gates = at::empty_like(workspace, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto grad_cx = at::empty_like(cx, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + workspace.scalar_type(), + "_thnn_fused_lstm_cell_backward_xpu", + [&] { + if (canUse32BitIndexMath( + workspace)) { // See Note [64-bit index math check elision] + lstm_backward_impl( + grad_hy, grad_cy, cx, cy, workspace, grad_gates, grad_cx); + } else { + lstm_backward_impl( + grad_hy, grad_cy, cx, cy, workspace, grad_gates, grad_cx); + } + }); + + auto grad_bias = + has_bias ? grad_gates.sum(0, /*keepdim=*/false) : at::Tensor{}; + return std::make_tuple( + std::move(grad_gates), std::move(grad_cx), std::move(grad_bias)); +} + +static constexpr int64_t GRU_WORKSPACE_MULTIPLIER = 5; + +std::tuple _thnn_fused_gru_cell_kernel( + const Tensor& input_gates, + const Tensor& hidden_gates, + const Tensor& hx, + const std::optional& input_bias_opt, + const std::optional& hidden_bias_opt) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned input_bias_maybe_owned = + at::borrow_from_optional_tensor(input_bias_opt); + const Tensor& input_bias = *input_bias_maybe_owned; + const Tensor& hidden_bias = hidden_bias_opt.value_or(Tensor()); + + checkSizes( + "_thnn_fused_gru_cell_xpu", + {input_gates, "input_gates", 1}, + {hidden_gates, "hidden_gates", 2}, + {input_bias, "input_bias", 3}, + {hidden_bias, "hidden_bias", 4}, + /*factor=*/3, + {hx, "prev_hidden", 5}); + + auto workspace = at::empty( + {hx.size(0), hx.size(1) * GRU_WORKSPACE_MULTIPLIER}, hx.options()); + auto hy = at::empty_like(hx, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + input_gates.scalar_type(), + "_thnn_fused_gru_cell_xpu", + [&] { + if (canUse32BitIndexMath( + workspace)) { // See Note [64-bit index math check elision] + gru_forward_impl( + input_gates, + hidden_gates, + input_bias, + hidden_bias, + hx, + hy, + workspace); + } else { + gru_forward_impl( + input_gates, + hidden_gates, + input_bias, + hidden_bias, + hx, + hy, + workspace); + } + }); + return std::make_tuple(std::move(hy), std::move(workspace)); +} + +void checkGRUBackwardSizes( + const TensorArg& grad_hy, + const TensorArg& workspace) { + CheckedFrom c = "fused_gru_cell_backward"; + checkDim(c, grad_hy, 2); + checkSize( + c, + workspace, + {grad_hy->size(0), grad_hy->size(1) * 
GRU_WORKSPACE_MULTIPLIER}); +} + +std::tuple +_thnn_fused_gru_cell_backward_kernel( + const Tensor& grad_hy, + const Tensor& workspace, + bool has_bias) { + checkGRUBackwardSizes({grad_hy, "grad_hy", 1}, {workspace, "workspace", 2}); + + int64_t hidden_size = workspace.size(1) / GRU_WORKSPACE_MULTIPLIER; + auto grad_input_gates = + at::empty({workspace.size(0), hidden_size * 3}, workspace.options()); + auto grad_hidden_gates = + at::empty({workspace.size(0), hidden_size * 3}, workspace.options()); + auto grad_hx = at::empty_like(grad_hy, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + grad_hy.scalar_type(), + "_thnn_fused_gru_cell_backward_xpu", + [&] { + if (canUse32BitIndexMath( + workspace)) { // See Note [64-bit index math check elision] + gru_backward_impl( + grad_hy, workspace, grad_input_gates, grad_hidden_gates, grad_hx); + } else { + gru_backward_impl( + grad_hy, workspace, grad_input_gates, grad_hidden_gates, grad_hx); + } + }); + + at::Tensor grad_input_bias, grad_hidden_bias; + if (has_bias) { + grad_input_bias = grad_input_gates.sum(0, /*keepdim=*/false); + grad_hidden_bias = grad_hidden_gates.sum(0, /*keepdim=*/false); + } + + return std::make_tuple( + std::move(grad_input_gates), + std::move(grad_hidden_gates), + std::move(grad_hx), + std::move(grad_input_bias), + std::move(grad_hidden_bias)); +} + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/RNNKernels.h b/src/ATen/native/xpu/sycl/RNNKernels.h new file mode 100644 index 000000000..07f0e3f78 --- /dev/null +++ b/src/ATen/native/xpu/sycl/RNNKernels.h @@ -0,0 +1,36 @@ +#pragma once + +#include + +namespace at::native::xpu { + +TORCH_XPU_API std::tuple _thnn_fused_lstm_cell_kernel( + const Tensor& input_gates, + const Tensor& hidden_gates, + const Tensor& cx, + const std::optional& input_bias_opt, + const std::optional& hidden_bias_opt); + +TORCH_XPU_API std::tuple +_thnn_fused_lstm_cell_backward_kernel( + const std::optional& grad_hy_opt, + const std::optional& grad_cy_opt, + const Tensor& cx, + const Tensor& cy, + const Tensor& workspace, + bool has_bias); + +TORCH_XPU_API std::tuple _thnn_fused_gru_cell_kernel( + const Tensor& input_gates, + const Tensor& hidden_gates, + const Tensor& hx, + const std::optional& input_bias_opt, + const std::optional& hidden_bias_opt); + +TORCH_XPU_API std::tuple +_thnn_fused_gru_cell_backward_kernel( + const Tensor& grad_hy, + const Tensor& workspace, + bool has_bias); + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ResizeKernel.cpp b/src/ATen/native/xpu/sycl/ResizeKernel.cpp index 237a1c213..f1ee7f944 100644 --- a/src/ATen/native/xpu/sycl/ResizeKernel.cpp +++ b/src/ATen/native/xpu/sycl/ResizeKernel.cpp @@ -25,8 +25,9 @@ void resize_bytes_xpu(StorageImpl* storage, size_t size_bytes) { c10::xpu::XPUGuard guard(device.index()); at::DataPtr data = allocator->allocate(size_bytes); if (storage->data_ptr()) { - auto q = at::xpu::getCurrentSYCLQueue(); + at::globalContext().lazyInitDevice(c10::DeviceType::XPU); + auto q = at::xpu::getCurrentSYCLQueue(); q.memcpy( data.get(), storage->data(), std::min(storage->nbytes(), size_bytes)); } diff --git a/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.cpp b/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.cpp index 533630175..7f6f33805 100644 --- a/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.cpp +++ b/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.cpp @@ -86,7 +86,7 @@ template inline void _rrelu_with_noise_xpu_train( Tensor& output, 
const Tensor& input_, - const Tensor& noise_, + Tensor& noise_, const Scalar& lower_, const Scalar& upper_, std::optional generator) { @@ -153,7 +153,7 @@ inline void _rrelu_with_noise_xpu_train( Tensor& rrelu_with_noise_kernel( const Tensor& self, - const Tensor& noise, + Tensor& noise, const Scalar& lower, const Scalar& upper, bool training, diff --git a/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.h b/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.h index 8371c38ab..fa7e568ea 100644 --- a/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.h +++ b/src/ATen/native/xpu/sycl/RreluWithNoiseKernels.h @@ -7,7 +7,7 @@ namespace at::native::xpu { TORCH_XPU_API Tensor& rrelu_with_noise_kernel( const Tensor& self, - const Tensor& noise, + Tensor& noise, const Scalar& lower, const Scalar& upper, bool training, diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index 28d812f2c..0a0c7e718 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -210,7 +210,8 @@ template < int outer_loop, bool is_masked, typename calc_t, - typename vec_t> + typename vec_t, + bool is_safe_softmax> struct DispatchSoftmaxForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { [[intel::reqd_sub_group_size(SIMD)]] void operator()( @@ -240,7 +241,8 @@ struct DispatchSoftmaxForwardKernelFunctor if (index >= dim_size_) break; - reg_in[i] = *(reinterpret_cast(in_data_ + group_offset + index)); + reg_in[i] = + *(reinterpret_cast(in_data_ + group_offset + index)); if constexpr (is_masked) { auto vec_offset = group_offset + index; #pragma unroll(vec_size) @@ -309,6 +311,10 @@ struct DispatchSoftmaxForwardKernelFunctor if constexpr (LogSoftMax) { reg_in[i][j] = static_cast(reg_in[i][j] - max_value - sum_value); + } else if ( + is_safe_softmax && + max_value == std::numeric_limits::lowest()) { + reg_in[i][j] = static_cast(0); } else if (sum_value == 0) { reg_in[i][j] = nan_; } else { @@ -386,7 +392,8 @@ template < bool LogSoftMax, int outer_loop, bool is_masked = false, - typename calc_t = decltype(nullptr)> + typename calc_t = decltype(nullptr), + bool is_safe_softmax = false> bool dispatch_softmax_forward_kernel( const scalar_t* in_data, scalar_t* out_data, @@ -412,7 +419,8 @@ bool dispatch_softmax_forward_kernel( outer_loop, is_masked, calc_t, - vec_t>; + vec_t, + /*is_safe_softmax = */ false>; int sub_group_num, global_size_row, local_size_row, range, local_size; int max_group_size = @@ -460,8 +468,8 @@ bool dispatch_softmax_forward_kernel( outer_loop, is_masked, DummyFunctor, - vec_t>; - + vec_t, + is_safe_softmax>; int sub_group_num, global_size_row, local_size_row, range, local_size; int max_group_size = get_wgroup_size( @@ -506,7 +514,8 @@ template < typename IndexType, bool LogSoftMax, typename vec_t, - int align_bytes> + int align_bytes, + bool is_safe_softmax> struct SoftmaxForwardKernelFunctor { void operator()(sycl::nd_item<1> item) const { IndexType local_id = item.get_local_id(0); @@ -562,6 +571,10 @@ struct SoftmaxForwardKernelFunctor { if (LogSoftMax) out_data_[group_offset + linear_idx] = static_cast( in_data_[group_offset + linear_idx] - max_value - sum_value); + else if ( + is_safe_softmax && + max_value == std::numeric_limits::lowest()) + out_data_[group_offset + linear_idx] = static_cast(0); else out_data_[group_offset + linear_idx] = static_cast( std::exp(in_data_[group_offset + linear_idx] - max_value) * @@ -576,6 +589,10 @@ struct SoftmaxForwardKernelFunctor { if (LogSoftMax) in_val[j] = 
static_cast(in_val[j] - max_value - sum_value); + else if ( + is_safe_softmax && + max_value == std::numeric_limits::lowest()) + in_val[j] = static_cast(0); else in_val[j] = static_cast( std::exp(in_val[j] - max_value) * sum_value); @@ -610,7 +627,8 @@ template < typename scalar_t, typename accscalar_t, typename IndexType, - bool LogSoftMax> + bool LogSoftMax, + bool is_safe_softmax> void softmax_forward_kernel( const scalar_t* in_data, scalar_t* out_data, @@ -625,7 +643,8 @@ void softmax_forward_kernel( IndexType, LogSoftMax, vec_t, - align_bytes>; + align_bytes, + is_safe_softmax>; int local_size = std::min( (dim_size + vec_size - 1) / vec_size, @@ -645,7 +664,8 @@ template < typename accscalar_t, typename IndexType, bool LogSoftMax, - typename vec_t> + typename vec_t, + bool is_safe_softmax> struct SpatialSoftmaxForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { void operator()(sycl::nd_item<3> item) const { @@ -658,14 +678,16 @@ struct SpatialSoftmaxForwardKernelFunctor // get max value accscalar_t max_value[vec_size]; auto offset = local_row_id * inner_size_ + global_col * vec_size; - vec_t value = *(reinterpret_cast(in_data_ + group_offset + offset)); + vec_t value = + *(reinterpret_cast(in_data_ + group_offset + offset)); #pragma unroll(vec_size) for (int j = 0; j < vec_size; ++j) { max_value[j] = accscalar_t(value[j]); } for (int i = local_row_id + block_row_; i < dim_size_; i += block_row_) { offset = i * inner_size_ + global_col * vec_size; - value = *(reinterpret_cast(in_data_ + group_offset + offset)); + value = + *(reinterpret_cast(in_data_ + group_offset + offset)); #pragma unroll(vec_size) for (int j = 0; j < vec_size; ++j) { max_value[j] = std::max(max_value[j], accscalar_t(value[j])); @@ -695,7 +717,8 @@ struct SpatialSoftmaxForwardKernelFunctor } for (int i = local_row_id + block_row_; i < dim_size_; i += block_row_) { offset = i * inner_size_ + global_col * vec_size; - value = *(reinterpret_cast(in_data_ + group_offset + offset)); + value = + *(reinterpret_cast(in_data_ + group_offset + offset)); #pragma unroll(vec_size) for (int j = 0; j < vec_size; ++j) { sum_value[j] += std::exp(value[j] - max_value[j]); @@ -736,6 +759,10 @@ struct SpatialSoftmaxForwardKernelFunctor if (LogSoftMax) in_val[j] = static_cast(in_val[j] - max_value[j] - sum_value[j]); + else if ( + is_safe_softmax && + max_value[j] == -std::numeric_limits::infinity()) + in_val[j] = static_cast(0); else in_val[j] = static_cast( std::exp(in_val[j] - max_value[j]) * sum_value[j]); @@ -787,7 +814,8 @@ template < typename scalar_t, typename accscalar_t, typename IndexType, - bool LogSoftMax> + bool LogSoftMax, + bool is_safe_softmax> void spatial_softmax_forward( const scalar_t* in_data, scalar_t* out_data, @@ -801,7 +829,8 @@ void spatial_softmax_forward( accscalar_t, IndexType, LogSoftMax, - vec_t>; + vec_t, + is_safe_softmax>; int local_size, block_row; get_wgroup_size_spatial( @@ -818,7 +847,8 @@ void spatial_softmax_forward( accscalar_t, IndexType, LogSoftMax, - vec_t>( + vec_t, + is_safe_softmax>( in_data, out_data, dim_size, @@ -827,7 +857,6 @@ void spatial_softmax_forward( local_size, block_row, group_num); - auto& queue = getCurrentSYCLQueue(); sycl_kernel_submit(global_range, local_range, queue, kfn); } @@ -1387,7 +1416,11 @@ void spatial_softmax_backward_kernel( sycl_kernel_submit(global_range, local_range, queue, kfn); } -template +template < + typename scalar_t, + typename accscalar_t, + bool LogSoftMax, + bool is_safe_softmax> void spatial_softmax_forward( const Tensor& output, const 
Tensor& input, @@ -1432,7 +1465,10 @@ void spatial_softmax_forward( accscalar_t, \ uint32_t, \ LogSoftMax, \ - outer_loop>( \ + outer_loop, \ + /*is_masked = */ false, \ + /*calc_t = */ decltype(nullptr), \ + /*is_safe_softmax = */ is_safe_softmax>( \ input.const_data_ptr(), \ output.mutable_data_ptr(), \ dim_size, \ @@ -1446,7 +1482,8 @@ void spatial_softmax_forward( scalar_t, \ accscalar_t, \ IndexType, \ - LogSoftMax>( \ + LogSoftMax, \ + is_safe_softmax>( \ input.const_data_ptr(), \ output.mutable_data_ptr(), \ dim_size, \ @@ -1460,7 +1497,8 @@ void spatial_softmax_forward( scalar_t, \ accscalar_t, \ IndexType, \ - LogSoftMax>( \ + LogSoftMax, \ + is_safe_softmax>( \ input.const_data_ptr(), \ output.mutable_data_ptr(), \ dim_size, \ @@ -1749,7 +1787,8 @@ Tensor& masked_softmax_forward( LogSoftMax, \ outer_loop, \ true, \ - decltype(input_calc)>( \ + decltype(input_calc), \ + /*is_safe_softmax = */ false>( \ input.const_data_ptr(), \ output.mutable_data_ptr(), \ dim_size, \ @@ -1922,7 +1961,7 @@ void masked_softmax_backward( #undef SIMD32 } // namespace impl -template +template void host_softmax( const Tensor& input_, const int64_t dim_, @@ -1953,8 +1992,11 @@ void host_softmax( "host_softmax", [&] { using accscalar_t = acc_type_device; - impl::spatial_softmax_forward( - output, input, dim); + impl::spatial_softmax_forward< + scalar_t, + accscalar_t, + LogSoftMax, + is_safe_softmax>(output, input, dim); }); } // return output; @@ -2045,6 +2087,29 @@ void _log_softmax_backward_kernel( grad.contiguous(), output.contiguous(), dim, half_to_float, grad_input); } +Tensor _safe_softmax_kernel( + const Tensor& self, + int64_t dim, + const bool half_to_float) { + auto output_options = + self.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT); + if (half_to_float) { + output_options = output_options.dtype(ScalarType::Float); + } + Tensor output = at::empty_like(self, output_options); + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + self.scalar_type(), + "_safe_softmax", + [&] { + host_softmax( + self.contiguous(), dim, half_to_float, output); + }); + + return output; +} + Tensor masked_softmax_kernel( const Tensor& input_, const Tensor& mask_, diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.h b/src/ATen/native/xpu/sycl/SoftMaxKernels.h index 0fc08496b..fc26fec3e 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.h +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.h @@ -32,6 +32,9 @@ TORCH_XPU_API void _log_softmax_backward_kernel( bool half_to_float, const Tensor& grad_input); +TORCH_XPU_API Tensor +_safe_softmax_kernel(const Tensor& self, int64_t dim, const bool half_to_float); + TORCH_XPU_API Tensor masked_softmax_kernel( const Tensor& input_, const Tensor& mask_, diff --git a/src/ATen/xpu/EmptyTensor.cpp b/src/ATen/xpu/EmptyTensor.cpp index 3f5e998f8..6411bb221 100644 --- a/src/ATen/xpu/EmptyTensor.cpp +++ b/src/ATen/xpu/EmptyTensor.cpp @@ -54,6 +54,7 @@ TensorBase empty_strided_xpu( IntArrayRef stride, ScalarType dtype, c10::optional device_opt) { + at::globalContext().lazyInitDevice(c10::DeviceType::XPU); const auto device = device_or_default(device_opt); TORCH_INTERNAL_ASSERT(device.is_xpu()); const c10::DeviceGuard device_guard(device); diff --git a/test/regressions/test_safe_softmax.py b/test/regressions/test_safe_softmax.py new file mode 100644 index 000000000..7b390080a --- /dev/null +++ b/test/regressions/test_safe_softmax.py @@ -0,0 +1,44 @@ +import torch +from torch.testing._internal.common_utils import TestCase + +cpu_device = 
torch.device("cpu") +xpu_device = torch.device("xpu") + + +class TestSafeSoftMax(TestCase): + def test_sm(self): + for dtype in [torch.float, torch.float16, torch.bfloat16]: + x_cpu = torch.randn(128,128,128).to(dtype) + x_xpu = x_cpu.to(xpu_device) + r_cpu = torch.ops.aten._safe_softmax(x_cpu, -1) + r_xpu = torch.ops.aten._safe_softmax(x_xpu, -1) + self.assertEqual(r_xpu.to(cpu_device), r_cpu) + x_cpu[0,0,:] = -float("inf") + x_xpu = x_cpu.to(xpu_device) + r_cpu = torch.ops.aten._safe_softmax(x_cpu, -1) + r_xpu = torch.ops.aten._safe_softmax(x_xpu, -1) + self.assertEqual(r_xpu.to(cpu_device), r_cpu) + + x_cpu = torch.randn(128,128,128).to(dtype) + x_xpu = x_cpu.to(xpu_device) + r_cpu = torch.ops.aten._safe_softmax(x_cpu, 1) + r_xpu = torch.ops.aten._safe_softmax(x_xpu, 1) + self.assertEqual(r_xpu.to(cpu_device), r_cpu) + x_cpu[0,:,0] = -float("inf") + x_xpu = x_cpu.to(xpu_device) + r_cpu = torch.ops.aten._safe_softmax(x_cpu, 1) + r_xpu = torch.ops.aten._safe_softmax(x_xpu, 1) + self.assertEqual(r_xpu.to(cpu_device), r_cpu) + + x_cpu = torch.randn(128,128,128).to(dtype) + x_xpu = x_cpu.to(xpu_device) + r_cpu = torch.ops.aten._safe_softmax(x_cpu, 0) + r_xpu = torch.ops.aten._safe_softmax(x_xpu, 0) + self.assertEqual(r_xpu.to(cpu_device), r_cpu) + x_cpu[:,0,0] = -float("inf") + x_xpu = x_cpu.to(xpu_device) + r_cpu = torch.ops.aten._safe_softmax(x_cpu, 0) + r_xpu = torch.ops.aten._safe_softmax(x_xpu, 0) + self.assertEqual(r_xpu.to(cpu_device), r_cpu) + + diff --git a/test/xpu/extended/run_test_with_skip_bmg.py b/test/xpu/extended/run_test_with_skip_bmg.py new file mode 100644 index 000000000..6499550f5 --- /dev/null +++ b/test/xpu/extended/run_test_with_skip_bmg.py @@ -0,0 +1,22 @@ +import os +import pytest +import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_bmg import skip_dict as skip_dict_win_bmg + +IS_WINDOWS = sys.platform == "win32" + +skip_list = skip_dict["test_ops_xpu.py"] +if IS_WINDOWS: + skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_bmg["test_ops_xpu.py"] + +skip_options = "not " + skip_list[0] +for skip_case in skip_list[1:]: + skip_option = " and not " + skip_case + skip_options += skip_option + +os.environ["PYTORCH_TEST_WITH_SLOW"]="1" +test_command = ["-k", skip_options, "test_ops_xpu.py", "-v"] +res = pytest.main(test_command) +sys.exit(res) diff --git a/test/xpu/extended/run_test_with_skip_lnl.py b/test/xpu/extended/run_test_with_skip_lnl.py new file mode 100644 index 000000000..a795ca07a --- /dev/null +++ b/test/xpu/extended/run_test_with_skip_lnl.py @@ -0,0 +1,22 @@ +import os +import pytest +import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_lnl import skip_dict as skip_dict_win_lnl + +IS_WINDOWS = sys.platform == "win32" + +skip_list = skip_dict["test_ops_xpu.py"] +if IS_WINDOWS: + skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_lnl["test_ops_xpu.py"] + +skip_options = "not " + skip_list[0] +for skip_case in skip_list[1:]: + skip_option = " and not " + skip_case + skip_options += skip_option + +os.environ["PYTORCH_TEST_WITH_SLOW"]="1" +test_command = ["-k", skip_options, "test_ops_xpu.py", "-v"] +res = pytest.main(test_command) +sys.exit(res) diff --git a/test/xpu/extended/run_test_with_skip_mtl.py b/test/xpu/extended/run_test_with_skip_mtl.py new file mode 100644 index 000000000..6ed39a64e --- /dev/null +++ b/test/xpu/extended/run_test_with_skip_mtl.py @@ -0,0 +1,22 @@ +import os +import pytest 
+import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_mtl import skip_dict as skip_dict_win_mtl + +IS_WINDOWS = sys.platform == "win32" + +skip_list = skip_dict["test_ops_xpu.py"] +if IS_WINDOWS: + skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_mtl["test_ops_xpu.py"] + +skip_options = "not " + skip_list[0] +for skip_case in skip_list[1:]: + skip_option = " and not " + skip_case + skip_options += skip_option + +os.environ["PYTORCH_TEST_WITH_SLOW"]="1" +test_command = ["-k", skip_options, "test_ops_xpu.py", "-v"] +res = pytest.main(test_command) +sys.exit(res) \ No newline at end of file diff --git a/test/xpu/extended/skip_list_arc.py b/test/xpu/extended/skip_list_arc.py index e1e701b84..c8e26ccf3 100644 --- a/test/xpu/extended/skip_list_arc.py +++ b/test/xpu/extended/skip_list_arc.py @@ -7,5 +7,21 @@ "test_compare_cpu_bincount_xpu_int64", "test_compare_cpu_bincount_xpu_int8", "test_compare_cpu_bincount_xpu_uint8", + # RuntimeError: Kernel is incompatible with all devices in devs + # https://github.com/intel/torch-xpu-ops/issues/1150 + "test_compare_cpu_logcumsumexp_xpu_float16", + "test_compare_cpu_logcumsumexp_xpu_float32", + "test_compare_cpu_nn_functional_pdist_xpu_float32", + "test_compare_cpu_tril_indices_xpu_int32", + "test_compare_cpu_tril_indices_xpu_int64", + "test_compare_cpu_triu_indices_xpu_int32", + "test_compare_cpu_triu_indices_xpu_int64", + "test_backward_logcumsumexp_xpu_float32", + "test_backward_nn_functional_pdist_xpu_float32", + "test_forward_ad_logcumsumexp_xpu_float32", + "test_operator_logcumsumexp_xpu_float32", + "test_operator_nn_functional_pdist_xpu_float32", + "test_view_replay_logcumsumexp_xpu_float32", + "test_view_replay_nn_functional_pdist_xpu_float32", ), } diff --git a/test/xpu/extended/skip_list_common.py b/test/xpu/extended/skip_list_common.py index 6b5fd653e..643d631eb 100644 --- a/test/xpu/extended/skip_list_common.py +++ b/test/xpu/extended/skip_list_common.py @@ -194,5 +194,9 @@ # Greatest absolute difference: 0.0625 at index (1,) (up to 0.001 allowed) # Greatest relative difference: 0.00640869140625 at index (1,) (up to 0.001 allowed) "test_compare_cpu_xlogy_xpu_bfloat16", + "test_compare_cpu_div_trunc_rounding_xpu_float64", + "test_compare_cpu_div_trunc_rounding_xpu_float16", + "test_compare_cpu_div_floor_rounding_xpu_float16", + "test_compare_cpu_div_floor_rounding_xpu_bfloat16", ), } diff --git a/test/xpu/extended/skip_list_win_bmg.py b/test/xpu/extended/skip_list_win_bmg.py new file mode 100644 index 000000000..2ee1dd31e --- /dev/null +++ b/test/xpu/extended/skip_list_win_bmg.py @@ -0,0 +1,13 @@ +skip_dict = { + "test_ops_xpu.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1173 + # Fatal Python error: Illegal instruction + "test_compare_cpu_grid_sampler_2d_xpu_float64", + "test_compare_cpu_cosh_xpu_complex64", + "test_compare_cpu_nn_functional_softshrink_xpu_bfloat16", + "test_compare_cpu_nn_functional_softshrink_xpu_float16", + "test_compare_cpu_nn_functional_softshrink_xpu_float32", + "test_compare_cpu_nn_functional_softshrink_xpu_float64", + "test_compare_cpu_square_xpu_complex128", + ), +} diff --git a/test/xpu/extended/skip_list_win_lnl.py b/test/xpu/extended/skip_list_win_lnl.py new file mode 100644 index 000000000..2ee1dd31e --- /dev/null +++ b/test/xpu/extended/skip_list_win_lnl.py @@ -0,0 +1,13 @@ +skip_dict = { + "test_ops_xpu.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1173 + # Fatal Python error: Illegal instruction + 
"test_compare_cpu_grid_sampler_2d_xpu_float64", + "test_compare_cpu_cosh_xpu_complex64", + "test_compare_cpu_nn_functional_softshrink_xpu_bfloat16", + "test_compare_cpu_nn_functional_softshrink_xpu_float16", + "test_compare_cpu_nn_functional_softshrink_xpu_float32", + "test_compare_cpu_nn_functional_softshrink_xpu_float64", + "test_compare_cpu_square_xpu_complex128", + ), +} diff --git a/test/xpu/extended/skip_list_win_mtl.py b/test/xpu/extended/skip_list_win_mtl.py new file mode 100644 index 000000000..8ec6baac6 --- /dev/null +++ b/test/xpu/extended/skip_list_win_mtl.py @@ -0,0 +1,51 @@ +skip_dict = { + # failed on MTL windows, skip first for Preci + "test_ops_xpu.py": ( + "test_compare_cpu_cosh_xpu_complex128", + "test_compare_cpu_frexp_xpu_bfloat16", + "test_compare_cpu_frexp_xpu_float16", + "test_compare_cpu_frexp_xpu_float32", + "test_compare_cpu_frexp_xpu_float64", + "test_compare_cpu_max_pool2d_with_indices_backward_xpu_bfloat16", + "test_compare_cpu_max_pool2d_with_indices_backward_xpu_float16", + "test_compare_cpu_max_pool2d_with_indices_backward_xpu_float32", + "test_compare_cpu_max_pool2d_with_indices_backward_xpu_float64", + "test_compare_cpu_nn_functional_avg_pool2d_xpu_bfloat16", + "test_compare_cpu_nn_functional_avg_pool3d_xpu_float32", + "test_compare_cpu_nn_functional_avg_pool3d_xpu_float64", + "test_compare_cpu_nn_functional_batch_norm_xpu_float16", + "test_compare_cpu_nn_functional_interpolate_bicubic_xpu_float32", + "test_compare_cpu_nn_functional_interpolate_bicubic_xpu_float64", + "test_compare_cpu_nn_functional_interpolate_bilinear_xpu_float32", + "test_compare_cpu_nn_functional_interpolate_bilinear_xpu_float64", + "test_compare_cpu_nn_functional_max_pool2d_xpu_bfloat16", + "test_compare_cpu_nn_functional_max_pool2d_xpu_float16", + "test_compare_cpu_nn_functional_max_pool2d_xpu_float32", + "test_compare_cpu_nn_functional_max_pool2d_xpu_float64", + "test_compare_cpu_norm_nuc_xpu_complex128", + "test_compare_cpu_norm_nuc_xpu_complex64", + "test_compare_cpu_norm_nuc_xpu_float32", + "test_compare_cpu_norm_nuc_xpu_float64", + "test_compare_cpu_sinh_xpu_complex128", + "test_compare_cpu_softmax_with_dtype_xpu_bfloat16", + "test_compare_cpu_softmax_with_dtype_xpu_complex128", + "test_compare_cpu_softmax_with_dtype_xpu_complex64", + "test_compare_cpu_softmax_with_dtype_xpu_float64", + "test_compare_cpu_softmax_with_dtype_xpu_int32", + "test_compare_cpu_softmax_with_dtype_xpu_int64", + "test_compare_cpu_softmax_with_dtype_xpu_uint8", + "test_compare_cpu_softmax_xpu_float64", + "test_compare_cpu_square_xpu_complex128", + "test_backward_norm_nuc_xpu_float32", + "test_cow_input_norm_nuc_xpu_float32", + "test_forward_ad_norm_nuc_xpu_float32", + "test_operator_norm_nuc_xpu_float32", + "test_view_replay_norm_nuc_xpu_float32", + "test_compare_cpu_nn_functional_avg_pool2d_xpu_float32", + "test_compare_cpu_nn_functional_avg_pool2d_xpu_float64", + "test_compare_cpu_softmax_with_dtype_xpu_bool", + "test_compare_cpu_softmax_with_dtype_xpu_float32", + "test_compare_cpu_softmax_with_dtype_xpu_int16", + "test_compare_cpu_softmax_with_dtype_xpu_int8", + ), +} diff --git a/test/xpu/run_test_with_skip_bmg.py b/test/xpu/run_test_with_skip_bmg.py new file mode 100644 index 000000000..9bd360296 --- /dev/null +++ b/test/xpu/run_test_with_skip_bmg.py @@ -0,0 +1,24 @@ +import os +import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_bmg import skip_dict as skip_dict_win_bmg +from xpu_test_utils import launch_test + + +res = 0 
+IS_WINDOWS = sys.platform == "win32" + +for key in skip_dict: + skip_list = skip_dict[key] + if IS_WINDOWS and key in skip_dict_win: + skip_list += skip_dict_win[key] + if IS_WINDOWS and key in skip_dict_win_bmg: + skip_list += skip_dict_win_bmg[key] + res += launch_test(key, skip_list) + +if os.name == "nt": + sys.exit(res) +else: + exit_code = os.WEXITSTATUS(res) + sys.exit(exit_code) \ No newline at end of file diff --git a/test/xpu/run_test_with_skip_lnl.py b/test/xpu/run_test_with_skip_lnl.py new file mode 100644 index 000000000..4413626ea --- /dev/null +++ b/test/xpu/run_test_with_skip_lnl.py @@ -0,0 +1,24 @@ +import os +import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_lnl import skip_dict as skip_dict_win_lnl +from xpu_test_utils import launch_test + + +res = 0 +IS_WINDOWS = sys.platform == "win32" + +for key in skip_dict: + skip_list = skip_dict[key] + if IS_WINDOWS and key in skip_dict_win: + skip_list += skip_dict_win[key] + if IS_WINDOWS and key in skip_dict_win_lnl: + skip_list += skip_dict_win_lnl[key] + res += launch_test(key, skip_list) + +if os.name == "nt": + sys.exit(res) +else: + exit_code = os.WEXITSTATUS(res) + sys.exit(exit_code) \ No newline at end of file diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py index 7c3aa7f8e..fdf481f9c 100644 --- a/test/xpu/skip_list_common.py +++ b/test/xpu/skip_list_common.py @@ -2,30 +2,62 @@ "test_ops_xpu.py": ( # Skip list of base line - # Need to revisit when the ops are enabled - # AssertionError: The supported dtypes for xxx on device type xpu are incorrect! + # XPU implementation doesn't claimn FP8 now + # https://github.com/intel/torch-xpu-ops/issues/461 + "float8", + + # workarounds for the following tests + # https://github.com/intel/torch-xpu-ops/issues/1214 + "test_python_ref__refs_exp_xpu_complex128", + "test_python_ref__refs_sigmoid_xpu_complex128", + "test_python_ref_executor__refs_log2_executor_aten_xpu_complex128", + "test_python_ref_executor__refs_exp_executor_aten_xpu_complex128", + "test_python_ref_torch_fallback__refs_log2_xpu_complex128", + "test_python_ref_torch_fallback__refs_log10_xpu_complex128", + "test_python_ref_torch_fallback__refs_sigmoid_xpu_complex128", + "test_python_ref_executor__refs_log10_executor_aten_xpu_complex128", + "test_noncontiguous_samples_histogram_xpu_float32", + + # TODO: Fix the following tests + "test_out_warning_torch__scaled_mm_xpu", + + # To be removed from this file. + # CUDA and XPU both XFAIL now. + "test_out_narrow_copy_xpu_float32", + # This case is marked as skip but XPU failed. However, CUDA and XPU throw the same runtime error. + "test_out_histc_xpu_float32", + + # AssertionError: The supported dtypes for __rmod__ on device type xpu are incorrect! + # The following dtypes worked in forward but are not listed by the OpInfo: {torch.int8, torch.int16, torch.int32, torch.int64, torch.uint8}. "test_dtypes___rmod___xpu", + + # Data type is not supported in oneDNN! "test_dtypes_nn_functional_conv1d_xpu", "test_dtypes_nn_functional_conv2d_xpu", "test_dtypes_nn_functional_conv3d_xpu", "test_dtypes_nn_functional_conv_transpose1d_xpu", "test_dtypes_nn_functional_conv_transpose2d_xpu", "test_dtypes_nn_functional_conv_transpose3d_xpu", + + # AssertionError: The supported dtypes for nn.functional.softsign on device type xpu are incorrect! "test_dtypes_nn_functional_softsign_xpu", + + # AssertionError: The supported dtypes for sparse.sampled_addmm on device type xpu are incorrect! 
- OPs not supported "test_dtypes_sparse_sampled_addmm_xpu", - # AssertionError: RuntimeError not raised + + # OPs not supported "test_errors_dot_xpu", - "test_errors_kthvalue_xpu", "test_errors_vdot_xpu", - # Fallback cases with skipCPUIfNoLapack, AssertionError: Tensor-likes are not close! + + # Linalg OPs not supported "test_noncontiguous_samples_linalg_det_xpu_float32", "test_noncontiguous_samples_linalg_slogdet_xpu_float32", "test_noncontiguous_samples_linalg_solve_ex_xpu_float32", "test_noncontiguous_samples_linalg_solve_xpu_float32", "test_noncontiguous_samples_linalg_tensorsolve_xpu_float32", "test_noncontiguous_samples_logdet_xpu_float32", - "test_noncontiguous_samples_nn_functional_conv3d_xpu_complex64", + # Sparse CSR OPs not supported # RuntimeError: device type of values (xpu) must be CPU or CUDA or Meta # https://github.com/intel/torch-xpu-ops/issues/357 "test_compare_cpu_sparse_sampled_addmm_xpu_float32", @@ -51,6 +83,7 @@ "test_noncontiguous_samples_nn_functional_conv1d_xpu_int64", "test_noncontiguous_samples_nn_functional_conv2d_xpu_int64", + # Linalg OPs not supported # RuntimeError: mode only supports CPU AND CUDA device type, got: xpu # Issue https://github.com/intel/torch-xpu-ops/issues/327 "test_numpy_ref_linalg_tensorinv_xpu_float64", @@ -62,19 +95,20 @@ "test_variant_consistency_eager_nn_functional_conv_transpose3d_xpu_complex64", "test_variant_consistency_eager_nn_functional_conv_transpose3d_xpu_float32", - # Need revisit when the op is enabled - # Unexpected success, xpu passed because it compares to cpu + # Linalg OPs not supported "test_compare_cpu_linalg_lu_factor_ex_xpu_float32", "test_compare_cpu_linalg_lu_factor_xpu_float32", "test_compare_cpu_linalg_lu_xpu_float32", + + # XPU hang. CUDA hang as well. + # https://github.com/pytorch/pytorch/issues/79528 "test_compare_cpu_special_hermite_polynomial_h_xpu_float32", - # XFAIL of CUDA and XPU, unexpected success in fallback + # XFAIL of CUDA and XPU, unexpected success in fallback + # Linalg OPs not supported "test_out_cholesky_inverse_xpu_float32", "test_out_geqrf_xpu_float32", - "test_out_narrow_copy_xpu_float32", "test_out_ormqr_xpu_float32", - "test_out_histc_xpu_float32", # XFAIL of CUDA, XPU got unexpected success "test_python_ref__refs_div_no_rounding_mode_xpu_complex32", @@ -87,6 +121,7 @@ "test_python_ref_torch_fallback__refs_pow_xpu_complex32", # unexpected success because of cpu fallback + # Linalg OPs not supported "test_out_triangular_solve_xpu_float32", # Newly added: @@ -107,15 +142,17 @@ "_jiterator_", # https://github.com/intel/torch-xpu-ops/issues/157 # Segfault: - "test_dtypes_nn_functional_linear_xpu", # https://github.com/intel/torch-xpu-ops/issues/157 "test_dtypes_nn_functional_multi_head_attention_forward_xpu", # https://github.com/intel/torch-xpu-ops/issues/157 + + # Linalg OPs not supported "test_dtypes_pca_lowrank_xpu", # https://github.com/intel/torch-xpu-ops/issues/157 "test_dtypes_svd_lowrank_xpu", # https://github.com/intel/torch-xpu-ops/issues/157 + + # RuntimeError: Long is not supported in oneDNN! 
"test_noncontiguous_samples_nn_functional_linear_xpu_int64", # https://github.com/intel/torch-xpu-ops/issues/157 + # https://github.com/intel/torch-xpu-ops/issues/157 - # Failures: - "test_compare_cpu_addmm_xpu_float32", - "test_compare_cpu_addmv_xpu_float32", + # Datatype not supported in oneDNN "test_dtypes_addmm_decomposed_xpu", "test_dtypes_addmm_xpu", "test_dtypes_addmv_xpu", @@ -395,11 +432,13 @@ "test_variant_consistency_eager_svd_xpu_complex64", "test_variant_consistency_eager_tensordot_xpu_complex64", "test_variant_consistency_eager_triangular_solve_xpu_complex64", + # oneDNN issues # RuntimeError: value cannot be converted to type float without overflow # https://github.com/intel/torch-xpu-ops/issues/683 "test_conj_view_addbmm_xpu_complex64", "test_neg_conj_view_addbmm_xpu_complex128", + ### Error #0 in TestMathBitsXPU , RuntimeError: Double and complex datatype matmul is not supported in oneDNN # https://github.com/intel/torch-xpu-ops/issues/254 "test_conj_view___rmatmul___xpu_complex64", @@ -609,32 +648,34 @@ "test_conj_view_svd_lowrank_xpu_complex64", "test_neg_conj_view_pca_lowrank_xpu_complex128", "test_neg_conj_view_svd_lowrank_xpu_complex128", + + # oneDNN issues ### Error #1 in TestMathBitsXPU , RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive # https://github.com/intel/torch-xpu-ops/issues/253 "test_conj_view_nn_functional_conv_transpose2d_xpu_complex64", "test_conj_view_nn_functional_conv_transpose3d_xpu_complex64", "test_neg_view_nn_functional_conv_transpose2d_xpu_float64", "test_neg_view_nn_functional_conv_transpose3d_xpu_float64", - # Op impl aligns with CUDA on the supported dtypes. - # RuntimeError: "avg_pool2d_xpu" not implemented for 'Long'. - # Retrieve the case, once avg_pool1d is supported. Test infra will change claimed dtypes in test case once the op is listed - # in XPU supported operators. Then the case will work. - "test_noncontiguous_samples_nn_functional_avg_pool1d_xpu_int64", - "test_noncontiguous_samples_nn_functional_local_response_norm_xpu_int64", - - # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16. - # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error. 
- #"test_dtypes_polar_xpu", + # implemented aten::histogram to align MPS operators coverage, CUDA doesn't support # but test_dtypes infrastructure leverage CUDA supported datatypes "test_dtypes_histogram_xpu", - # Unexpected success, CUDA got XFAIL because CUDA does not have historgramadd supported" + # Unexpected success, CUDA got XFAIL because CUDA does not have historgramadd supported "test_errors_histogramdd_xpu", # 2025 bundle std::pow complex result is different on host and device "test_python_ref__refs_square_xpu_complex64", "test_python_ref_torch_fallback__refs_square_xpu_complex64", + "test_python_ref_torch_fallback__refs_exp_xpu_complex128", + + # Failed on rolling driver, passed on preci + "test_python_ref__refs_div_trunc_rounding_xpu_float64", + "test_python_ref_executor__refs_div_trunc_rounding_executor_aten_xpu_float64", + "test_python_ref_torch_fallback__refs_div_trunc_rounding_xpu_float64", + + # TODO: passed from source code building version, investigate + "test_python_ref__refs_log2_xpu_complex128", ), "test_binary_ufuncs_xpu.py": ( @@ -661,7 +702,7 @@ "test_autograd_fallback_xpu.py": None, - "test_sort_and_select_xpu.py": ("test_sort_large_slice_xpu",), # Hard code CUDA + "test_sort_and_select_xpu.py": ("test_sort_large_slice_xpu",), # Hard code CUDA, UT has already been rewritten to test/regressions/test_sort.py. "nn/test_embedding_xpu.py": ( # NotImplementedError: Could not run 'aten::_indices' with arguments from the 'SparseXPU' backend. @@ -713,8 +754,12 @@ "test_disable_fastpath_xpu", # We have no mechanism to handle SDPBackend::ERROR so far. Will give a fully support when we support all SDPBackends. "test_dispatch_fails_no_backend_xpu", + + # NestedTensorXPU not supported # Could not run 'aten::_to_copy' with arguments from the 'NestedTensorXPU' backend "test_with_nested_tensor_input_xpu", + + # oneDNN issues # Double and complex datatype matmul is not supported in oneDNN # https://github.com/intel/torch-xpu-ops/issues/253 "test_sdp_math_gradcheck_contiguous_inputs_False_xpu", @@ -920,33 +965,7 @@ "test_cpu_gpu_parity_nn_ConvTranspose2d_xpu_complex32", # CPU fallback fails # RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead. - "test_save_load_nn_GRU_eval_mode_xpu_float32", - "test_save_load_nn_GRUCell_xpu_float32", - "test_save_load_nn_GRU_train_mode_xpu_float32", - # Could not run 'aten::_thnn_fused_lstm_cell' with arguments from the 'CPU' backend. - "_LSTM_", - "_LSTMCell_", - # CPU fallback fails - # Could not run 'aten::_thnn_fused_gru_cell' with arguments from the 'CPU' backend. 
- "test_to_nn_GRUCell_swap_True_set_grad_False_xpu_float32", - "test_to_nn_GRU_eval_mode_swap_True_set_grad_False_xpu_float32", - "test_to_nn_GRU_train_mode_swap_True_set_grad_False_xpu_float32 ", - "test_cpu_gpu_parity_nn_GRUCell_xpu_float32", - "test_cpu_gpu_parity_nn_GRU_eval_mode_xpu_float32", - "test_cpu_gpu_parity_nn_GRU_train_mode_xpu_float32", - "test_forward_nn_GRUCell_xpu_float32", - "test_forward_nn_GRU_eval_mode_xpu_float32", - "test_forward_nn_GRU_train_mode_xpu_float32", - "test_if_train_and_eval_modes_differ_nn_GRUCell_xpu_float32", - "test_memory_format_nn_GRUCell_xpu_float32", - "test_memory_format_nn_GRU_eval_mode_xpu_float32", - "test_memory_format_nn_GRU_train_mode_xpu_float32", - "test_multiple_device_transfer_nn_GRUCell_xpu_float32", - "test_multiple_device_transfer_nn_GRU_eval_mode_xpu_float32", - "test_multiple_device_transfer_nn_GRU_train_mode_xpu_float32", - "test_non_contiguous_tensors_nn_GRUCell_xpu_float32", - "test_non_contiguous_tensors_nn_GRU_eval_mode_xpu_float32", - "test_non_contiguous_tensors_nn_GRU_train_mode_xpu_float32", + # AssertionError: False is not true "test_to_nn_BatchNorm1d_eval_mode_swap_True_set_grad_True_xpu_float32", "test_to_nn_BatchNorm1d_train_mode_swap_True_set_grad_True_xpu_float32", @@ -991,6 +1010,7 @@ "test_type", # rnn fallback to cpu "test_cudnn_weight_format", + # oneDNN issues # AssertionError: MultiheadAttention does not support NestedTensor outside of its fast path. The fast path was not hit because some Tensor argument's device is neither one of cpu, cuda or privateuseone "test_TransformerEncoderLayer_empty_xpu", "test_transformerencoderlayer_xpu_float16", @@ -1015,12 +1035,8 @@ "test_rnn_retain_variables_xpu_float64", "test_transformerencoderlayer_xpu_float64", "test_variable_sequence_xpu_float64", - # AssertionError: RuntimeError not raised - "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_3_mode_bicubic_uint8_xpu_uint8", - "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_3_mode_bilinear_uint8_xpu_uint8", - "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_5_mode_bicubic_uint8_xpu_uint8", - "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_5_mode_bilinear_uint8_xpu_uint8", - # upsamplingNearest2d: Failed: Unexpected success + # Unexpected success: CUDA only test case, launch grid_y == 2**16 (larger than CUDA maximum y-dimension limit 65535) and expect fail. + # SYCL don't have this limitation and hence can pass. "test_upsamplingNearest2d_launch_fail_xpu", # Could not run 'aten::_thnn_fused_lstm_cell' with arguments from the 'CPU' backend. "test_RNN_cudnn_weight_norm", @@ -1040,13 +1056,6 @@ ), "test_indexing_xpu.py": ( - # CPU bias cases - # It is kernel assert on XPU implementation not exception on host. - # We are same as CUDA implementation. And CUDA skips these cases. - "test_trivial_fancy_out_of_bounds_xpu", - # index boundary should be checked. - # https://github.com/intel/torch-xpu-ops/issues/783 - "test_advancedindex_xpu_float64", # XPU implementation doesn't claimn FP8 now # https://github.com/intel/torch-xpu-ops/issues/461 "test_index_put_src_datatype_xpu_float8_e5m2", @@ -1104,8 +1113,6 @@ # Sometimes, will raise AssertionError: "Simulate error" does not match "grad can be implicitly created only for scalar outputs" # https://github.com/intel/torch-xpu-ops/issues/1071 "test_reentrant_parent_error_on_cpu_xpu", - # Could not run 'aten::_thnn_fused_lstm_cell' with arguments from the 'CPU' backend. 
- "test_rnn_backward_to_input_but_not_parameters_xpu", ), "test_reductions_xpu.py": ( @@ -1116,8 +1123,6 @@ "test_unary_ufuncs_xpu.py": ( # AssertionError: Jiterator is only supported on CUDA and ROCm GPUs, none are available. "_jiterator_", - # CPU Fallback fails: Tensor-likes are not close! - "test_reference_numerics_large_tanh_xpu_complex32", # For extreme value processing, Numpy and XPU results are inconsistent # std operations get different behavior on std::complex operarands for extremal cases "test_reference_numerics_extremal__refs_log_xpu_complex64", @@ -1158,7 +1163,8 @@ # Greatest relative difference: 1.9145216356264427e-05 at index (463, 204) (up to 1.3e-06 allowed) "test_reference_numerics_normal__refs_asinh_xpu_complex64", "test_reference_numerics_normal_asinh_xpu_complex64", - # Failed: Unexpected success + "test_batch_vs_slicing__refs_sigmoid_xpu_complex128", + # Unexpected success: CUDA uses thrust::sqrt and has accuracy issue. XPU use std::sqrt and has no issue. "test_reference_numerics_large_rsqrt_xpu_complex32", # Numeric difference # https://github.com/intel/torch-xpu-ops/issues/544 @@ -1178,10 +1184,6 @@ # CUDA XFAIL "test_reference_numerics_large__refs_rsqrt_xpu_complex32", - # Compiler issue in handling tanh with real or imag inf. - # https://github.com/intel/torch-xpu-ops/issues/184, https://jira.devtools.intel.com/browse/CMPLRLIBS-34974 - "test_reference_numerics_large__refs_tanh_xpu_complex32", - # 2025 bundle std::pow complex result is different on host and device "test_exp_xpu_complex64", "test_reference_numerics_extremal__refs_exp2_xpu_complex64", @@ -1193,6 +1195,8 @@ ), "test_masked_xpu.py": ( + # Summary: Sparse CSR for XPU is not supported + # NotImplementedError: Could not run 'aten::_to_sparse_csr' with arguments from the 'SparseXPU' backend. # https://github.com/intel/torch-xpu-ops/issues/357 "test_mask_layout_sparse_coo_masked_amax_xpu_bfloat16", @@ -1329,6 +1333,9 @@ "nn/test_lazy_modules_xpu.py": None, "test_linalg_xpu.py": ( + # Summary: + # All linear algebra related ops are not supported for XPU. + # _convert_weight_to_int4pack not support "_int4_mm_m_", # RuntimeError: Double and complex datatype matmul is not supported in oneDNN @@ -1535,6 +1542,8 @@ # XPU does not support tunable. "test_bmm_tunableop_rocm_xpu_float32", "test_numeric_check_leak_tunableop_rocm_xpu_float32", + "test_dump_results_on_exit_tunableop_xpu_float32", + "test_rotating_buffer_tunableop_xpu_float32", # CUDA bias cases added in latest PyTorch # AttributeError: module 'torch._C' has no attribute '_cuda_tunableop_enable' "test_matmul_check_entries_tunableop_xpu_float16", @@ -1580,6 +1589,8 @@ ), "test_ops_fwd_gradients_xpu.py": ( + # All of the followings are oneDNN issues + # RuntimeError: Double and complex datatype matmul is not supported in oneDNN "test_fn_fwgrad_bwgrad___rmatmul___xpu_complex128", "test_fn_fwgrad_bwgrad___rmatmul___xpu_float64", @@ -1884,6 +1895,8 @@ ), "test_maskedtensor_xpu.py": ( + # Summary: SparseCsrXPU OPs are not supported + # NotImplementedError: Could not run 'aten::_to_sparse_csr' with arguments from the 'SparseXPU' backend. 
# https://github.com/intel/torch-xpu-ops/issues/357 "test_to_dense_xpu", @@ -1987,13 +2000,12 @@ # ACTUAL: array([-1.108163e+12, 1.108163e+12], dtype=float32) # DESIRED: array([-1.108163e+12, 1.090847e+12], dtype=float32) "test_fq_module_per_tensor_xpu", - # AssertionError: False is not true : Expected dScale=tensor([-0.0173], device='xpu:0') to match scale.grad=tensor([0.0189], device='xpu:0') - "test_learnable_backward_per_channel_cuda_xpu", ), "quantization/core/test_workflow_module_xpu.py": None, "quantization/core/test_quantized_tensor_xpu.py": ( + # Summary: Quantized OPs are not supported for XPU # NotImplementedError: Could not run 'aten::dequantize.self' with arguments from the 'QuantizedXPU' backend "test_compare_per_channel_device_numerics_xpu", # NotImplementedError: Could not run 'aten::dequantize.self' with arguments from the 'QuantizedXPU' backend. @@ -2022,6 +2034,8 @@ ), "test_ops_gradients_xpu.py": ( + # All are oneDNN issues + ### Error #0 in TestBwdGradientsXPU , totally 271 , RuntimeError: Double and complex datatype matmul is not supported in oneDNN "test_fn_grad___rmatmul___xpu_complex128", "test_fn_grad___rmatmul___xpu_float64", @@ -2297,11 +2311,13 @@ "test_fn_gradgrad_pca_lowrank_xpu_complex128", "test_fn_gradgrad_svd_lowrank_xpu_complex128", "test_fn_grad_linalg_norm_xpu_complex128", + ### Error #1 in TestBwdGradientsXPU , totally 4 , RuntimeError: value cannot be converted to type float without overflow "test_fn_grad_addbmm_xpu_complex128", "test_fn_gradgrad_addbmm_xpu_complex128", "test_inplace_grad_addbmm_xpu_complex128", "test_inplace_gradgrad_addbmm_xpu_complex128", + ### Error #4 in TestBwdGradientsXPU , totally 8 , RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive "test_fn_grad_nn_functional_conv_transpose2d_xpu_complex128", "test_fn_grad_nn_functional_conv_transpose2d_xpu_float64", @@ -2322,6 +2338,7 @@ ), "test_torch_xpu.py": ( + # 'torch.xpu' has no attribute ... 
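The test_masked/test_maskedtensor skips are summarized above as missing Sparse CSR support on XPU ('aten::_to_sparse_csr' is not implemented for the SparseXPU backend). A guarded sketch of the conversion those tests rely on, assuming an XPU device; depending on the build it may fail at the COO-to-CSR step or earlier:

import torch

if torch.xpu.is_available():
    dense = torch.eye(4, device="xpu")
    try:
        csr = dense.to_sparse().to_sparse_csr()
        print(csr.layout)
    except (NotImplementedError, RuntimeError) as err:
        print(f"CSR conversion unsupported on XPU: {err}")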
### Error #1 in TestTorchDeviceTypeXPU , totally 2 , AttributeError: module 'torch.xpu' has no attribute 'FloatTensor' "test_grad_scaling_state_dict_xpu", ### Error #2 in TestTorchDeviceTypeXPU , totally 1 , AttributeError: 'torch.storage.TypedStorage' object has no attribute 'is_xpu' @@ -2331,6 +2348,7 @@ ### Error #4 in TestTorchDeviceTypeXPU , totally 4 , AttributeError: module 'torch.xpu' has no attribute 'FloatStorage' "test_storage_setitem_xpu_float32", "test_tensor_storage_type_xpu_float32", + ### Error #7 in TestTorchDeviceTypeXPU , totally 1 , TypeError: map2_ is only implemented on CPU tensors "test_broadcast_fn_map2_xpu", ### Error #8 in TestTorchDeviceTypeXPU , totally 1 , TypeError: map_ is only implemented on CPU tensors @@ -2346,16 +2364,8 @@ "test_sync_warning_xpu", ### Error #19 in TestTorchDeviceTypeXPU , totally 1 , RuntimeError: _share_fd_: only available on CPU "test_module_share_memory_xpu", - ### Error #23 in TestTorchDeviceTypeXPU , totally 26 , AssertionError: RuntimeError not raised : expected a non-deterministic error, but it was not raised - "test_nondeterministic_alert_AdaptiveAvgPool2d_xpu", - "test_nondeterministic_alert_CTCLoss_xpu", - "test_nondeterministic_alert_EmbeddingBag_max_xpu", - "test_nondeterministic_alert_MaxPool3d_xpu", - "test_nondeterministic_alert_NLLLoss_xpu", - "test_nondeterministic_alert_interpolate_bilinear_xpu", - "test_nondeterministic_alert_put_accumulate_xpu", - ### Error #24 in TestTorchDeviceTypeXPU , totally 1 , AttributeError: 'TestTorchDeviceTypeXPU' object has no attribute 'check_device_nondeterministic_alert' - "test_nondeterministic_alert_AvgPool3d_xpu", + + # 'torch.xpu' has no attribute ... ### Error #30 in TestTorchDeviceTypeXPU , totally 2 , AttributeError: module 'torch.xpu' has no attribute 'BoolStorage' "test_storage_setitem_xpu_bool", "test_tensor_storage_type_xpu_bool", @@ -2384,11 +2394,7 @@ "test_tensor_storage_type_xpu_bfloat16", ### Error #39 in TestTorchDeviceTypeXPU , totally 1 , AttributeError: module 'torch.xpu' has no attribute 'HalfStorage' "test_tensor_storage_type_xpu_float16", - ### Error #40 in TestTorchDeviceTypeXPU , totally 1 , FAILED test_torch_xpu.py::TestTorch::test_index_add - RuntimeError: expected ... - "test_tensor_storage_type_xpu_uint8", - ### Error #41 in TestTorchDeviceTypeXPU , totally 1 , FAILED test_torch_xpu.py::TestTorch::test_print - AttributeError: module 'tor... - "test_tensor_storage_type_xpu_uint8", - ### Error #42 in TestTorchDeviceTypeXPU , totally 1 , FAILED test_torch_xpu.py::TestTorch::test_storage_error - AttributeError: 'to... + ### Module 'torch.xpu' has no attribute 'ByteStorage' "test_tensor_storage_type_xpu_uint8", # issue 302 , 8 "test_print", @@ -2420,6 +2426,7 @@ ), "test_native_mha_xpu.py": ( + # NestedTensorXPU related OPs # NotImplementedError: Could not run 'aten::_native_multi_head_attention' with arguments from the 'NestedTensorXPU' backend. "test_native_multihead_self_attention_use_nt_False_use_padding_True_pad_all_False_need_weights_False_average_attn_weights_False_fused_False_xpu_float16", "test_native_multihead_self_attention_use_nt_False_use_padding_True_pad_all_False_need_weights_False_average_attn_weights_False_fused_False_xpu_float32", @@ -2476,6 +2483,7 @@ ), "nn/test_convolution_xpu.py": ( + # Summary: all of them are oneDNN related issues # XPU unsupport ops, skip. 
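Most of the test_torch_xpu.py skips above reduce to one gap: the legacy typed tensor/storage aliases that exist under torch.cuda (FloatTensor, FloatStorage, BoolStorage, ...) have no torch.xpu counterparts yet. A quick probe that runs on any recent PyTorch build, using the attribute names taken from the error messages above:

import torch

for name in ("FloatTensor", "FloatStorage", "BoolStorage", "HalfStorage", "ByteStorage"):
    print(f"{name}: cuda={hasattr(torch.cuda, name)} xpu={hasattr(torch.xpu, name)}")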
# https://github.com/intel/torch-xpu-ops/issues/348 "test_cudnn_convolution_relu_xpu_float16", @@ -2507,7 +2515,6 @@ # https://github.com/intel/torch-xpu-ops/issues/774 "_jiterator_", - # RuntimeError: Short is not supported in oneDNN! Need oneDNN's support, suggest to keep skip. "test_dispatch_meta_outplace_nn_functional_linear_xpu_int16", "test_dispatch_symbolic_meta_outplace_nn_functional_linear_xpu_int16", @@ -2519,7 +2526,6 @@ "test_meta_outplace_nn_functional_linear_xpu_int64", # RuntimeError: Double and complex datatype matmul is not supported in oneDNN - "test_dispatch_meta_inplace_addbmm_xpu_complex", "test_dispatch_meta_outplace_addbmm_xpu_complex", "test_dispatch_symbolic_meta_inplace_addbmm_xpu_complex", @@ -3254,7 +3260,10 @@ "test_type_promotion_xpu.py": None, - "test_distributions_xpu.py": None, + "test_distributions_xpu.py": ( + # TODO: Passed on lts driver version, but failed on rolling driver version + "test_gamma_gpu_sample_xpu", + ), "test_optim_xpu.py": ( # oneDNN issues diff --git a/test/xpu/skip_list_win_bmg.py b/test/xpu/skip_list_win_bmg.py new file mode 100644 index 000000000..a91d4f4a5 --- /dev/null +++ b/test/xpu/skip_list_win_bmg.py @@ -0,0 +1,39 @@ +skip_dict = { + # tensor(0.-0.j, device='xpu:0', dtype=torch.complex32) tensor(nan+nanj, device='xpu:0', dtype=torch.complex32) (1.5707964+0j) + "test_unary_ufuncs_xpu.pyy": ( + "test_reference_numerics_small_acos_xpu_complex32", + "test_reference_numerics_small_asin_xpu_complex32", + "test_reference_numerics_small_asinh_xpu_complex32", + "test_reference_numerics_small_atan_xpu_complex32", + "test_reference_numerics_small_atanh_xpu_complex32", + # Need to check compiler std::sin() on inf+infj + "test_reference_numerics_extremal__refs_sin_xpu_complex128", + "test_reference_numerics_extremal__refs_sin_xpu_complex64", + "test_reference_numerics_extremal_nn_functional_tanhshrink_xpu_complex128", + "test_reference_numerics_extremal_nn_functional_tanhshrink_xpu_complex64", + "test_reference_numerics_extremal_sin_xpu_complex128", + "test_reference_numerics_extremal_sin_xpu_complex64", + "test_reference_numerics_extremal_sinh_xpu_complex128", + "test_reference_numerics_extremal_sinh_xpu_complex64", + "test_reference_numerics_large__refs_sin_xpu_complex32", + "test_reference_numerics_large_sin_xpu_complex32", + # Known issue of exp accuracy + # tensor(13437.7000-501.j, device='xpu:0', dtype=torch.complex128) tensor(inf+infj, device='xpu:0', dtype=torch.complex128) (-inf+infj) + "test_reference_numerics_large__refs_exp_xpu_complex128", + "test_reference_numerics_large_exp_xpu_complex128", + "test_reference_numerics_small_exp_xpu_complex32", + ":test_reference_numerics_normal_special_i1_xpu_float32", + "test_reference_numerics_normal_sigmoid_xpu_complex32", + "test_reference_numerics_small_sigmoid_xpu_complex32", + ), + # https://github.com/intel/torch-xpu-ops/issues/1171 + # AssertionError: 'Assertion maxind >= 0 && maxind < outputImageSize failed' not found in '\nAssertHandler::printMessage\n' : The expected error was not found + "nn\test_pooling_xpu.py": ( + "test_MaxUnpool_index_errors_case1_xpu", + "test_MaxUnpool_index_errors_case2_xpu", + "test_MaxUnpool_index_errors_case4_xpu", + "test_MaxUnpool_index_errors_case6_xpu", + "test_MaxUnpool_index_errors_case7_xpu", + "test_MaxUnpool_index_errors_case9_xpu", + ), +} diff --git a/test/xpu/skip_list_win_lnl.py b/test/xpu/skip_list_win_lnl.py new file mode 100644 index 000000000..a9e8bfc3f --- /dev/null +++ b/test/xpu/skip_list_win_lnl.py @@ -0,0 +1,38 @@ +skip_dict = { + # 
tensor(0.-0.j, device='xpu:0', dtype=torch.complex32) tensor(nan+nanj, device='xpu:0', dtype=torch.complex32) (1.5707964+0j) + "test_unary_ufuncs_xpu.pyy": ( + "test_reference_numerics_small_acos_xpu_complex32", + "test_reference_numerics_small_asin_xpu_complex32", + "test_reference_numerics_small_asinh_xpu_complex32", + "test_reference_numerics_small_atan_xpu_complex32", + "test_reference_numerics_small_atanh_xpu_complex32", + # Need to check compiler std::sin() on inf+infj + "test_reference_numerics_extremal__refs_sin_xpu_complex128", + "test_reference_numerics_extremal__refs_sin_xpu_complex64", + "test_reference_numerics_extremal_nn_functional_tanhshrink_xpu_complex128", + "test_reference_numerics_extremal_nn_functional_tanhshrink_xpu_complex64", + "test_reference_numerics_extremal_sin_xpu_complex128", + "test_reference_numerics_extremal_sin_xpu_complex64", + "test_reference_numerics_extremal_sinh_xpu_complex128", + "test_reference_numerics_extremal_sinh_xpu_complex64", + "test_reference_numerics_large__refs_sin_xpu_complex32", + "test_reference_numerics_large_sin_xpu_complex32", + # Known issue of exp accuracy + # tensor(13437.7000-501.j, device='xpu:0', dtype=torch.complex128) tensor(inf+infj, device='xpu:0', dtype=torch.complex128) (-inf+infj) + "test_reference_numerics_large__refs_exp_xpu_complex128", + "test_reference_numerics_large_exp_xpu_complex128", + "test_reference_numerics_small_exp_xpu_complex32", + ":test_reference_numerics_normal_special_i1_xpu_float32", + "test_reference_numerics_normal_sigmoid_xpu_complex32", + ), + # https://github.com/intel/torch-xpu-ops/issues/1171 + # AssertionError: 'Assertion maxind >= 0 && maxind < outputImageSize failed' not found in '\nAssertHandler::printMessage\n' : The expected error was not found + "nn\test_pooling_xpu.py": ( + "test_MaxUnpool_index_errors_case1_xpu", + "test_MaxUnpool_index_errors_case2_xpu", + "test_MaxUnpool_index_errors_case4_xpu", + "test_MaxUnpool_index_errors_case6_xpu", + "test_MaxUnpool_index_errors_case7_xpu", + "test_MaxUnpool_index_errors_case9_xpu", + ), +} diff --git a/test/xpu/test_decomp_xpu.py b/test/xpu/test_decomp_xpu.py index d659197d9..2e39ca90d 100644 --- a/test/xpu/test_decomp_xpu.py +++ b/test/xpu/test_decomp_xpu.py @@ -39,6 +39,7 @@ def _op_assert_ref(test_case, op, test_dtype, i, orig, decomp, ref, args, kwargs (torch.float16, torch.ops.aten.nll_loss_forward.default): 1e-2, (torch.bfloat16, torch.ops.aten.nll_loss_forward.default): 1e-1, (torch.float16, torch.ops.aten.nll_loss2d_forward.default): 1e-2, + (torch.float16, torch.ops.aten.nll_loss2d_backward.default): 1e-4, (torch.bfloat16, torch.ops.aten.nll_loss2d_forward.default): 2e-1, (torch.float16, torch.ops.aten.hardswish.default): 2e-7, (torch.bfloat16, torch.ops.aten.hardswish.default): 2e-7, diff --git a/test/xpu/test_indexing_xpu.py b/test/xpu/test_indexing_xpu.py index d57567318..b4299789e 100644 --- a/test/xpu/test_indexing_xpu.py +++ b/test/xpu/test_indexing_xpu.py @@ -13,6 +13,7 @@ from test_indexing import NumpyTests,TestIndexing import torch + torch.Tensor.is_cuda = torch.Tensor.is_xpu def __test_index_put_accumulate_with_optional_tensors(self, device): # TODO: replace with a better solution. 
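skip_list_win_bmg.py and skip_list_win_lnl.py added above follow the same shape as the existing skip lists: a dict mapping a test file to a tuple of test names to exclude on that Windows platform. The repository's run scripts consume these dicts; the snippet below is only a hypothetical sketch of that consumption (build_pytest_commands and the "-k not ..." deselection strategy are assumptions, not the actual runner):

from skip_list_win_lnl import skip_dict  # the new per-platform skip list

def build_pytest_commands(skip_dict):
    """Turn {test_file: (test_name, ...)} into pytest invocations that
    deselect the listed tests via a '-k' expression."""
    commands = []
    for test_file, skipped in skip_dict.items():
        cmd = ["pytest", "-v", test_file]
        if skipped:
            cmd += ["-k", " and ".join(f"not {name}" for name in skipped)]
        commands.append(cmd)
    return commands

for cmd in build_pytest_commands(skip_dict):
    print(" ".join(cmd[:3]), "...")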
diff --git a/test/xpu/test_torch_xpu.py b/test/xpu/test_torch_xpu.py index 9c54ffdcc..8dce5989c 100644 --- a/test/xpu/test_torch_xpu.py +++ b/test/xpu/test_torch_xpu.py @@ -1439,8 +1439,10 @@ def test_nondeterministic_alert_AvgPool3d(self, device): res = module(input) grad = torch.ones_like(res) - self.check_device_nondeterministic_alert(grad, 'avg_pool3d_backward') - + self.check_nondeterministic_alert( + lambda: res.backward(grad, retain_graph=True), + 'avg_pool3d_backward_' + torch.device(device).type, + torch.device(device).type == 'cuda' or torch.device(device).type == 'xpu') @skipIfMPS @skipIfTorchInductor("https://github.com/pytorch/pytorch/issues/113707") @@ -1478,7 +1480,7 @@ def test_nondeterministic_alert_MaxPool3d(self, device): self.check_nondeterministic_alert( lambda: res.backward(grad, retain_graph=True), - 'max_pool3d_with_indices_backward' + torch.device(device).type, + 'max_pool3d_with_indices_backward_' + torch.device(device).type, torch.device(device).type == 'cuda' or torch.device(device).type == 'xpu') @skipIfMPS @@ -1770,10 +1772,9 @@ def test_nondeterministic_alert_NLLLoss(self, device): input = torch.randn(2, 3, 5, 5, device=device) target = torch.rand(2, 5, 5, device=device).mul(3).floor().long() - self.check_nondeterministic_alert( lambda: module(input, target), - 'nll_loss2d_forward_out_' + torch.device(device).type + '_template', + 'nll_loss2d_forward_' + torch.device(device).type, torch.device(device).type == 'cuda' or torch.device(device).type == 'xpu') @skipIfTorchInductor("https://github.com/pytorch/pytorch/issues/113707") @@ -1788,7 +1789,7 @@ def test_nondeterministic_alert_CTCLoss(self, device): self.check_nondeterministic_alert( lambda: res.backward(grad, retain_graph=True), - 'ctc_loss_backward_gpu', + 'ctc_loss_backward_' + torch.device(device).type, torch.device(device).type == 'cuda' or torch.device(device).type == 'xpu') @skipIfTorchInductor("https://github.com/pytorch/pytorch/issues/113707") diff --git a/test/xpu/test_unary_ufuncs_xpu.py b/test/xpu/test_unary_ufuncs_xpu.py index 0e05a8e7c..a6c12a2ad 100644 --- a/test/xpu/test_unary_ufuncs_xpu.py +++ b/test/xpu/test_unary_ufuncs_xpu.py @@ -1,6 +1,7 @@ # Owner(s): ["module: intel"] -from torch.testing._internal.common_device_type import instantiate_device_type_tests +import torch +from torch.testing._internal.common_device_type import instantiate_device_type_tests, onlyXPU from torch.testing._internal.common_utils import run_tests try: @@ -11,6 +12,38 @@ with XPUPatchForImport(False): from test_unary_ufuncs import TestUnaryUfuncs + @onlyXPU + def _nonzero_static_large(self, device): + # large enough to have multiple iters per SM even on H100 + # with 132 sms + size_inp = 1024 * 16 * 132 + 1024 * 16 + x = torch.zeros(size_inp, device=device) + # unique indices + indices = torch.randperm(size_inp, device=device)[: size_inp // 2] + sorted, _ = torch.sort(indices) + x[sorted] = 1 + res = torch.nonzero_static(x, size=size_inp // 2).view(-1) + self.assertEqual(res, sorted) + # no oob writes + out = torch.full((size_inp,), 10, device=device, dtype=torch.int64) + res = torch.nonzero_static(x, size=size_inp // 4, out=out[: size_inp // 2]) + self.assertEqual(out[: size_inp // 4], sorted[: size_inp // 4]) + self.assertEqual( + out[size_inp // 4 :], + torch.tensor(10, device="xpu").expand_as(out[size_inp // 4 :]), + ) + # correct fill for 2d + x = x.view(2, size_inp // 2) + ref = x.nonzero() + res = x.nonzero_static(size=size_inp // 2 + 2) + self.assertEqual(res.shape, [size_inp // 2 + 2, 2]) + 
self.assertEqual(ref, res[: size_inp // 2]) + self.assertEqual( + res[size_inp // 2 :], + torch.tensor(-1, device="xpu").expand_as(res[size_inp // 2 :]), + ) + TestUnaryUfuncs.test_nonzero_static_large = _nonzero_static_large + instantiate_device_type_tests(TestUnaryUfuncs, globals(),only_for=("xpu"), allow_xpu=True) if __name__ == "__main__": diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 8dc208ed4..4f8ef5635 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -223,6 +223,8 @@ "nn.functional.ctc_loss", "nn.functional.channel_shuffle", "nn.functional.multi_head_attention_forward", + "nn.GRUCell", + "nn.LSTMCell", "sigmoid", "logsigmoid", "sgn", diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml index d5e307cdf..f19a57c7f 100644 --- a/yaml/native/native_functions.yaml +++ b/yaml/native/native_functions.yaml @@ -2036,6 +2036,10 @@ dispatch: XPU: softmax_xpu_out +- func: _safe_softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor + dispatch: + XPU: _safe_softmax_xpu + - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor structured_delegate: _softmax_backward_data.out @@ -5988,12 +5992,6 @@ XPU: native_multi_head_attention_xpu autogen: _native_multi_head_attention.out -# This aten function is kept so that we can test the choice function from Python -- func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None, bool enable_gqa=False) -> int - dispatch: - XPU: _fused_sdp_choice_xpu - tags: nondeterministic_seeded - - func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor structured_delegate: argmin.out device_check: NoCheck # TensorIterator @@ -7597,6 +7595,34 @@ dispatch: XPU: ctc_loss_backward_tensor +- func: lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor) + +- func: gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor + +# Fused RNN kernels +- func: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor, Tensor) + dispatch: + XPU: _thnn_fused_lstm_cell_xpu + autogen: _thnn_fused_lstm_cell.out + +# NB: The composite version of this function below is a simple wrapper that duplicates some of the outputs +# It is necessary to avoid triggering TensorImpl use count checks in debug mode +# NB: this is function is NOT differentiable +- func: _thnn_fused_lstm_cell_backward_impl(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor) + dispatch: + XPU: _thnn_fused_lstm_cell_backward_xpu + autogen: _thnn_fused_lstm_cell_backward_impl.out + +- func: _thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor) + dispatch: + XPU: _thnn_fused_gru_cell_xpu + autogen: _thnn_fused_gru_cell.out + +- func: _thnn_fused_gru_cell_backward(Tensor grad_hy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + XPU: _thnn_fused_gru_cell_backward_xpu + autogen: _thnn_fused_gru_cell_backward.out + - func: hardshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) 
   structured: True
   structured_inherits: TensorIteratorBase
@@ -8209,17 +8235,18 @@
   variants: function
   tags: pointwise

-- func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: rrelu_with_noise.out(Tensor self, Tensor(b!) noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
   tags: nondeterministic_seeded
   dispatch:
     XPU: rrelu_with_noise_out_xpu

-- func: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
+- func: rrelu_with_noise(Tensor self, Tensor(b!) noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
   python_module: nn
   dispatch:
     XPU: rrelu_with_noise_xpu
   tags: nondeterministic_seeded
+  autogen: rrelu_with_noise_functional

 - func: rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor
   python_module: nn
@@ -8227,7 +8254,7 @@
     CompositeExplicitAutograd: rrelu_with_noise_backward
   autogen: rrelu_with_noise_backward.out

-- func: rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)
+- func: rrelu_with_noise_(Tensor(a!) self, Tensor(b!) noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)
   python_module: nn
   tags: nondeterministic_seeded
   dispatch: