From 4db0b0cd1ca51d9cfd890be2eb3527b165782220 Mon Sep 17 00:00:00 2001
From: Feng Yuan
Date: Tue, 9 Jul 2024 22:05:35 +0800
Subject: [PATCH 1/5] [PyTorch uplift breaks XPU] Follow utils usage of latest PyTorch definition (#545)

Signed-off-by: Feng Yuan
---
 src/ATen/native/xpu/sycl/IndexingUtils.h |  2 +-
 test/xpu/extended/run_test_with_skip.py  | 12 ++++++++++++
 test/xpu/run_test_with_skip.py           | 16 +++++++---------
 3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/IndexingUtils.h b/src/ATen/native/xpu/sycl/IndexingUtils.h
index 1c6d9c373..26eb2f1ea 100644
--- a/src/ATen/native/xpu/sycl/IndexingUtils.h
+++ b/src/ATen/native/xpu/sycl/IndexingUtils.h
@@ -99,7 +99,7 @@ static std::tuple computeLinearIndex(
 static std::
     tuple>
     makeLinearIndex(Tensor self, IOptTensorListRef orig, bool check_range) {
-  checkIndexTensorTypes(orig, /*allow_int*/ true);
+  checkIndexTensorTypes(orig);
   // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more
   // LongTensors
   auto indices = expandTensors(self, orig);
diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py
index c7c2ff404..943d46465 100644
--- a/test/xpu/extended/run_test_with_skip.py
+++ b/test/xpu/extended/run_test_with_skip.py
@@ -128,6 +128,18 @@
     # Greatest absolute difference: 0.03125 at index (610,) (up to 0.001 allowed)
     # Greatest relative difference: 0.00396728515625 at index (610,) (up to 0.001 allowed)
     "test_compare_cpu_hypot_xpu_bfloat16",
+
+    # Regressions due to PyTorch uplift (Numeric difference in float and bfloat)
+    # https://github.com/intel/torch-xpu-ops/issues/549
+    # Example fail log
+    # FAILED test_ops_xpu.py::TestCommonXPU::test_compare_cpu_nn_functional_batch_norm_xpu_float16 - AssertionError: Tensor-likes are not close!
+    # Mismatched elements: 3 / 72 (4.2%)
+    # Greatest absolute difference: 0.0029296875 at index (0, 1, 1, 0) (up to 0.001 allowed)
+    # Greatest relative difference: 0.0032501220703125 at index (2, 1, 2, 1) (up to 0.001 allowed)
+    "test_compare_cpu_nn_functional_batch_norm_xpu_float16",
+    "test_compare_cpu_std_mean_xpu_bfloat16",
+    "test_compare_cpu_sub_xpu_float16",
+    "test_compare_cpu_var_mean_xpu_bfloat16",
 )
diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py
index ea50fbc29..9e8b02d3c 100644
--- a/test/xpu/run_test_with_skip.py
+++ b/test/xpu/run_test_with_skip.py
@@ -2995,23 +2995,21 @@ def launch_test(test_case, skip_list=None, exe_list=None):
 res += launch_test("nn/test_convolution_xpu.py", skip_list)

 # test_dynamic_shapes
-
-
-res += launch_test("test_dynamic_shapes_xpu.py")
+skip_list = (
+    # Regression after PyTorch uplift
+    # https://github.com/intel/torch-xpu-ops/issues/549
+    # AssertionError: 3 != 3.0
+    "test_symnode_hashing",
+)
+res += launch_test("test_dynamic_shapes_xpu.py", skip_list)

 # test_load_state_dict
-
-
 res += launch_test("nn/test_load_state_dict_xpu.py")

 # test_module_hooks
-
-
 res += launch_test("nn/test_module_hooks_xpu.py")

 # test_parametrization
-
-
 res += launch_test("nn/test_parametrization_xpu.py")

 exit_code = os.WEXITSTATUS(res)

From 1dcaf3eaa27aaf893b7630483ce97c81e1364fcb Mon Sep 17 00:00:00 2001
From: Feng Yuan
Date: Wed, 10 Jul 2024 13:15:17 +0800
Subject: [PATCH 2/5] build: fix -Wunused-function warning (#554)

PyTorch enforced unused-function and unused-variable compilation options recently. It leads to torch-xpu-ops building failure with -Werror.
Fixing: https://github.com/pytorch/pytorch/pull/130084

Signed-off-by: Feng Yuan
---
 src/ATen/native/xpu/XPUFallback.template | 4 ----
 test/xpu/run_test_with_skip.py           | 5 +++++
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template
index fa7fafe13..7bfdd6abd 100644
--- a/src/ATen/native/xpu/XPUFallback.template
+++ b/src/ATen/native/xpu/XPUFallback.template
@@ -109,10 +109,6 @@ static void xpu_lazy_registration_or_error_fallback(
   }
 }
-static void xpu_force_fallback(
-    const c10::OperatorHandle& op,
-    torch::jit::Stack* stack) {}
-
 TORCH_LIBRARY_IMPL(_, XPU, m) {
   static const char* enable_xpu_fallback =
       getenv("PYTORCH_ENABLE_XPU_FALLBACK");
diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py
index 9e8b02d3c..28a073bd4 100644
--- a/test/xpu/run_test_with_skip.py
+++ b/test/xpu/run_test_with_skip.py
@@ -1380,6 +1380,11 @@ def launch_test(test_case, skip_list=None, exe_list=None):
     # https://github.com/intel/torch-xpu-ops/issues/461
     "test_index_put_src_datatype_xpu_float8_e5m2",
     "test_index_put_src_datatype_xpu_float8_e4m3fn",
+
+    # Regression after PyTorch update
+    # http://github.com/intel/torch-xpu-ops/issues/549
+    # IndexError: tensors used as indices must be long, byte or bool tensors.
+    "test_index_ind_dtype_xpu",
 )
 res += launch_test("test_indexing_xpu.py", skip_list)

From 5473e8d19535bc072500bc6e2aa74aa3f1354be1 Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Wed, 10 Jul 2024 14:28:53 +0800
Subject: [PATCH 3/5] enhance ci workflow

---
 .github/scripts/apply_torch_pr.py      | 64 ++++++++++++++++++--------
 .github/workflows/nightly_ondemand.yml |  2 +-
 2 files changed, 46 insertions(+), 20 deletions(-)

diff --git a/.github/scripts/apply_torch_pr.py b/.github/scripts/apply_torch_pr.py
index d0ab9a163..9ef238abb 100644
--- a/.github/scripts/apply_torch_pr.py
+++ b/.github/scripts/apply_torch_pr.py
@@ -25,7 +25,7 @@
 # check reverted PR is in current code base or not
 def check_reverted_reopen(pr_info):
-    git_cmd = "git log nightly -n 1 2>&1 |grep 'nightly release' |head -1 |sed 's/.*(//;s/).*//' || git rev-parse HEAD"
+    git_cmd = "((git log -n 1 2>&1 |grep 'nightly release' |head -1 |sed 's/.*(//;s/).*//' || true) && git rev-parse HEAD) |head -n 1"
     git_info = subprocess.Popen(git_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
     main_commit = git_info.communicate()[0].decode("utf-8").replace("\n", "")
     revert_cmd = "cur_cmt=$(git rev-parse HEAD) && git fetch origin main > /dev/null 2>&1 && " + \
@@ -40,6 +40,39 @@ def check_reverted_reopen(pr_info):
         reverted = False
     return reverted
+def check_merged(pr_info):
+    git_cmd = "((git log -n 1 2>&1 |grep 'nightly release' |head -1 |sed 's/.*(//;s/).*//' || true) && git rev-parse HEAD) |head -n 1"
+    git_info = subprocess.Popen(git_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
+    main_commit = git_info.communicate()[0].decode("utf-8").replace("\n", "")
+    merge_cmd = "cur_cmt=$(git rev-parse HEAD) && git fetch origin main > /dev/null 2>&1 && " + \
+        "git checkout " + main_commit + " > /dev/null 2>&1 && " + \
+        "git log |grep 'resolved: " + pr_info["html_url"] + "' || true && " + \
+        "git checkout $cur_cmt > /dev/null 2>&1"
+    merge_info = subprocess.Popen(merge_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
+    merge_msg = merge_info.communicate()[0].decode("utf-8")
+    if "resolved: " + pr_info["html_url"] in merge_msg:
+        merged = True
+    else:
+        merged = False
+    return merged
+
+def appyly_pr(pr_info, re_apply_msg):
+    # get pr diff
+    pr_file = pr_info["diff_url"].split("/")[-1]
+    urllib.request.urlretrieve(pr_info["diff_url"], pr_file)
+    # apply diff
+    apply_cmd = "git apply --3way " + pr_file + " && rm -f " + pr_file
+    apply_info = subprocess.Popen(apply_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
+    apply_message = apply_info.communicate()[0].decode("utf-8")
+    apply_status = apply_info.returncode
+    # apply status
+    if apply_status == 0:
+        print("{} {}, applied got SUCCESSFUL".format(pr_info["diff_url"], re_apply_msg))
+    else:
+        print("{} {}, applied got FAILED".format(pr_info["diff_url"], apply_message))
+        print(apply_status, apply_message)
+        exit(1)
+
 # headers = {'Authorization': 'Bearer ' + args.token} if args.token != None else args.token
 pr_list = args.pr_list + args.extra_pr_list
@@ -53,7 +86,7 @@ def check_reverted_reopen(pr_info):
     if pr_info["state"].lower() == "open":
         # for reverted PR
         reverted_id = next((item["id"] for item in pr_info["labels"] if item["name"] == "Reverted"), -1)
-        re_apply_msg = ""
+        re_apply_msg = "is opened"
         if reverted_id != -1:
             reverted = check_reverted_reopen(pr_info)
             # skip if PR not reverted but re-open in current code base
             if not reverted:
                 print("{} is re-open but not reverted, no need to apply".format(pr_info["diff_url"]))
                 continue
             else:
-                re_apply_msg = "is re-opened & reverted,"
-        # get pr diff
-        pr_file = pr_info["diff_url"].split("/")[-1]
-        urllib.request.urlretrieve(pr_info["diff_url"], pr_file)
-        # apply diff
-        apply_cmd = "git apply --3way " + pr_file + " && rm -f " + pr_file
-        apply_info = subprocess.Popen(apply_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
-        apply_message = apply_info.communicate()[0].decode("utf-8")
-        apply_status = apply_info.returncode
-        # apply status
-        if apply_status == 0:
-            print("{} {} applied got SUCCESSFUL".format(pr_info["diff_url"], re_apply_msg))
-        else:
-            print("{} applied got FAILED".format(pr_info["diff_url"]))
-            print(apply_status, apply_message)
-            exit(1)
+                re_apply_msg = "is re-opened and reverted,"
+        appyly_pr(pr_info, re_apply_msg)
     elif pr_info["state"].lower() == "closed":
-        print("{} is ClOSED, no need to apply".format(pr_info["diff_url"]))
+        merged_id = next((item["id"] for item in pr_info["labels"] if item["name"] == "Merged"), -1)
+        re_apply_msg = "is closed but not merged"
+        if merged_id != -1:
+            merged = check_merged(pr_info)
+            if merged:
+                print("{} is closed and merged, no need to apply".format(pr_info["diff_url"]))
+                continue
+        appyly_pr(pr_info, re_apply_msg)
     else:
         print("{} is {}, no need to apply".format(pr_info["diff_url"], pr_info["state"]))
         exit(1)
diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml
index 039407bc8..686aa059a 100644
--- a/.github/workflows/nightly_ondemand.yml
+++ b/.github/workflows/nightly_ondemand.yml
@@ -17,7 +17,7 @@ on:
         default: 'false'
         description: Keep torch-xpu-ops pin. `true` means use pined commit
       ut:
-        required: true
+        required: false
         type: string
         default: 'torch_xpu'
         description: UT scope. `op_example,op_extended,op_ut,torch_xpu` Delimiter is comma

From 87b4b116c7f481a90e7ca570a718a88f5a6cf8cc Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Wed, 10 Jul 2024 14:34:19 +0800
Subject: [PATCH 4/5] update

---
 .github/workflows/nightly_ondemand.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml
index 686aa059a..039407bc8 100644
--- a/.github/workflows/nightly_ondemand.yml
+++ b/.github/workflows/nightly_ondemand.yml
@@ -17,7 +17,7 @@ on:
         default: 'false'
         description: Keep torch-xpu-ops pin. `true` means use pined commit
       ut:
-        required: false
+        required: true
         type: string
         default: 'torch_xpu'
         description: UT scope. `op_example,op_extended,op_ut,torch_xpu` Delimiter is comma

From 7283c7519042ecb35b2dc2dbbb7e2ac5e2c579a0 Mon Sep 17 00:00:00 2001
From: mengfeil
Date: Wed, 10 Jul 2024 15:21:20 +0800
Subject: [PATCH 5/5] modify expected accuracy check

---
 .github/ci_expected_accuracy/check_expected.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/ci_expected_accuracy/check_expected.py b/.github/ci_expected_accuracy/check_expected.py
index 060d83f1f..5339f4ce3 100644
--- a/.github/ci_expected_accuracy/check_expected.py
+++ b/.github/ci_expected_accuracy/check_expected.py
@@ -48,9 +48,9 @@
         passed_models.append([model_name, test_accuracy])
         if refer_accuracy == "N/A":
             new_models.append([model_name, test_accuracy])
-            refer_data.loc[refer_data.tail(1).index.tolist()[0] + 1,:] = "N/A"
-            refer_data.at[refer_data.tail(1).index, "name"] = model_name
-            refer_data.at[refer_data.tail(1).index, args.dtype] = test_accuracy
+            refer_data.loc[len(refer_data),:] = "N/A"
+            refer_data.at[len(refer_data) - 1, "name"] = model_name
+            refer_data.at[len(refer_data) - 1, args.dtype] = test_accuracy
         elif 'pass' not in refer_accuracy:
             new_pass_models.append([model_name, test_accuracy])
             refer_data.at[refer_row[0], args.dtype] = test_accuracy
         if refer_accuracy == "N/A":
             new_models.append([model_name, test_accuracy])
             real_failed_models.append([model_name, test_accuracy])
-            refer_data.loc[refer_data.tail(1).index.tolist()[0] + 1,:] = "N/A"
-            refer_data.at[refer_data.tail(1).index, "name"] = model_name
-            refer_data.at[refer_data.tail(1).index, args.dtype] = test_accuracy
+            refer_data.loc[len(refer_data),:] = "N/A"
+            refer_data.at[len(refer_data) - 1, "name"] = model_name
+            refer_data.at[len(refer_data) - 1, args.dtype] = test_accuracy
         elif "pass" in refer_accuracy:
             real_failed_models.append([model_name, test_accuracy])
         else:
 print("Pass rate: {:.2f}%".format(len(passed_models) / len(model_names) * 100))
 if len(new_pass_models + new_models) > 0:
-    print("NOTE: New models result, please update the reference", new_pass_models)
+    print("NOTE: New models result, please update the reference", new_pass_models, new_models)
 if args.update:
     refer_data.to_csv(refer_file, sep=',', encoding='utf-8', index=False)
     print("Updated. Now, confirm the changes to .csvs and `git add` them if satisfied.")
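
Note on the check_expected.py change in the last patch: refer_data is a pandas DataFrame, and the old code derived the label for a newly appended row from refer_data.tail(1).index, which raises an IndexError on an empty table and passes a whole Index object to .at[] (which takes a single scalar label). Below is a minimal, self-contained sketch of the len()-based append pattern the patch switches to; the two-column table and the record_result helper are illustrative stand-ins, not code from the repository.

    import pandas as pd

    # Illustrative stand-in for the reference table loaded by check_expected.py.
    refer_data = pd.DataFrame(columns=["name", "float32"])

    def record_result(df, model_name, dtype, accuracy):
        # On a default RangeIndex, len(df) is the next unused integer label,
        # so .loc can append a new row even while the frame is still empty.
        df.loc[len(df), :] = "N/A"
        # .at[] takes one scalar row label; len(df) - 1 is the row just added.
        df.at[len(df) - 1, "name"] = model_name
        df.at[len(df) - 1, dtype] = accuracy

    record_result(refer_data, "resnet50", "float32", "pass")
    print(refer_data)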