Skip to content

Commit 12b4a67

Browse files
jithunnair-amd authored and BLOrange-AMD committed
PyTorch unit test helper scripts enhancements (#1517)
* Fail earlier for distributed-on-1-GPU scenario
* print cmd in consolidated log with prettier formatting
* python->python3

Fixes https://ontrack-internal.amd.com/browse/SWDEV-477264

---------

Co-authored-by: blorange-amd <[email protected]>
1 parent 87728a1 commit 12b4a67

File tree

1 file changed

+26
-13
lines changed

1 file changed

+26
-13
lines changed

.automation_scripts/run_pytorch_unit_tests.py

+26-13
Original file line numberDiff line numberDiff line change
@@ -217,13 +217,12 @@ def summarize_xml_files(path, workflow_name):
217217
return res
218218

219219
def run_command_and_capture_output(cmd):
220-
if os.environ['TEST_CONFIG'] == 'distributed':
221-
p = subprocess.run("rocminfo | grep -cE 'Name:\s+gfx'", shell=True, capture_output=True, text=True)
222-
num_gpus_visible = int(p.stdout)
223-
assert num_gpus_visible > 1, "Number of visible GPUs should be >1 to run TEST_CONFIG=distributed"
224220
try:
225221
print(f"Running command '{cmd}'")
226222
with open(CONSOLIDATED_LOG_FILE_PATH, "a+") as output_file:
223+
print(f"========================================", file=output_file, flush=True)
224+
print(f"[RUN_PYTORCH_UNIT_TESTS] Running command '{cmd}'", file=output_file, flush=True) # send to consolidated file as well
225+
print(f"========================================", file=output_file, flush=True)
227226
p = subprocess.run(cmd, shell=True, stdout=output_file, stderr=STDOUT, text=True)
228227
except CalledProcessError as e:
229228
print(f"ERROR: Cmd {cmd} failed with return code: {e.returncode}!")
@@ -261,7 +260,7 @@ def run_priority_tests(workflow_name, test_run_test_path, overall_logs_path_curr
261260
copied_logs_path = overall_logs_path_current_run + "default_xml_results_priority_tests/"
262261
# use run_test.py for tests execution
263262
default_priority_test_suites = " ".join(DEFAULT_CORE_TESTS)
264-
command = "python " + test_run_test_path + " --include " + default_priority_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose"
263+
command = "python3 " + test_run_test_path + " --include " + default_priority_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose"
265264
run_command_and_capture_output(command)
266265
del os.environ['HIP_VISIBLE_DEVICES']
267266
elif workflow_name == "distributed":
@@ -270,7 +269,7 @@ def run_priority_tests(workflow_name, test_run_test_path, overall_logs_path_curr
270269
copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_priority_tests/"
271270
# use run_test.py for tests execution
272271
distributed_priority_test_suites = " ".join(DISTRIBUTED_CORE_TESTS)
273-
command = "python " + test_run_test_path + " --include " + distributed_priority_test_suites + " --distributed-tests --verbose"
272+
command = "python3 " + test_run_test_path + " --include " + distributed_priority_test_suites + " --distributed-tests --verbose"
274273
run_command_and_capture_output(command)
275274
del os.environ['HIP_VISIBLE_DEVICES']
276275
copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path)
@@ -290,7 +289,7 @@ def run_selected_tests(workflow_name, test_run_test_path, overall_logs_path_curr
290289
copied_logs_path = overall_logs_path_current_run + "default_xml_results_selected_tests/"
291290
# use run_test.py for tests execution
292291
default_selected_test_suites = " ".join(selected_list)
293-
command = "python " + test_run_test_path + " --include " + default_selected_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose"
292+
command = "python3 " + test_run_test_path + " --include " + default_selected_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose"
294293
run_command_and_capture_output(command)
295294
del os.environ['HIP_VISIBLE_DEVICES']
296295
elif workflow_name == "distributed":
@@ -299,7 +298,7 @@ def run_selected_tests(workflow_name, test_run_test_path, overall_logs_path_curr
299298
copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_selected_tests/"
300299
# use run_test.py for tests execution
301300
distributed_selected_test_suites = " ".join(selected_list)
302-
command = "python " + test_run_test_path + " --include " + distributed_selected_test_suites + " --distributed-tests --verbose"
301+
command = "python3 " + test_run_test_path + " --include " + distributed_selected_test_suites + " --distributed-tests --verbose"
303302
run_command_and_capture_output(command)
304303
del os.environ['HIP_VISIBLE_DEVICES']
305304
elif workflow_name == "inductor":
@@ -316,11 +315,11 @@ def run_selected_tests(workflow_name, test_run_test_path, overall_logs_path_curr
316315
non_inductor_selected_test_suites += " "
317316
if inductor_selected_test_suites != "":
318317
inductor_selected_test_suites = inductor_selected_test_suites[:-1]
319-
command = "python " + test_run_test_path + " --include " + inductor_selected_test_suites + " --verbose"
318+
command = "python3 " + test_run_test_path + " --include " + inductor_selected_test_suites + " --verbose"
320319
run_command_and_capture_output(command)
321320
if non_inductor_selected_test_suites != "":
322321
non_inductor_selected_test_suites = non_inductor_selected_test_suites[:-1]
323-
command = "python " + test_run_test_path + " --inductor --include " + non_inductor_selected_test_suites + " --verbose"
322+
command = "python3 " + test_run_test_path + " --inductor --include " + non_inductor_selected_test_suites + " --verbose"
324323
run_command_and_capture_output(command)
325324
copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path)
326325
selected_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name)
@@ -371,10 +370,19 @@ def run_test_and_summarize_results(
371370
global CONSOLIDATED_LOG_FILE_PATH
372371
CONSOLIDATED_LOG_FILE_PATH = overall_logs_path_current_run + CONSOLIDATED_LOG_FILE_NAME
373372

373+
# Check multi gpu availability if distributed tests are enabled
374+
if ("distributed" in args.test_config) or len(args.distributed_list) != 0:
375+
check_num_gpus_for_distributed();
376+
377+
# Install test requirements
378+
command = "pip3 install -r requirements.txt && pip3 install -r .ci/docker/requirements-ci.txt"
379+
run_command_and_capture_output(command)
380+
374381
# Run entire tests for each workflow
375382
if not priority_tests and not default_list and not distributed_list and not inductor_list:
376383
# run entire tests for default, distributed and inductor workflows → use test.sh
377384
if not test_config:
385+
check_num_gpus_for_distributed();
378386
# default test process
379387
res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src)
380388
res_all_tests_dict["default"] = res_default_all
@@ -465,11 +473,16 @@ def parse_args():
465473
" test_file_and_status(file_name='test_file_name_1', status='SKIPPED'): {}, \n"
466474
" test_file_and_status(file_name='test_file_name_1', status='STATISTICS'): {} \n"
467475
"}}\n")
468-
parser.add_argument('--example_usages', type=str, help="RUN ALL TESTS: python run_pytorch_unit_tests.py \n"
469-
"RUN PRIORITY TESTS: python run_pytorch_unit_tests.py --test_config distributed --priority_test \n"
470-
"RUN SELECTED TESTS: python run_pytorch_unit_tests.py --default_list test_weak test_dlpack --inductor_list inductor/test_torchinductor")
476+
parser.add_argument('--example_usages', type=str, help="RUN ALL TESTS: python3 run_pytorch_unit_tests.py \n"
477+
"RUN PRIORITY TESTS: python3 run_pytorch_unit_tests.py --test_config distributed --priority_test \n"
478+
"RUN SELECTED TESTS: python3 run_pytorch_unit_tests.py --default_list test_weak test_dlpack --inductor_list inductor/test_torchinductor")
471479
return parser.parse_args()
472480

481+
def check_num_gpus_for_distributed():
    """Fail fast unless more than one GPU is visible for distributed tests.

    Runs ``rocminfo`` through ``grep -c`` and treats the number of matching
    "Name: gfx..." lines as the number of visible GPUs.

    Raises:
        AssertionError: if the count is not greater than 1, or if the
            pipeline produced no parseable count at all.
    """
    # Raw string: the backslash-s is a regex token for grep, not a Python
    # escape. A plain literal triggers an invalid-escape-sequence warning.
    p = subprocess.run(r"rocminfo | grep -cE 'Name:\s+gfx'", shell=True, capture_output=True, text=True)
    try:
        num_gpus_visible = int(p.stdout)
    except ValueError:
        # grep -c prints "0" when nothing matches, so unparseable output
        # means the pipeline itself did not run as expected.
        raise AssertionError(
            "Could not determine the number of visible GPUs from rocminfo "
            f"(stdout: {p.stdout!r}, stderr: {p.stderr!r})"
        ) from None
    # Raise explicitly instead of using `assert`, which is removed under
    # `python -O` and would silently skip this check.
    if num_gpus_visible <= 1:
        raise AssertionError("Number of visible GPUs should be >1 to run distributed unit tests")
485+
473486
def main():
474487
global args
475488
args = parse_args()

0 commit comments

Comments
 (0)