@@ -217,13 +217,12 @@ def summarize_xml_files(path, workflow_name):
217
217
return res
218
218
219
219
def run_command_and_capture_output (cmd ):
220
- if os .environ ['TEST_CONFIG' ] == 'distributed' :
221
- p = subprocess .run ("rocminfo | grep -cE 'Name:\s+gfx'" , shell = True , capture_output = True , text = True )
222
- num_gpus_visible = int (p .stdout )
223
- assert num_gpus_visible > 1 , "Number of visible GPUs should be >1 to run TEST_CONFIG=distributed"
224
220
try :
225
221
print (f"Running command '{ cmd } '" )
226
222
with open (CONSOLIDATED_LOG_FILE_PATH , "a+" ) as output_file :
223
+ print (f"========================================" , file = output_file , flush = True )
224
+ print (f"[RUN_PYTORCH_UNIT_TESTS] Running command '{ cmd } '" , file = output_file , flush = True ) # send to consolidated file as well
225
+ print (f"========================================" , file = output_file , flush = True )
227
226
p = subprocess .run (cmd , shell = True , stdout = output_file , stderr = STDOUT , text = True )
228
227
except CalledProcessError as e :
229
228
print (f"ERROR: Cmd { cmd } failed with return code: { e .returncode } !" )
@@ -261,7 +260,7 @@ def run_priority_tests(workflow_name, test_run_test_path, overall_logs_path_curr
261
260
copied_logs_path = overall_logs_path_current_run + "default_xml_results_priority_tests/"
262
261
# use run_test.py for tests execution
263
262
default_priority_test_suites = " " .join (DEFAULT_CORE_TESTS )
264
- command = "python " + test_run_test_path + " --include " + default_priority_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose"
263
+ command = "python3 " + test_run_test_path + " --include " + default_priority_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose"
265
264
run_command_and_capture_output (command )
266
265
del os .environ ['HIP_VISIBLE_DEVICES' ]
267
266
elif workflow_name == "distributed" :
@@ -270,7 +269,7 @@ def run_priority_tests(workflow_name, test_run_test_path, overall_logs_path_curr
270
269
copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_priority_tests/"
271
270
# use run_test.py for tests execution
272
271
distributed_priority_test_suites = " " .join (DISTRIBUTED_CORE_TESTS )
273
- command = "python " + test_run_test_path + " --include " + distributed_priority_test_suites + " --distributed-tests --verbose"
272
+ command = "python3 " + test_run_test_path + " --include " + distributed_priority_test_suites + " --distributed-tests --verbose"
274
273
run_command_and_capture_output (command )
275
274
del os .environ ['HIP_VISIBLE_DEVICES' ]
276
275
copied_logs_path_destination = shutil .copytree (test_reports_src , copied_logs_path )
@@ -290,7 +289,7 @@ def run_selected_tests(workflow_name, test_run_test_path, overall_logs_path_curr
290
289
copied_logs_path = overall_logs_path_current_run + "default_xml_results_selected_tests/"
291
290
# use run_test.py for tests execution
292
291
default_selected_test_suites = " " .join (selected_list )
293
- command = "python " + test_run_test_path + " --include " + default_selected_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose"
292
+ command = "python3 " + test_run_test_path + " --include " + default_selected_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose"
294
293
run_command_and_capture_output (command )
295
294
del os .environ ['HIP_VISIBLE_DEVICES' ]
296
295
elif workflow_name == "distributed" :
@@ -299,7 +298,7 @@ def run_selected_tests(workflow_name, test_run_test_path, overall_logs_path_curr
299
298
copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_selected_tests/"
300
299
# use run_test.py for tests execution
301
300
distributed_selected_test_suites = " " .join (selected_list )
302
- command = "python " + test_run_test_path + " --include " + distributed_selected_test_suites + " --distributed-tests --verbose"
301
+ command = "python3 " + test_run_test_path + " --include " + distributed_selected_test_suites + " --distributed-tests --verbose"
303
302
run_command_and_capture_output (command )
304
303
del os .environ ['HIP_VISIBLE_DEVICES' ]
305
304
elif workflow_name == "inductor" :
@@ -316,11 +315,11 @@ def run_selected_tests(workflow_name, test_run_test_path, overall_logs_path_curr
316
315
non_inductor_selected_test_suites += " "
317
316
if inductor_selected_test_suites != "" :
318
317
inductor_selected_test_suites = inductor_selected_test_suites [:- 1 ]
319
- command = "python " + test_run_test_path + " --include " + inductor_selected_test_suites + " --verbose"
318
+ command = "python3 " + test_run_test_path + " --include " + inductor_selected_test_suites + " --verbose"
320
319
run_command_and_capture_output (command )
321
320
if non_inductor_selected_test_suites != "" :
322
321
non_inductor_selected_test_suites = non_inductor_selected_test_suites [:- 1 ]
323
- command = "python " + test_run_test_path + " --inductor --include " + non_inductor_selected_test_suites + " --verbose"
322
+ command = "python3 " + test_run_test_path + " --inductor --include " + non_inductor_selected_test_suites + " --verbose"
324
323
run_command_and_capture_output (command )
325
324
copied_logs_path_destination = shutil .copytree (test_reports_src , copied_logs_path )
326
325
selected_results_dict = summarize_xml_files (copied_logs_path_destination , workflow_name )
@@ -371,10 +370,19 @@ def run_test_and_summarize_results(
371
370
global CONSOLIDATED_LOG_FILE_PATH
372
371
CONSOLIDATED_LOG_FILE_PATH = overall_logs_path_current_run + CONSOLIDATED_LOG_FILE_NAME
373
372
373
+ # Check multi gpu availability if distributed tests are enabled
374
+ if ("distributed" in args .test_config ) or len (args .distributed_list ) != 0 :
375
+ check_num_gpus_for_distributed ();
376
+
377
+ # Install test requirements
378
+ command = "pip3 install -r requirements.txt && pip3 install -r .ci/docker/requirements-ci.txt"
379
+ run_command_and_capture_output (command )
380
+
374
381
# Run entire tests for each workflow
375
382
if not priority_tests and not default_list and not distributed_list and not inductor_list :
376
383
# run entire tests for default, distributed and inductor workflows → use test.sh
377
384
if not test_config :
385
+ check_num_gpus_for_distributed ();
378
386
# default test process
379
387
res_default_all = run_entire_tests ("default" , test_shell_path , overall_logs_path_current_run , test_reports_src )
380
388
res_all_tests_dict ["default" ] = res_default_all
@@ -465,11 +473,16 @@ def parse_args():
465
473
" test_file_and_status(file_name='test_file_name_1', status='SKIPPED'): {}, \n "
466
474
" test_file_and_status(file_name='test_file_name_1', status='STATISTICS'): {} \n "
467
475
"}}\n " )
468
- parser .add_argument ('--example_usages' , type = str , help = "RUN ALL TESTS: python run_pytorch_unit_tests.py \n "
469
- "RUN PRIORITY TESTS: python run_pytorch_unit_tests.py --test_config distributed --priority_test \n "
470
- "RUN SELECTED TESTS: python run_pytorch_unit_tests.py --default_list test_weak test_dlpack --inductor_list inductor/test_torchinductor" )
476
+ parser .add_argument ('--example_usages' , type = str , help = "RUN ALL TESTS: python3 run_pytorch_unit_tests.py \n "
477
+ "RUN PRIORITY TESTS: python3 run_pytorch_unit_tests.py --test_config distributed --priority_test \n "
478
+ "RUN SELECTED TESTS: python3 run_pytorch_unit_tests.py --default_list test_weak test_dlpack --inductor_list inductor/test_torchinductor" )
471
479
return parser .parse_args ()
472
480
481
def check_num_gpus_for_distributed():
    """Abort unless more than one GPU is visible for distributed unit tests.

    Runs ``rocminfo`` through the shell and uses ``grep -c`` to count the
    output lines that match the ``Name: gfx...`` pattern below; that count is
    taken as the number of visible GPUs.

    Raises:
        AssertionError: if the count is not greater than 1, including the
            case where the pipeline produced no parsable count.
    """
    # Raw string: "\s" in an ordinary literal is an invalid escape sequence
    # (DeprecationWarning, and SyntaxWarning on newer Python versions).
    p = subprocess.run(
        r"rocminfo | grep -cE 'Name:\s+gfx'",
        shell=True,
        capture_output=True,
        text=True,
    )
    count_text = (p.stdout or "").strip()
    # An empty or non-numeric result is treated as "no GPUs found" so the
    # caller gets the descriptive error below instead of a bare ValueError.
    num_gpus_visible = int(count_text) if count_text.isdigit() else 0
    # Raised explicitly rather than via `assert`, so the check still runs
    # when Python is started with -O; the exception type is unchanged.
    if num_gpus_visible <= 1:
        raise AssertionError(
            "Number of visible GPUs should be >1 to run distributed unit tests"
        )
485
+
473
486
def main ():
474
487
global args
475
488
args = parse_args ()
0 commit comments