2
2
set -x

# Global variables
# Absolute directory of this script. Assigned separately from `export` so a
# failing `cd`/`pwd` is not masked by the exit status of `export` itself.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
export SCRIPT_DIR
# Lock and result files live next to the script. ${0##*/} drops any directory
# part of $0 so that invoking the script through a path (e.g. "dir/tool.sh" or
# an absolute path) does not produce a nested, non-existent lock-file path.
export LOCK_FILE="${SCRIPT_DIR}/${0##*/}.lock"
export RESULT_FILE="${SCRIPT_DIR}/kubernetes-compute-test-result.txt"
export MAX_RETRIES=60
export SLEEP_SECONDS=20
9
10
@@ -56,15 +57,17 @@ refresh_lock_file(){
56
57
echo $( date) > $LOCK_FILE
57
58
}
58
59
59
- set_release_train (){
60
- if [ " $1 " != " " ]; then
61
- AMLARC_RELEASE_TRAIN=$1
62
- else
63
- if (( 10 #$(date - d "$(cat $LOCK_FILE )" + "% H") < 12 )) ; then
64
- AMLARC_RELEASE_TRAIN=experimental
65
- else
66
- AMLARC_RELEASE_TRAIN=staging
67
- fi
60
# Remove the lock file if it exists.
# Globals (read): LOCK_FILE - path of the lock file
# Returns: exit status of rm (-f makes a missing file a success)
remove_lock_file() {
    # Quoted and preceded by "--" so paths containing spaces, glob characters
    # or a leading dash are removed as a single literal file name.
    rm -f -- "$LOCK_FILE"
}
63
+
64
# Report whether the lock file currently exists as a regular file.
# Globals (read): LOCK_FILE - path of the lock file
# Outputs: "true" or "false" on stdout
# Returns: 0 when the lock file exists, 1 otherwise
check_lock_file() {
    # [[ ]] with a quoted operand: an empty or unset LOCK_FILE must not turn
    # the check into the one-argument test `[ -f ]`, which is always true.
    if [[ -f "${LOCK_FILE:-}" ]]; then
        echo true
        return 0
    else
        echo false
        return 1
    fi
}
70
73
@@ -303,7 +306,7 @@ setup_workspace(){
303
306
# setup compute
304
307
setup_compute (){
305
308
306
- COMPUTE_NS=${COMPUTE_NS:- default}
309
+ COMPUTE_NS=${COMPUTE_NS:- default}
307
310
308
311
az ml compute attach \
309
312
--subscription $SUBSCRIPTION \
@@ -349,23 +352,23 @@ delete_extension(){
349
352
--cluster-type $CLUSTER_TYPE \
350
353
--cluster-name $CLUSTER_NAME \
351
354
--name $EXTENSION_NAME \
352
- --yes --no-wait -- force
355
+ --yes --force
353
356
}
354
357
355
358
# Delete the Azure Arc connected-cluster resource via `az connectedk8s delete`.
# Globals (read): SUBSCRIPTION, RESOURCE_GROUP, ARC_CLUSTER_NAME
# Returns: exit status of the az command
delete_arc() {
    # Every value is quoted so that an empty variable is passed as an explicit
    # empty argument instead of disappearing and letting the next flag be
    # consumed as its value. No --no-wait is passed.
    az connectedk8s delete \
        --subscription "$SUBSCRIPTION" \
        --resource-group "$RESOURCE_GROUP" \
        --name "$ARC_CLUSTER_NAME" \
        --yes
}
362
365
363
366
# Delete the AKS cluster via `az aks delete`.
# Globals (read): SUBSCRIPTION, RESOURCE_GROUP, AKS_CLUSTER_NAME
# Returns: exit status of the az command
delete_aks() {
    # Every value is quoted so that an empty variable is passed as an explicit
    # empty argument instead of disappearing and letting the next flag be
    # consumed as its value. No --no-wait is passed.
    az aks delete \
        --subscription "$SUBSCRIPTION" \
        --resource-group "$RESOURCE_GROUP" \
        --name "$AKS_CLUSTER_NAME" \
        --yes
}
370
373
371
374
delete_compute (){
@@ -425,37 +428,78 @@ delete_workspace(){
425
428
# run cli test job
426
429
# Convert, submit and monitor one CLI job, appending its outcome to RESULT_FILE.
# Globals (read):    SCRIPT_DIR, SUBSCRIPTION, RESOURCE_GROUP, WORKSPACE,
#                    RESULT_FILE, TIMEOUT (optional, defaults to 60m)
# Globals (written): JOB_YML, CONVERTER_ARGS, SRW, TIMEOUT, run_id, status
#                    (kept global as before; other code may read them - TODO confirm)
# Arguments: $1  - job spec path (default examples/training/simple-train-cli/job.yml)
#            $2+ - extra arguments forwarded to convert.py
# Returns:   0 when the job status is "Completed", 1 when "Failed",
#            2 for any other status or when submission yields no job name
run_cli_job() {
    JOB_YML="${1:-examples/training/simple-train-cli/job.yml}"
    CONVERTER_ARGS="${*:2}"

    SRW="--subscription $SUBSCRIPTION --resource-group $RESOURCE_GROUP --workspace-name $WORKSPACE"
    TIMEOUT="${TIMEOUT:-60m}"

    # Same scope arguments as SRW, held in an array so each value reaches az
    # as exactly one word.
    local -a srw_args=(
        --subscription "$SUBSCRIPTION"
        --resource-group "$RESOURCE_GROUP"
        --workspace-name "$WORKSPACE"
    )

    # preprocess job spec for amlarc compute
    # CONVERTER_ARGS is intentionally unquoted: it carries several
    # space-separated converter flags that must be split into words.
    # shellcheck disable=SC2086
    python "$SCRIPT_DIR/convert.py" -i "$JOB_YML" $CONVERTER_ARGS

    # submit job
    echo "[JobSubmission] $JOB_YML" | tee -a "$RESULT_FILE"
    run_id=$(az ml job create "${srw_args[@]}" -f "$JOB_YML" --query name -o tsv)

    # Without a job name there is nothing to stream, show or cancel; record the
    # failure and return the same "not completed" code as any other
    # non-terminal outcome.
    if [[ -z "$run_id" ]]; then
        status=""
        echo "[JobStatus] $JOB_YML SubmissionFailed" | tee -a "$RESULT_FILE"
        return 2
    fi

    # stream job logs
    timeout "$TIMEOUT" az ml job stream "${srw_args[@]}" -n "$run_id"

    # show job status
    status=$(az ml job show "${srw_args[@]}" -n "$run_id" --query status -o tsv)
    echo "[JobStatus] $JOB_YML ${status}" | tee -a "$RESULT_FILE"

    if [[ "$status" == "Failed" ]]; then
        return 1
    elif [[ "$status" != "Completed" ]]; then
        # Neither Completed nor Failed: cancel the job before returning.
        timeout 5m az ml job cancel "${srw_args[@]}" -n "$run_id"
        return 2
    fi
}
455
457
458
+ collect_jobs_from_workflows (){
459
+ OUPUT_FILE=${1:- job-list.txt}
460
+ SELECTOR=${2:- cli-jobs-basics}
461
+ FILTER=${3:- java}
462
+ WORKFLOWS_DIR=" .github/workflows"
463
+
464
+ echo " WORKFLOWS_DIR: $WORKFLOWS_DIR , OUPUT_FILE: $OUPUT_FILE , FILTER: $FILTER "
465
+
466
+ rm -f $OUPUT_FILE
467
+ touch $OUPUT_FILE
468
+
469
+ for workflow in $( ls -a $WORKFLOWS_DIR | grep -E " $SELECTOR " | grep -E -v " $FILTER " ) ; do
470
+
471
+ workflow=$WORKFLOWS_DIR /$workflow
472
+ echo " Check workflow: $workflow "
473
+
474
+ job_yml=" "
475
+ stepcount=$( cat $workflow | shyaml get-length jobs.build.steps)
476
+ stepcount=$(( $stepcount - 1 ))
477
+ for i in $( seq 0 $stepcount ) ; do
478
+ name=$( cat $workflow | shyaml get-value jobs.build.steps.$i .name)
479
+ if [ " $name " != " run job" ]; then
480
+ continue
481
+ fi
482
+
483
+ run=$( cat $workflow | shyaml get-value jobs.build.steps.$i .run)
484
+ wkdir=$( cat $workflow | shyaml get-value jobs.build.steps.$i .working-directory)
485
+ echo " Found: run: $run wkdir: $wkdir "
486
+
487
+ job_yml=$wkdir /$( echo $run | awk ' {print $NF}' | xargs)
488
+ echo " ${job_yml} " | tee -a $OUPUT_FILE
489
+ done
490
+
491
+ if [ " $job_yml " == " " ]; then
492
+ echo " Warning: no job yml found in workflow: $workflow "
493
+ fi
494
+
495
+ done
496
+
497
+ echo " Found $( wc -l $OUPUT_FILE ) jobs:"
498
+ cat $OUPUT_FILE
499
+ }
500
+
456
501
generate_workspace_config (){
457
502
mkdir -p .azureml
458
-
459
503
cat << EOF > .azureml/config.json
460
504
{
461
505
"subscription_id": "$SUBSCRIPTION ",
@@ -474,7 +518,6 @@ install_jupyter_dependency(){
474
518
pip list || true
475
519
}
476
520
477
-
478
521
# run jupyter test
479
522
run_jupyter_test (){
480
523
JOB_SPEC=" ${1:- examples/ training/ simple-train-sdk/ img-classification-training.ipynb} "
@@ -526,14 +569,15 @@ count_result(){
526
569
527
570
MIN_SUCCESS_NUM=${MIN_SUCCESS_NUM:- -1}
528
571
572
+ [ ! -f $RESULT_FILE ] && touch $RESULT_FILE
573
+
529
574
echo " RESULT:"
530
575
cat $RESULT_FILE
531
576
532
- [ ! -f $RESULT_FILE ] && touch $RESULT_FILE
533
-
534
577
total=$( grep -c " \[JobSubmission\]" $RESULT_FILE )
535
578
success=$( grep " \[JobStatus\]" $RESULT_FILE | grep -ic completed)
536
579
unhealthy=$(( $total - $success ))
580
+
537
581
echo " Total: ${total} , Success: ${success} , Unhealthy: ${unhealthy} , MinSuccessNum: ${MIN_SUCCESS_NUM} ."
538
582
539
583
if (( 10 #${unhealthy} > 0 )) ; then
@@ -609,6 +653,8 @@ download_icm_cert(){
609
653
610
654
file_icm (){
611
655
656
+ set -e
657
+
612
658
ICM_XML_TEMPLATE=' <?xml version="1.0" encoding="UTF-8"?>
613
659
<s:Envelope xmlns:s="http://www.w3.org/2003/05/soap-envelope" xmlns:a="http://www.w3.org/2005/08/addressing">
614
660
<s:Header>
@@ -736,7 +782,6 @@ ICM_XML_TEMPLATE='<?xml version="1.0" encoding="UTF-8"?>
736
782
}
737
783
738
784
739
-
740
785
help (){
741
786
echo " All functions:"
742
787
declare -F
0 commit comments