Skip to content

Commit 7b82b7d

Browse files
test sdk-preview for amlarc compute (Azure#1277)
* add convert script * add setup and cleanup and basics jobs workflow * bugfix * debug * debug * format code * bugfix * bugfix * bugfix * watch amlarc tool * remove files * remove amlarc name
1 parent 45d80cf commit 7b82b7d

10 files changed

+793
-288
lines changed

.github/kubernetes-compute/convert.py

+92
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import argparse
2+
import yaml
3+
4+
5+
def convert(input_file, compute_target, instance_type, common_runtime, output_file):
    """Rewrite an AzureML job spec so it targets an AMLARC (Kubernetes) compute.

    Args:
        input_file: Path to the source job YAML file.
        compute_target: Compute name; written into the spec as
            ``azureml:<compute_target>``.
        instance_type: Optional instance type, set only on non-pipeline jobs.
        common_runtime: When truthy, inject
            ``AZUREML_COMPUTE_USE_COMMON_RUNTIME=true`` into the job's (or each
            pipeline step's / the sweep trial's) environment variables.
        output_file: Destination path; when falsy, the input file is rewritten
            in place.
    """
    with open(input_file, "r") as f:
        # safe_load is sufficient for plain job specs and, unlike
        # yaml.load(FullLoader), refuses arbitrary-object YAML tags.
        data = yaml.safe_load(f)

    # Detect the job flavor from the schema URL, with structural fallbacks.
    job_schema = data.get("$schema", "")
    is_pipeline_job = "pipelineJob" in job_schema or "jobs" in data
    is_sweep_job = "sweepJob" in job_schema or data.get("type") == "sweep"

    compute = f"azureml:{compute_target}"

    # Point the job (and, for pipelines, every step) at the requested compute.
    data["compute"] = compute
    if is_pipeline_job:
        settings = data.get("settings", {})
        settings["default_compute"] = compute
        data["settings"] = settings

        for step in data.get("jobs", {}):
            data["jobs"][step]["compute"] = compute

    # Instance type only applies to single (non-pipeline) jobs.
    if not is_pipeline_job and instance_type:
        resources = data.get("resources", {})
        resources["instance_type"] = instance_type
        data["resources"] = resources

    # Opt the job into the common runtime via an environment variable.
    if common_runtime:
        if is_pipeline_job:
            for step in data.get("jobs", {}):
                env = data["jobs"][step].get("environment_variables", {})
                env["AZUREML_COMPUTE_USE_COMMON_RUNTIME"] = "true"
                data["jobs"][step]["environment_variables"] = env
        elif is_sweep_job:
            # Sweep jobs keep their environment on the trial definition.
            env = data["trial"].get("environment_variables", {})
            env["AZUREML_COMPUTE_USE_COMMON_RUNTIME"] = "true"
            data["trial"]["environment_variables"] = env
        else:
            env = data.get("environment_variables", {})
            env["AZUREML_COMPUTE_USE_COMMON_RUNTIME"] = "true"
            data["environment_variables"] = env

    # Write to the output file if specified, otherwise change in place.
    target = output_file if output_file else input_file
    with open(target, "w") as f:
        yaml.dump(data, f)
55+
56+
57+
if __name__ == "__main__":
    # Parse command line arguments and hand off to convert().
    parser = argparse.ArgumentParser(
        description="Convert test case to AMLARC-compatible files."
    )
    parser.add_argument("-i", "--input", required=True, help="Input test case file")
    parser.add_argument(
        "-o",
        "--output",
        required=False,
        # Fixed wording (was: 'if not provides, replace file inplace').
        help="Output AMLARC-compatible file; if not provided, replace the file in place",
    )
    parser.add_argument(
        "-c",
        "--compute-target",
        required=False,
        help='Compute target, default is "githubtest"',
        default="githubtest",
    )
    parser.add_argument("-it", "--instance-type", required=False, help="Instance type")
    parser.add_argument(
        "-cr",
        "--common-runtime",
        required=False,
        default=False,
        action="store_true",
        help='Enable common runtime explicitly, default is "false"',
    )
    args = parser.parse_args()
    convert(
        args.input,
        args.compute_target,
        args.instance_type,
        args.common_runtime,
        args.output,
    )

.github/amlarc-tool.sh .github/kubernetes-compute/tool.sh

+84-39
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22
set -x
33

44
# Global variables
5-
export LOCK_FILE=$0.lock
6-
export RESULT_FILE=amlarc-test-result.txt
5+
export SCRIPT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
6+
export LOCK_FILE=${SCRIPT_DIR}/$0.lock
7+
export RESULT_FILE=${SCRIPT_DIR}/kubernetes-compute-test-result.txt
78
export MAX_RETRIES=60
89
export SLEEP_SECONDS=20
910

@@ -56,15 +57,17 @@ refresh_lock_file(){
5657
echo $(date) > $LOCK_FILE
5758
}
5859

59-
set_release_train(){
60-
if [ "$1" != "" ]; then
61-
AMLARC_RELEASE_TRAIN=$1
62-
else
63-
if (( 10#$(date -d "$(cat $LOCK_FILE)" +"%H") < 12 )); then
64-
AMLARC_RELEASE_TRAIN=experimental
65-
else
66-
AMLARC_RELEASE_TRAIN=staging
67-
fi
60+
# Delete the lock file; -f makes this a no-op when the file is absent.
remove_lock_file(){
    rm -f $LOCK_FILE
}
63+
# Report whether the lock file exists. Prints "true"/"false" on stdout and
# returns 0/1 respectively, so callers can consume either the output or the
# exit status.
check_lock_file(){
    if [ -f $LOCK_FILE ]; then
        echo true
        return 0
    else
        echo false
        return 1
    fi
}
7073

@@ -303,7 +306,7 @@ setup_workspace(){
303306
# setup compute
304307
setup_compute(){
305308

306-
COMPUTE_NS=${COMPUTE_NS:-default}
309+
COMPUTE_NS=${COMPUTE_NS:-default}
307310

308311
az ml compute attach \
309312
--subscription $SUBSCRIPTION \
@@ -349,23 +352,23 @@ delete_extension(){
349352
--cluster-type $CLUSTER_TYPE \
350353
--cluster-name $CLUSTER_NAME \
351354
--name $EXTENSION_NAME \
352-
--yes --no-wait --force
355+
--yes --force
353356
}
354357

355358
# Delete the Arc-connected Kubernetes cluster registration.
# Note: no --no-wait, so this call blocks until the deletion finishes.
delete_arc(){
    az connectedk8s delete \
        --subscription $SUBSCRIPTION \
        --resource-group $RESOURCE_GROUP \
        --name $ARC_CLUSTER_NAME \
        --yes
}
362365

363366
# Delete the AKS cluster used for testing.
# Note: no --no-wait, so this call blocks until the deletion finishes.
delete_aks(){
    az aks delete \
        --subscription $SUBSCRIPTION \
        --resource-group $RESOURCE_GROUP \
        --name $AKS_CLUSTER_NAME \
        --yes
}
370373

371374
delete_compute(){
@@ -425,37 +428,78 @@ delete_workspace(){
425428
# run cli test job
426429
# Submit one CLI job spec and wait for it to finish.
#   $1   - job YAML path (default: examples/training/simple-train-cli/job.yml)
#   $2.. - extra arguments forwarded verbatim to convert.py
# Appends "[JobSubmission]" and "[JobStatus]" lines to $RESULT_FILE, which
# count_result later greps. Returns 1 on a Failed job, 2 on any other
# non-Completed status (after attempting a cancel), 0 on success.
run_cli_job(){
    JOB_YML="${1:-examples/training/simple-train-cli/job.yml}"
    CONVERTER_ARGS="${@:2}"

    SRW=" --subscription $SUBSCRIPTION --resource-group $RESOURCE_GROUP --workspace-name $WORKSPACE "
    # Stream timeout; overridable by the caller via TIMEOUT.
    TIMEOUT="${TIMEOUT:-60m}"

    # preprocess job spec for amlarc compute (rewrites the YAML in place)
    python $SCRIPT_DIR/convert.py -i $JOB_YML $CONVERTER_ARGS

    # submit job
    echo "[JobSubmission] $JOB_YML" | tee -a $RESULT_FILE
    run_id=$(az ml job create $SRW -f $JOB_YML --query name -o tsv)

    # stream job logs
    timeout ${TIMEOUT} az ml job stream $SRW -n $run_id

    # show job status
    status=$(az ml job show $SRW -n $run_id --query status -o tsv)
    echo "[JobStatus] $JOB_YML ${status}" | tee -a $RESULT_FILE

    if [[ $status == "Failed" ]]; then
        return 1
    elif [[ $status != "Completed" ]]; then
        # Still running (e.g. stream timed out) or in an unexpected state:
        # try to cancel so the compute is freed, then report "unknown".
        timeout 5m az ml job cancel $SRW -n $run_id
        return 2
    fi
}
455457

458+
# Scan GitHub workflow files and collect the job YAML paths they run.
#   $1 - output list file (default: job-list.txt)
#   $2 - grep pattern selecting workflow filenames (default: cli-jobs-basics)
#   $3 - grep pattern excluding workflow filenames (default: java)
# For each selected workflow, walks jobs.build.steps with shyaml, takes every
# step named exactly "run job", and records <working-directory>/<last word of
# the run command> as the job spec path, one per line in the output file.
# NOTE(review): "OUPUT_FILE" is misspelled (missing T) but used consistently;
# kept as-is since renaming would change behavior of this reconstruction.
collect_jobs_from_workflows(){
    OUPUT_FILE=${1:-job-list.txt}
    SELECTOR=${2:-cli-jobs-basics}
    FILTER=${3:-java}
    WORKFLOWS_DIR=".github/workflows"

    echo "WORKFLOWS_DIR: $WORKFLOWS_DIR, OUPUT_FILE: $OUPUT_FILE, FILTER: $FILTER"

    # Start from an empty (but existing) output file.
    rm -f $OUPUT_FILE
    touch $OUPUT_FILE

    for workflow in $(ls -a $WORKFLOWS_DIR | grep -E "$SELECTOR" | grep -E -v "$FILTER" ); do

        workflow=$WORKFLOWS_DIR/$workflow
        echo "Check workflow: $workflow"

        job_yml=""
        # shyaml get-length gives the step count; steps are indexed 0..count-1.
        stepcount=$(cat $workflow | shyaml get-length jobs.build.steps)
        stepcount=$(($stepcount - 1))
        for i in $(seq 0 $stepcount); do
            name=$(cat $workflow| shyaml get-value jobs.build.steps.$i.name)
            if [ "$name" != "run job" ]; then
                continue
            fi

            run=$(cat $workflow| shyaml get-value jobs.build.steps.$i.run)
            wkdir=$(cat $workflow| shyaml get-value jobs.build.steps.$i.working-directory)
            echo "Found: run: $run wkdir: $wkdir"

            # The job spec is assumed to be the last token of the run command
            # (e.g. "az ml job create -f job.yml"); xargs trims whitespace.
            job_yml=$wkdir/$(echo $run | awk '{print $NF}' | xargs)
            echo "${job_yml}" | tee -a $OUPUT_FILE
        done

        if [ "$job_yml" == "" ]; then
            echo "Warning: no job yml found in workflow: $workflow"
        fi

    done

    echo "Found $(wc -l $OUPUT_FILE) jobs:"
    cat $OUPUT_FILE
}
500+
456501
generate_workspace_config(){
457502
mkdir -p .azureml
458-
459503
cat << EOF > .azureml/config.json
460504
{
461505
"subscription_id": "$SUBSCRIPTION",
@@ -474,7 +518,6 @@ install_jupyter_dependency(){
474518
pip list || true
475519
}
476520

477-
478521
# run jupyter test
479522
run_jupyter_test(){
480523
JOB_SPEC="${1:-examples/training/simple-train-sdk/img-classification-training.ipynb}"
@@ -526,14 +569,15 @@ count_result(){
526569

527570
MIN_SUCCESS_NUM=${MIN_SUCCESS_NUM:--1}
528571

572+
[ ! -f $RESULT_FILE ] && touch $RESULT_FILE
573+
529574
echo "RESULT:"
530575
cat $RESULT_FILE
531576

532-
[ ! -f $RESULT_FILE ] && touch $RESULT_FILE
533-
534577
total=$(grep -c "\[JobSubmission\]" $RESULT_FILE)
535578
success=$(grep "\[JobStatus\]" $RESULT_FILE | grep -ic completed)
536579
unhealthy=$(( $total - $success ))
580+
537581
echo "Total: ${total}, Success: ${success}, Unhealthy: ${unhealthy}, MinSuccessNum: ${MIN_SUCCESS_NUM}."
538582

539583
if (( 10#${unhealthy} > 0 )) ; then
@@ -609,6 +653,8 @@ download_icm_cert(){
609653

610654
file_icm(){
611655

656+
set -e
657+
612658
ICM_XML_TEMPLATE='<?xml version="1.0" encoding="UTF-8"?>
613659
<s:Envelope xmlns:s="http://www.w3.org/2003/05/soap-envelope" xmlns:a="http://www.w3.org/2005/08/addressing">
614660
<s:Header>
@@ -736,7 +782,6 @@ ICM_XML_TEMPLATE='<?xml version="1.0" encoding="UTF-8"?>
736782
}
737783

738784

739-
740785
help(){
741786
echo "All functions:"
742787
declare -F

0 commit comments

Comments
 (0)