Skip to content

Commit a3cb19d

Browse files
Zetia/add single step workflow (Azure#1297)
* test with main * bugfix * print command * bugfix * print command * get kubeconfig before setup instance type * bugfix * bugfix * add cluster type * setup bash flag * bugfix * add cli-jobs-single-step workflow * format code * change to sdk-preview branch
1 parent 7b82b7d commit a3cb19d

10 files changed

+153
-26
lines changed

.github/kubernetes-compute/convert.py

+9-12
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,15 @@ def convert(input_file, compute_target, instance_type, common_runtime, output_fi
1414
is_sweep_job = True
1515

1616
# change compute target
17-
data["compute"] = "azureml:%s" % compute_target
18-
if is_pipeline_job:
19-
settings = data.get("settings", {})
20-
settings["default_compute"] = "azureml:%s" % compute_target
21-
data["settings"] = settings
17+
if compute_target:
18+
data["compute"] = "azureml:%s" % compute_target
19+
if is_pipeline_job:
20+
settings = data.get("settings", {})
21+
settings["default_compute"] = "azureml:%s" % compute_target
22+
data["settings"] = settings
2223

23-
for step in data.get("jobs", {}):
24-
data["jobs"][step]["compute"] = "azureml:%s" % compute_target
24+
for step in data.get("jobs", {}):
25+
data["jobs"][step]["compute"] = "azureml:%s" % compute_target
2526

2627
# set instance type
2728
if not is_pipeline_job and instance_type:
@@ -67,11 +68,7 @@ def convert(input_file, compute_target, instance_type, common_runtime, output_fi
6768
help="Output AMLARC-compatible file, if not provides, " "replace file inplace",
6869
)
6970
parser.add_argument(
70-
"-c",
71-
"--compute-target",
72-
required=False,
73-
help='Compute target, default is "githubtest"',
74-
default="githubtest",
71+
"-c", "--compute-target", required=False, help="Compute target",
7572
)
7673
parser.add_argument("-it", "--instance-type", required=False, help="Instance type")
7774
parser.add_argument(

.github/kubernetes-compute/tool.sh

+9-4
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ set -x
33

44
# Global variables
55
export SCRIPT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
6-
export LOCK_FILE=${SCRIPT_DIR}/$0.lock
6+
export LOCK_FILE=${SCRIPT_DIR}/"$(basename ${BASH_SOURCE[0]})".lock
77
export RESULT_FILE=${SCRIPT_DIR}/kubernetes-compute-test-result.txt
88
export MAX_RETRIES=60
99
export SLEEP_SECONDS=20
@@ -440,6 +440,11 @@ run_cli_job(){
440440
echo "[JobSubmission] $JOB_YML" | tee -a $RESULT_FILE
441441
run_id=$(az ml job create $SRW -f $JOB_YML --query name -o tsv)
442442

443+
if [[ "$run_id" == "" ]]; then
444+
echo "[JobStatus] $JOB_YML SubmissionFailed" | tee -a $RESULT_FILE
445+
return 1
446+
fi
447+
443448
# stream job logs
444449
timeout ${TIMEOUT} az ml job stream $SRW -n $run_id
445450

@@ -448,10 +453,10 @@ run_cli_job(){
448453
echo "[JobStatus] $JOB_YML ${status}" | tee -a $RESULT_FILE
449454

450455
if [[ $status == "Failed" ]]; then
451-
return 1
456+
return 2
452457
elif [[ $status != "Completed" ]]; then
453458
timeout 5m az ml job cancel $SRW -n $run_id
454-
return 2
459+
return 3
455460
fi
456461
}
457462

@@ -494,7 +499,7 @@ collect_jobs_from_workflows(){
494499

495500
done
496501

497-
echo "Found $(wc -l $OUPUT_FILE) jobs:"
502+
echo "Found $(cat $OUPUT_FILE | wc -l) jobs:"
498503
cat $OUPUT_FILE
499504
}
500505

.github/workflows/kubernetes-compute-cpu-resources-cleanup.yml

+5
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ jobs:
3131
VM_SKU: Standard_D4s_v3
3232
MIN_COUNT: 5
3333
COMPUTE: "cpu-cluster"
34+
CLUSTER_TYPE: managedClusters
3435

3536
DELETE_ENDPOINTS: ${{ github.event.inputs.DELETE_ENDPOINTS }}
3637
CLEANUP_WORKSPACE: ${{ github.event.inputs.CLEANUP_WORKSPACE }}
@@ -59,13 +60,15 @@ jobs:
5960
- name: delete_endpoints
6061
if: ${{ always() }}
6162
run: |
63+
set +e
6264
if [ "${DELETE_ENDPOINTS}" == "true" ] ; then
6365
bash .github/kubernetes-compute/tool.sh delete_endpoints
6466
fi
6567
timeout-minutes: 60
6668
- name: delete_workspace
6769
if: ${{ always() }}
6870
run: |
71+
set +e
6972
if [ "${CLEANUP_WORKSPACE}" == "true" ] ; then
7073
bash .github/kubernetes-compute/tool.sh delete_compute
7174
bash .github/kubernetes-compute/tool.sh delete_workspace
@@ -74,6 +77,7 @@ jobs:
7477
- name: delete_extension
7578
if: ${{ always() }}
7679
run: |
80+
set +e
7781
if [ "${UNINSTALL_EXTENSION}" == "true" ] ; then
7882
bash .github/kubernetes-compute/tool.sh delete_compute
7983
bash .github/kubernetes-compute/tool.sh delete_extension
@@ -82,6 +86,7 @@ jobs:
8286
- name: delete_cluster
8387
if: ${{ always() }}
8488
run: |
89+
set +e
8590
if [ "${CLEANUP_CLUSTER}" == "true" ] ; then
8691
bash .github/kubernetes-compute/tool.sh delete_aks
8792
fi

.github/workflows/kubernetes-compute-cpu-resources-setup.yml

+4-2
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,14 @@ jobs:
9292
timeout-minutes: 30
9393
- name: setup_instance_type
9494
run: |
95+
bash .github/kubernetes-compute/tool.sh get_kubeconfig
9596
bash .github/kubernetes-compute/tool.sh setup_instance_type defaultinstancetype 2 4Gi
9697
bash .github/kubernetes-compute/tool.sh setup_instance_type cpu 2 4Gi
9798
timeout-minutes: 30
9899

99100
- name: setup_dataset
100101
run: |
102+
set -x
101103
if [ "${SETUP_DATASET}" == "true" ] ; then
102104
wget https://azcopyvnext.azureedge.net/release20220511/azcopy_linux_amd64_10.15.0.tar.gz
103105
tar zxf azcopy_linux_amd64_10.15.0.tar.gz
@@ -112,13 +114,13 @@ jobs:
112114

113115
- name: remove_lock_file
114116
run: |
115-
run: bash .github/kubernetes-compute/tool.sh remove_lock_file
117+
bash .github/kubernetes-compute/tool.sh remove_lock_file
116118
timeout-minutes: 30
117119

118120
- name: file_icm
119121
if: ${{ always() }}
120122
run: |
121-
if [ "$(bash .github/kubernetes-compute/tool.sh check_lock_file) == "true" ] && [ "${{ github.event.inputs.FILE_TICKET }}" != "false" ] ; then
123+
if [ "$(bash .github/kubernetes-compute/tool.sh check_lock_file)" == "true" ] && [ "${{ github.event.inputs.FILE_TICKET }}" != "false" ] ; then
122124
# download certificates
123125
export ICM_HOST_NAME=ICM-HOST-AML-EXAMPLES
124126
export ICM_CONNECTOR_ID_NAME=ICM-CONNECTOR-ID-AML-EXAMPLES

.github/workflows/kubernetes-compute-gpu-resources-cleanup.yml

+5
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ jobs:
3030
WORKSPACE: amlarc-githubtest-ws
3131
VM_SKU: STANDARD_NC12
3232
COMPUTE: "gpu-cluster"
33+
CLUSTER_TYPE: managedClusters
3334

3435
DELETE_ENDPOINTS: ${{ github.event.inputs.DELETE_ENDPOINTS }}
3536
CLEANUP_WORKSPACE: ${{ github.event.inputs.CLEANUP_WORKSPACE }}
@@ -58,13 +59,15 @@ jobs:
5859
- name: delete_endpoints
5960
if: ${{ always() }}
6061
run: |
62+
set +e
6163
if [ "${DELETE_ENDPOINTS}" == "true" ] ; then
6264
bash .github/kubernetes-compute/tool.sh delete_endpoints
6365
fi
6466
timeout-minutes: 60
6567
- name: delete_workspace
6668
if: ${{ always() }}
6769
run: |
70+
set +e
6871
if [ "${CLEANUP_WORKSPACE}" == "true" ] ; then
6972
bash .github/kubernetes-compute/tool.sh delete_compute
7073
bash .github/kubernetes-compute/tool.sh delete_workspace
@@ -73,6 +76,7 @@ jobs:
7376
- name: delete_extension
7477
if: ${{ always() }}
7578
run: |
79+
set +e
7680
if [ "${UNINSTALL_EXTENSION}" == "true" ] ; then
7781
bash .github/kubernetes-compute/tool.sh delete_compute
7882
bash .github/kubernetes-compute/tool.sh delete_extension
@@ -81,6 +85,7 @@ jobs:
8185
- name: delete_cluster
8286
if: ${{ always() }}
8387
run: |
88+
set +e
8489
if [ "${CLEANUP_CLUSTER}" == "true" ] ; then
8590
bash .github/kubernetes-compute/tool.sh delete_aks
8691
fi

.github/workflows/kubernetes-compute-gpu-resources-setup.yml

+4-2
Original file line numberDiff line numberDiff line change
@@ -91,13 +91,15 @@ jobs:
9191
timeout-minutes: 30
9292
- name: setup_instance_type
9393
run: |
94+
bash .github/kubernetes-compute/tool.sh get_kubeconfig
9495
bash .github/kubernetes-compute/tool.sh setup_instance_type defaultinstancetype 4 32Gi 2
9596
bash .github/kubernetes-compute/tool.sh setup_instance_type cpu 4 32Gi
9697
bash .github/kubernetes-compute/tool.sh setup_instance_type gpu 4 32Gi 2
9798
timeout-minutes: 30
9899

99100
- name: setup_dataset
100101
run: |
102+
set -x
101103
if [ "${SETUP_DATASET}" == "true" ] ; then
102104
wget https://azcopyvnext.azureedge.net/release20220511/azcopy_linux_amd64_10.15.0.tar.gz
103105
tar zxf azcopy_linux_amd64_10.15.0.tar.gz
@@ -112,13 +114,13 @@ jobs:
112114

113115
- name: remove_lock_file
114116
run: |
115-
run: bash .github/kubernetes-compute/tool.sh remove_lock_file
117+
bash .github/kubernetes-compute/tool.sh remove_lock_file
116118
timeout-minutes: 30
117119

118120
- name: file_icm
119121
if: ${{ always() }}
120122
run: |
121-
if [ "$(bash .github/kubernetes-compute/tool.sh check_lock_file) == "true" ] && [ "${{ github.event.inputs.FILE_TICKET }}" != "false" ] ; then
123+
if [ "$(bash .github/kubernetes-compute/tool.sh check_lock_file)" == "true" ] && [ "${{ github.event.inputs.FILE_TICKET }}" != "false" ] ; then
122124
# download certificates
123125
export ICM_HOST_NAME=ICM-HOST-AML-EXAMPLES
124126
export ICM_CONNECTOR_ID_NAME=ICM-CONNECTOR-ID-AML-EXAMPLES

.github/workflows/kubernetes-compute-training-cli-jobs-basics.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ jobs:
3434

3535
JOB_SELECTOR: ${{ github.event.inputs.JOB_SELECTOR }}
3636
JOB_FILTER: ${{ github.event.inputs.JOB_FILTER }}
37-
JOB_LIST_FILE: amlarc-training-cli-jobs-basics.txt
37+
JOB_LIST_FILE: kubernetes-compute-training-cli-jobs-basics.txt
3838
TIMEOUT: ${{ github.event.inputs.TIMEOUT }}
3939

4040
MIN_SUCCESS_NUM: 1
@@ -76,8 +76,8 @@ jobs:
7676
for job in $(cat $JOB_LIST_FILE); do
7777
if [[ "$job" = *"yml" ]]; then
7878
echo "Run job: $job"
79-
bash .github/kubernetes-compute/tool.sh run_cli_job $job &
80-
sleep 1
79+
bash .github/kubernetes-compute/tool.sh run_cli_job $job -cr &
80+
sleep 60
8181
else
8282
echo "Found invalid job: $job"
8383
fi
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
name: kubernetes-compute-training-cli-jobs-single-step
2+
on:
3+
schedule:
4+
- cron: "0 23 * * *"
5+
workflow_dispatch:
6+
inputs:
7+
JOB_SELECTOR:
8+
description: 'Job selector used with grep command to select job workflows'
9+
required: false
10+
default: '^cli-jobs-single-step.*yml$'
11+
JOB_FILTER:
12+
description: 'Job filter used with grep command to filter out job workflows'
13+
required: false
14+
default: 'spark|java'
15+
TIMEOUT:
16+
description: 'Timeout of a single job'
17+
required: false
18+
default: '120m'
19+
FILE_TICKET:
20+
description: 'Whether to file icm ticket: true or false'
21+
required: false
22+
default: 'false'
23+
jobs:
24+
test:
25+
runs-on: ubuntu-latest
26+
env:
27+
SUBSCRIPTION: 6560575d-fa06-4e7d-95fb-f962e74efd7a
28+
RESOURCE_GROUP: azureml-examples-rg
29+
WORKSPACE: amlarc-githubtest-ws
30+
KEY_VAULT_NAME: amlarcgithubworkflowkv
31+
KEY_VAULT_SUB: dd94e4df-b58f-4e81-9c5e-bd72a4b2ef9e
32+
KEY_VAULT_RG: amlarc-github-workflow
33+
LOCATION: ${{ github.event.inputs.AMLARC_TEST_REGION }}
34+
35+
JOB_SELECTOR: ${{ github.event.inputs.JOB_SELECTOR }}
36+
JOB_FILTER: ${{ github.event.inputs.JOB_FILTER }}
37+
JOB_LIST_FILE: kubernetes-compute-training-cli-jobs-single-step.txt
38+
TIMEOUT: ${{ github.event.inputs.TIMEOUT }}
39+
40+
MIN_SUCCESS_NUM: 1
41+
SEVERITY: 3
42+
43+
TITLE: "[Github Workflow] kubernetes-compute-training-cli-jobs-single-step failed"
44+
GITHUB_REPO: https://github.com/Azure/azureml-examples
45+
WORKFLOW_URL: https://github.com/Azure/azureml-examples/actions/workflows/kubernetes-compute-training-cli-jobs-single-step.yml
46+
TSG_ID: https://microsoft.sharepoint.com/teams/Vienna/_layouts/15/Doc.aspx?sourcedoc=%7B7ebf9ccd-fa20-4e82-8b2b-6c14c9f1740f%7D&action=edit&wd=target(AMLArcTSG.one%7C69e8bdb1-5734-4b07-967a-5a50a91cf040%2FTroubleshooting%20github%20workflow%7C53a1a232-f9f0-4192-b7d2-0474848ebb18%2F)&share=IgHNnL9-IPqCTosrbBTJ8XQPAVwbVLD_kFe92G2Y9a49ROs
47+
ICM_MESSAGE: "Failed to run cli-jobs-single-step jobs on kubernetes compute!"
48+
49+
steps:
50+
- name: check out repo
51+
uses: actions/checkout@v2
52+
with:
53+
ref: sdk-preview
54+
- name: setup python
55+
uses: actions/setup-python@v2
56+
with:
57+
python-version: "3.8"
58+
- name: install tools
59+
run: |
60+
pip install shyaml
61+
bash .github/kubernetes-compute/tool.sh install_tools
62+
timeout-minutes: 30
63+
- name: azure login
64+
uses: azure/login@v1
65+
with:
66+
creds: ${{secrets.AZ_AE_CREDS}}
67+
timeout-minutes: 30
68+
69+
- name: collect jobs
70+
run: |
71+
bash .github/kubernetes-compute/tool.sh collect_jobs_from_workflows "$JOB_LIST_FILE" "$JOB_SELECTOR" "$JOB_FILTER"
72+
timeout-minutes: 30
73+
74+
- name: run jobs
75+
run: |
76+
for job in $(cat $JOB_LIST_FILE); do
77+
if [[ "$job" = *"yml" ]]; then
78+
echo "Run job: $job"
79+
bash .github/kubernetes-compute/tool.sh run_cli_job $job -cr &
80+
sleep 60
81+
else
82+
echo "Found invalid job: $job"
83+
fi
84+
done
85+
86+
wait
87+
timeout-minutes: 300
88+
89+
- name: check jobs
90+
if: ${{ always() }}
91+
run: |
92+
bash .github/kubernetes-compute/tool.sh count_result
93+
timeout-minutes: 30
94+
95+
- name: file_icm
96+
if: ${{ always() }}
97+
run: |
98+
bash .github/kubernetes-compute/tool.sh count_result || ret=$?
99+
if [ "$ret" != "0" ] && [ "$ret" != "" ] && [ "${{ github.event.inputs.FILE_TICKET }}" != "false" ] ; then
100+
# download certificates
101+
export ICM_HOST_NAME=ICM-HOST-AML-EXAMPLES
102+
export ICM_CONNECTOR_ID_NAME=ICM-CONNECTOR-ID-AML-EXAMPLES
103+
export ICM_ROUTING_ID_NAME=ICM-ROUTING-ID-AML-EXAMPLES
104+
bash .github/kubernetes-compute/tool.sh download_icm_cert
105+
export ICM_HOST=$(cat icm_host)
106+
export CONNECTOR_ID=$(cat icm_connector_id)
107+
export ROUTING_ID=$(cat icm_routing_id)
108+
export SUMMARY=$(bash .github/kubernetes-compute/tool.sh gen_summary_for_github_test)
109+
bash .github/kubernetes-compute/tool.sh file_icm
110+
fi
111+
timeout-minutes: 30

.github/workflows/kubernetes-compute-workspace-setup.yml

+1
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ jobs:
4040
timeout-minutes: 30
4141
- name: setup_dataset
4242
run: |
43+
set -x
4344
wget https://azcopyvnext.azureedge.net/release20220511/azcopy_linux_amd64_10.15.0.tar.gz
4445
tar zxf azcopy_linux_amd64_10.15.0.tar.gz
4546
cp azcopy_linux_amd64_10.15.0/azcopy .

setup-repo/copy-data.sh

+2-3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
set -x
2+
13
# setup variables
24

35
datapath="example-data"
@@ -41,11 +43,8 @@ destination="$protocol://$account.blob.$endpoint/$container/$datapath/"
4143
# give access to blob container
4244

4345
az role assignment create \
44-
4546
--role "Storage Blob Data Owner" \
46-
4747
--assignee $principal \
48-
4948
--scope "/subscriptions/$subscription/resourceGroups/$group/providers/Microsoft.Storage/storageAccounts/$account"
5049

5150

0 commit comments

Comments
 (0)