-
Notifications
You must be signed in to change notification settings - Fork 0
341 lines (311 loc) · 13 KB
/
build-and-run-batch-job.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
name: build-and-run-batch-job

# Reusable workflow: callers invoke it with `uses: ...@<ref>` and configure it
# through the inputs and secrets declared below
on:
  workflow_call:
    inputs:
      ref:
        description: >
          The git ref for the workflow branch that is being requested. For
          instance, if calling build-and-run-batch-job.yaml@v1, this variable
          should be set to "v1". The default is "main". This seems repetitive,
          but it's necessary to deal with the fact that reusable external
          workflows currently do not have access to their ref:
          https://github.com/actions/toolkit/issues/1264
        required: false
        type: string
        default: main
      role-duration-seconds:
        description: How long IAM role used to auth with AWS can be valid.
        required: false
        type: string
        # Quoted so that the default matches the declared `string` type, like
        # the other numeric-looking defaults in this file
        default: "3600"
      vcpu:
        description: >
          Count of vCPUs to provision for the container. Per AWS requirements,
          this parameter must be formatted as a float in increments of 0.25 when
          the backend is "fargate" (e.g. 1.0 for 1 vCPU), but it must be
          formatted as an integer when the backend is "ec2" (e.g. 1 for 1 vCPU).
          The minimum is 1 vCPU.
        required: false
        type: string
        default: "1.0"
      gpu:
        description: >
          Count of GPUs to provision for the container. Per AWS requirements,
          must be formatted as an integer. This parameter is only available when
          the backend is "ec2", otherwise Terraform will raise an error. An
          empty string indicates a null value, and is also the default.
        required: false
        type: string
        default: ""
      memory:
        description: Count of megabytes of RAM to provision for the container.
        required: false
        type: string
        default: "4096"
      backend:
        description: >
          The type of AWS Batch compute environment to provision. Must
          be one of "fargate" or "ec2". Fargate allows for provisioning
          fractional amounts of vCPU and tends to start up jobs faster, but EC2
          allows GPU instances to be configured using the `gpu` parameter.
        required: false
        # It would be nice if this were a `choice` type, but reusable workflows
        # currently do not support that input type; instead, we perform
        # validation in a step of the `build` job
        type: string
        default: fargate
      container_env_vars:
        description: |
          A newline-delimited list of key-value pairs representing environment
          variables that will be set in the Batch job container. Example:

            container_env_vars: |
              FOO=foo
              BAR=bar
        required: false
        type: string
        default: ""
      poll_for_status:
        description: >
          Whether to poll the Batch job status once it starts up. It can be
          useful to disable polling if the job implements its own
          status reporting in a third-party system, or if the job runtime
          exceeds the GitHub workflow timeout of 6 hours.
        required: false
        type: boolean
        default: true
    secrets:
      # The ARN for the IAM role that the workflow will assume in order to make
      # requests to AWS
      AWS_IAM_ROLE_TO_ASSUME_ARN:
        required: true
      # The ID of the AWS account where requests will be sent; this is not
      # actually used in any requests, but is instead masked to ensure that the
      # workflow never accidentally logs it
      AWS_ACCOUNT_ID:
        required: true
      # A version of the container_env_vars input variable for secret values.
      # Values will be masked during parsing using the GitHub Actions add-mask
      # function such that they are never printed to the logs under any
      # circumstances. Example:
      #
      #   CONTAINER_ENV_VARS: |
      #     FOO=${{ secrets.FOO }}
      #     BAR=${{ secrets.BAR }}
      CONTAINER_ENV_VARS:
        required: false
# Workflow-level environment variables shared by every job below
env:
  # Registry that the login and metadata steps of the `build` job point at
  DOCKER_REGISTRY: ghcr.io
  # The image is named after the repository that is running the workflow
  DOCKER_IMAGE_NAME: ${{ github.repository }}
  # Location of the Terraform configuration. The jobs check this repo's code
  # out into the ./actions path (needed to reference local files from a
  # reusable workflow), so the path is rooted there
  TF_WORKDIR: ./actions/.github/workflows/build-and-run-batch-job-terraform/
jobs:
  # Build the Docker image from the checked-out repo and push it to the
  # container registry, exposing the pushed image name as a job output
  build:
    # Don't run on closed PRs (required since the cleanup job will run on
    # this event trigger)
    # yamllint disable-line rule:line-length
    if: ${{ ! ( github.event_name == 'pull_request' && github.event.action == 'closed') }}
    runs-on: ubuntu-latest
    outputs:
      image-name: ${{ steps.save-image-name.outputs.image-name }}
    steps:
      # Stand-in for a `choice` input type, which reusable workflows lack
      - name: Validate input variables
        run: |
          if [[ "$BACKEND" != "fargate" && "$BACKEND" != "ec2" ]]; then
            echo "backend must be one of 'fargate' or 'ec2', got $BACKEND"
            exit 1
          fi
        shell: bash
        env:
          BACKEND: ${{ inputs.backend }}

      - name: Checkout repo code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.DOCKER_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_IMAGE_NAME }}
          # Tag the following types of images:
          #   * On a branch, tag with the branch name (e.g. `master`)
          #   * On a PR, tag with the PR number (e.g. `pr-12`)
          #   * On a tagged commit, tag with the git tag (e.g. `2023`)
          tags: |
            type=ref,event=branch
            type=ref,event=pr
            type=ref,event=tag

      - name: Build and push Docker image
        id: build-and-push
        uses: docker/build-push-action@v5
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
          # Fix incorrect container type sidebar display in GitHub Container
          # Registry
          provenance: false

      # Pull the pushed image name out of the build metadata JSON so that the
      # `run` job can hand it to Terraform
      - name: Save image name to output
        id: save-image-name
        run: |
          IMAGE_NAME=$(echo "$METADATA" | jq -r '."image.name"')
          echo "image-name=$IMAGE_NAME" >> "$GITHUB_OUTPUT"
        shell: bash
        env:
          METADATA: ${{ steps.build-and-push.outputs.metadata }}

  # Provision Batch resources with Terraform, submit a job that runs the image
  # built above, and optionally poll the job until it completes
  run:
    # Only deploy the model to Batch when it's manually dispatched. This cuts
    # down on notifications and reduces the chance of saving useless models
    if: github.event_name == 'workflow_dispatch'
    needs: [build]
    runs-on: ubuntu-latest
    # Require manual approval to run this job
    environment: deploy
    steps:
      - name: Checkout repo code
        uses: actions/checkout@v4

      - name: Checkout workflow scripts
        uses: actions/checkout@v4
        with:
          repository: ccao-data/actions
          ref: ${{ inputs.ref }}
          path: ./actions/

      # yamllint disable rule:line-length
      - name: Parse and mask container env vars
        id: parse-env-vars
        run: |
          # The raw key=value lists arrive via this step's `env` block rather
          # than being interpolated into the script text. This preserves
          # newlines, keeps quotes or shell metacharacters in a value from
          # being executed as code, and keeps secret values out of the
          # generated script file. Note that backslash sequences in values
          # are passed through literally
          if [[ -z "$CONTAINER_ENV_VARS" && -z "$SECRET_ENV_VARS" ]]; then
            echo "Neither container_env_vars input nor CONTAINER_ENV_VARS secret is defined;"
            echo "no additional env vars will be set in the container"
            echo "environment=[]" >> "$GITHUB_OUTPUT"
          else
            declare -a OUTPUT_ARRAY  # Array to store parsed vars

            # Parse a newline-delimited list of KEY=VALUE pairs and append one
            # JSON object per pair to OUTPUT_ARRAY.
            #   $1: the newline-delimited list
            #   $2: pass --mask to register each value as a log mask
            parse_env_vars_and_add_to_output_array() {
              local line var_key var_val
              while IFS= read -r line; do
                # Our method of iterating the newline-delimited string can
                # introduce empty lines, so we need to be sure to filter them out
                if [ -z "$line" ]; then
                  echo "Encountered empty line in container env vars; skipping"
                else
                  # Parse env vars into key/value, splitting on the first
                  # equals sign so that values may themselves contain `=`
                  var_key="${line%%=*}"
                  var_val="${line#*=}"
                  if [[ "$2" == "--mask" ]]; then
                    # Env vars are sometimes sensitive, so mask the value
                    echo "::add-mask::$var_val"
                  fi
                  # Transform the var into the key/val JSON format that AWS
                  # expects; jq handles escaping of quotes and backslashes
                  OUTPUT_ARRAY+=("$(\
                    jq -cn --arg name "$var_key" --arg value "$var_val" \
                      '{name: $name, value: $value}' \
                  )")
                fi
              done <<< "$1"
            }

            if [ -n "$CONTAINER_ENV_VARS" ]; then
              echo "container_env_vars input is set; adding to the container vars"
              parse_env_vars_and_add_to_output_array "$CONTAINER_ENV_VARS"
            fi

            if [ -n "$SECRET_ENV_VARS" ]; then
              echo "CONTAINER_ENV_VARS secret is set; adding to the container vars"
              parse_env_vars_and_add_to_output_array "$SECRET_ENV_VARS" --mask
            fi

            # Join the output array of env vars
            echo "environment=[$(IFS=, ; echo "${OUTPUT_ARRAY[*]}")]" >> "$GITHUB_OUTPUT"
          fi
        shell: bash
        env:
          CONTAINER_ENV_VARS: ${{ inputs.container_env_vars }}
          SECRET_ENV_VARS: ${{ secrets.CONTAINER_ENV_VARS }}
      # yamllint enable rule:line-length

      - name: Setup Terraform
        uses: ./actions/setup-terraform
        with:
          role-to-assume: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }}
          aws-account-id: ${{ secrets.AWS_ACCOUNT_ID }}
          batch-container-image-name: ${{ needs.build.outputs.image-name }}
          batch-job-definition-vcpu: ${{ inputs.vcpu }}
          batch-job-definition-memory: ${{ inputs.memory }}
          batch-compute-environment-backend: ${{ inputs.backend }}
          batch-job-definition-gpu: ${{ inputs.gpu }}
          role-duration-seconds: ${{ inputs.role-duration-seconds }}
          working-directory: ${{ env.TF_WORKDIR }}

      - name: Validate Terraform config
        run: terraform validate
        working-directory: ${{ env.TF_WORKDIR }}
        shell: bash

      - name: Apply Terraform changes
        run: terraform apply -auto-approve
        working-directory: ${{ env.TF_WORKDIR }}
        shell: bash

      - name: Submit new Batch job
        id: submit-job
        run: |
          # Use terraform-bin to disable the wrapper script installed by
          # the setup-terraform action, which adds extra context to
          # `terraform output` calls
          BATCH_JOB_NAME="$(terraform-bin output -raw batch_job_name)"
          BATCH_JOB_QUEUE="$(terraform-bin output -raw batch_job_queue_arn)"
          BATCH_JOB_DEFINITION="$(\
            terraform-bin output -raw batch_job_definition_arn \
          )"

          # Only override the container environment when the parse step
          # produced a value for it
          if [ -z "$BATCH_JOB_ENVIRONMENT" ]; then
            BATCH_JOB_CONTAINER_OVERRIDES="{}"
          else
            BATCH_JOB_CONTAINER_OVERRIDES="{\"environment\":${BATCH_JOB_ENVIRONMENT}}"
          fi

          BATCH_JOB_DETAILS=$(\
            aws batch submit-job \
              --job-name "$BATCH_JOB_NAME" \
              --job-definition "$BATCH_JOB_DEFINITION" \
              --job-queue "$BATCH_JOB_QUEUE" \
              --container-overrides "$BATCH_JOB_CONTAINER_OVERRIDES"
          )
          # Quote the response so that it reaches jq without word splitting
          # or glob expansion
          BATCH_JOB_ID=$(echo "$BATCH_JOB_DETAILS" | jq -r ".jobId")
          echo "batch-job-id=$BATCH_JOB_ID" >> "$GITHUB_OUTPUT"
        shell: bash
        working-directory: ${{ env.TF_WORKDIR }}
        env:
          BATCH_JOB_ENVIRONMENT: ${{ steps.parse-env-vars.outputs.environment }}

      - name: Wait for Batch job to start and print link to AWS logs
        run: |
          ./actions/.github/workflows/scripts/batch_job_poll_status.sh \
            "$BATCH_JOB_ID" startup
        shell: bash
        env:
          BATCH_JOB_ID: ${{ steps.submit-job.outputs.batch-job-id }}

      # Skipped when the caller sets poll_for_status to false
      - name: Wait for Batch job to complete
        if: ${{ inputs.poll_for_status == true }}
        run: |
          ./actions/.github/workflows/scripts/batch_job_poll_status.sh \
            "$BATCH_JOB_ID"
        shell: bash
        env:
          BATCH_JOB_ID: ${{ steps.submit-job.outputs.batch-job-id }}

  cleanup:
    # Only run on closed PRs, to destroy staging resources
    # yamllint disable-line rule:line-length
    if: github.event_name == 'pull_request' && github.event.action == 'closed'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo code
        uses: actions/checkout@v4

      - name: Checkout workflow scripts
        uses: actions/checkout@v4
        with:
          repository: ccao-data/actions
          ref: ${{ inputs.ref }}
          path: ./actions/

      - name: Cleanup Terraform
        uses: ./actions/cleanup-terraform
        with:
          role-to-assume: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }}
          aws-account-id: ${{ secrets.AWS_ACCOUNT_ID }}
          working-directory: ${{ env.TF_WORKDIR }}