# .github/workflows/build-and-run-batch-job.yaml

# Reusable workflow (triggered via `workflow_call`) with three jobs:
#   * `build`: builds a Docker image from the calling repo and pushes it to
#     the GitHub Container Registry (skipped for closed PRs)
#   * `run`: on `workflow_dispatch` events only, applies the Terraform config
#     and submits an AWS Batch job that uses the image built by `build`
#   * `cleanup`: on closed PRs only, runs the Terraform cleanup action
name: build-and-run-batch-job

on:
  workflow_call:
    inputs:
      ref:
        description: >
          The git ref for the workflow branch that is being requested. For
          instance, if calling build-and-run-batch-job.yaml@v1, this variable
          should be set to "v1". The default is "main". This seems repetitive,
          but it's necessary to deal with the fact that reusable external
          workflows currently do not have access to their ref:
          https://github.com/actions/toolkit/issues/1264
        required: false
        type: string
        default: main
      role-duration-seconds:
        description: >
          How long, in seconds, the IAM role used to auth with AWS can be
          valid.
        required: false
        type: string
        # Quoted so that the default is a string, matching the declared type
        # and the other numeric string defaults in this file
        default: "3600"
      vcpu:
        description: >
          Count of vCPUs to provision for the container. Per AWS requirements,
          this parameter must be formatted as a float in increments of 0.25 when
          the backend is "fargate" (e.g. 1.0 for 1 vCPU), but it must be
          formatted as an integer when the backend is "ec2" (e.g. 1 for 1 vCPU).
          The minimum is 1 vCPU.
        required: false
        type: string
        default: "1.0"
      gpu:
        description: >
          Count of GPUs to provision for the container. Per AWS requirements,
          must be formatted as an integer. This parameter is only available when
          the backend is "ec2", otherwise Terraform will raise an error. An
          empty string indicates a null value, and is also the default.
        required: false
        type: string
        default: ""
      memory:
        description: Count of megabytes of RAM to provision for the container.
        required: false
        type: string
        default: "4096"
      backend:
        description: >
          The type of AWS Batch compute environment to provision. Must
          be one of "fargate" or "ec2". Fargate allows for provisioning
          fractional amounts of vCPU and tends to start up jobs faster, but EC2
          allows GPU instances to be configured using the `gpu` parameter.
        required: false
        # It would be nice if this were a `choice` type, but reusable workflows
        # currently do not support that input type; instead, we perform
        # validation in a step of the `build` job
        type: string
        default: fargate
      container_env_vars:
        description: |
          A newline-delimited list of key-value pairs representing environment
          variables that will be set in the Batch job container. Example:

          container_env_vars: |
            FOO=foo
            BAR=bar
        required: false
        type: string
        default: ""
      poll_for_status:
        description: >
          Whether to poll the Batch job status once it starts up. It can be
          useful to disable polling if the job implements its own
          status reporting in a third-party system, or if the job runtime
          exceeds the GitHub workflow timeout of 6 hours.
        required: false
        type: boolean
        default: true

    secrets:
      # The ARN for the IAM role that the workflow will assume in order to make
      # requests to AWS
      AWS_IAM_ROLE_TO_ASSUME_ARN:
        required: true
      # The ID of the AWS account where requests will be sent; this is not
      # actually used in any requests, but is instead masked to ensure that the
      # workflow never accidentally logs it
      AWS_ACCOUNT_ID:
        required: true
      # A version of the container_env_vars input variable for secret values.
      # Values will be masked during parsing using the GitHub Actions add-mask
      # function such that they are never printed to the logs under any
      # circumstances. Example:
      #
      # CONTAINER_ENV_VARS: |
      #   FOO=${{ secrets.FOO }}
      #   BAR=${{ secrets.BAR }}
      CONTAINER_ENV_VARS:
        required: false

env:
  # Registry that the built image is pushed to, and the image name within it
  DOCKER_REGISTRY: "ghcr.io"
  DOCKER_IMAGE_NAME: "${{ github.repository }}"
  # Location of the Terraform configuration. Jobs are expected to check out
  # this repo's code into the ./actions path, since that is what makes it
  # possible to reference local files from a reusable workflow
  TF_WORKDIR: "./actions/.github/workflows/build-and-run-batch-job-terraform/"

jobs:
  build:
    # Skip closed PRs; the workflow still fires on that event so that the
    # cleanup job can run, but there is nothing to build
    if: github.event_name != 'pull_request' || github.event.action != 'closed'
    runs-on: ubuntu-latest
    outputs:
      image-name: ${{ steps.save-image-name.outputs.image-name }}
    steps:
      # Stands in for a `choice` input type, which reusable workflows do not
      # support
      - name: Validate input variables
        shell: bash
        env:
          BACKEND: ${{ inputs.backend }}
        run: |
          case "$BACKEND" in
            fargate | ec2) ;;
            *)
              echo "backend must be one of 'fargate' or 'ec2', got $BACKEND"
              exit 1
              ;;
          esac

      - name: Checkout repo code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.DOCKER_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_IMAGE_NAME }}
          # Image tags are derived from the triggering ref:
          #   * branch push -> the branch name (e.g. `master`)
          #   * pull request -> the PR number (e.g. `pr-12`)
          #   * tagged commit -> the git tag (e.g. `2023`)
          tags: |
            type=ref,event=branch
            type=ref,event=pr
            type=ref,event=tag

      - name: Build and push Docker image
        id: build-and-push
        uses: docker/build-push-action@v5
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
          # Without this, GitHub Container Registry shows the wrong container
          # type in its sidebar
          provenance: false

      # Expose the pushed image's name as a job output for downstream jobs
      - name: Save image name to output
        id: save-image-name
        shell: bash
        env:
          METADATA: ${{ steps.build-and-push.outputs.metadata }}
        run: |
          IMAGE_NAME=$(jq -r '."image.name"' <<< "$METADATA")
          echo "image-name=$IMAGE_NAME" >> "$GITHUB_OUTPUT"

  run:
    needs: build
    # Submitting to Batch is limited to manual dispatches. This cuts down on
    # notifications and reduces the chance of saving useless models
    if: ${{ github.event_name == 'workflow_dispatch' }}
    runs-on: ubuntu-latest
    # Require manual approval to run this job
    environment: deploy
    steps:
      - name: Checkout repo code
        uses: actions/checkout@v4

      # Check the shared actions repo out into ./actions so that its scripts,
      # local actions, and Terraform config can be referenced by path
      - name: Checkout workflow scripts
        uses: actions/checkout@v4
        with:
          path: ./actions/
          repository: ccao-data/actions
          ref: ${{ inputs.ref }}

      # yamllint disable rule:line-length
      # Converts the newline-delimited KEY=VALUE pairs from the
      # container_env_vars input and the CONTAINER_ENV_VARS secret into a JSON
      # array of {"name": ..., "value": ...} objects, written to the
      # `environment` step output. Values that come from the secret are masked
      # in the logs.
      - name: Parse and mask container env vars
        id: parse-env-vars
        run: |
          if [[ -z "$CONTAINER_ENV_VARS" && -z "$SECRET_ENV_VARS" ]]; then
            echo "Neither container_env_vars input nor CONTAINER_ENV_VARS secret is defined;"
            echo "no additional env vars will be set in the container"
            echo "environment=[]" >> "$GITHUB_OUTPUT"
          else
            declare -a OUTPUT_ARRAY  # Array to store parsed vars

            # Args: $1 = newline-delimited KEY=VALUE pairs
            #       $2 = optional "--mask" flag to mask each value in the logs
            parse_env_vars_and_add_to_output_array() {
              while IFS= read -r line; do
                # Our method of iterating the newline-delimited string can
                # introduce empty lines, so we need to be sure to filter them out
                if [ -z "$line" ]; then
                  echo "Encountered empty line in container env vars; skipping"
                elif [[ "$line" != ?*=* ]]; then
                  # Don't print the offending line, since it may be a secret
                  echo "::warning::Encountered container env var line that is not in KEY=VALUE format; skipping"
                else
                  # Parse env vars into key/value, splitting on the first
                  # equals sign so that values may themselves contain one
                  VAR_KEY="${line%%=*}"
                  VAR_VAL="${line#*=}"
                  if [[ "$2" == "--mask" && -n "$VAR_VAL" ]]; then
                    # Env vars are sometimes sensitive, so mask the value
                    echo "::add-mask::$VAR_VAL"
                  fi
                  # Transform the var into the key/val JSON format that AWS
                  # expects, using jq so that quotes and backslashes in the
                  # key or value are escaped correctly
                  OUTPUT_ARRAY+=("$(\
                    jq -cn --arg name "$VAR_KEY" --arg value "$VAR_VAL" \
                      '{name: $name, value: $value}' \
                  )")
                fi
              done <<< "$1"
            }

            if [ -n "$CONTAINER_ENV_VARS" ]; then
              echo "container_env_vars input is set; adding to the container vars"
              parse_env_vars_and_add_to_output_array "$CONTAINER_ENV_VARS"
            fi
            if [ -n "$SECRET_ENV_VARS" ]; then
              echo "CONTAINER_ENV_VARS secret is set; adding to the container vars"
              parse_env_vars_and_add_to_output_array "$SECRET_ENV_VARS" --mask
            fi

            # Join the output array of env vars
            echo "environment=[$(IFS=, ; echo "${OUTPUT_ARRAY[*]}")]" >> "$GITHUB_OUTPUT"
          fi
        shell: bash
        # Pass the raw values through the environment instead of interpolating
        # them into the script, so that quotes or other shell metacharacters
        # in a value cannot break or inject commands into the script
        env:
          CONTAINER_ENV_VARS: ${{ inputs.container_env_vars }}
          SECRET_ENV_VARS: ${{ secrets.CONTAINER_ENV_VARS }}

      # yamllint enable rule:line-length
      # Local action from the checked-out actions repo; it receives the AWS
      # credentials config plus the Batch job settings taken from the
      # workflow inputs and the image name produced by the build job
      - name: Setup Terraform
        uses: ./actions/setup-terraform
        with:
          role-to-assume: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }}
          aws-account-id: ${{ secrets.AWS_ACCOUNT_ID }}
          role-duration-seconds: ${{ inputs.role-duration-seconds }}
          working-directory: ${{ env.TF_WORKDIR }}
          batch-container-image-name: ${{ needs.build.outputs.image-name }}
          batch-compute-environment-backend: ${{ inputs.backend }}
          batch-job-definition-vcpu: ${{ inputs.vcpu }}
          batch-job-definition-gpu: ${{ inputs.gpu }}
          batch-job-definition-memory: ${{ inputs.memory }}

      - name: Validate Terraform config
        shell: bash
        working-directory: ${{ env.TF_WORKDIR }}
        run: terraform validate

      - name: Apply Terraform changes
        shell: bash
        working-directory: ${{ env.TF_WORKDIR }}
        run: terraform apply -auto-approve

      # Submits a Batch job using the name, queue, and job definition read
      # from the Terraform outputs, and saves the resulting job ID to the
      # `batch-job-id` step output
      - name: Submit new Batch job
        id: submit-job
        run: |
          # Use terraform-bin to disable the wrapper script installed by
          # the setup-terraform action, which adds extra context to
          # `terraform output` calls
          BATCH_JOB_NAME="$(terraform-bin output -raw batch_job_name)"
          BATCH_JOB_QUEUE="$(terraform-bin output -raw batch_job_queue_arn)"
          BATCH_JOB_DEFINITION="$(\
            terraform-bin output -raw batch_job_definition_arn \
          )"

          # BATCH_JOB_ENVIRONMENT is the JSON array produced by the
          # parse-env-vars step, so it can be embedded directly
          if [ -z "$BATCH_JOB_ENVIRONMENT" ]; then
            BATCH_JOB_CONTAINER_OVERRIDES="{}"
          else
            BATCH_JOB_CONTAINER_OVERRIDES="{\"environment\":${BATCH_JOB_ENVIRONMENT}}"
          fi

          BATCH_JOB_DETAILS=$(\
            aws batch submit-job \
              --job-name "$BATCH_JOB_NAME" \
              --job-definition "$BATCH_JOB_DEFINITION" \
              --job-queue "$BATCH_JOB_QUEUE" \
              --container-overrides "$BATCH_JOB_CONTAINER_OVERRIDES"
          )
          # Feed the response to jq quoted, so that it is not subject to
          # word splitting or glob expansion
          BATCH_JOB_ID=$(jq -r ".jobId" <<< "$BATCH_JOB_DETAILS")
          # jq prints "null" when the key is missing; fail here rather than
          # handing a bogus job ID to the polling steps
          if [[ -z "$BATCH_JOB_ID" || "$BATCH_JOB_ID" == "null" ]]; then
            echo "::error::Could not parse a job ID from the submit-job response"
            exit 1
          fi
          echo "batch-job-id=$BATCH_JOB_ID" >> "$GITHUB_OUTPUT"
        shell: bash
        working-directory: ${{ env.TF_WORKDIR }}
        env:
          BATCH_JOB_ENVIRONMENT: ${{ steps.parse-env-vars.outputs.environment }}

      # Both steps below call the same polling script; the first passes the
      # extra `startup` argument and the second does not
      - name: Wait for Batch job to start and print link to AWS logs
        shell: bash
        env:
          BATCH_JOB_ID: ${{ steps.submit-job.outputs.batch-job-id }}
        # yamllint disable-line rule:line-length
        run: ./actions/.github/workflows/scripts/batch_job_poll_status.sh "$BATCH_JOB_ID" startup

      # Callers can opt out of this step through the poll_for_status input
      - name: Wait for Batch job to complete
        if: ${{ inputs.poll_for_status == true }}
        shell: bash
        env:
          BATCH_JOB_ID: ${{ steps.submit-job.outputs.batch-job-id }}
        # yamllint disable-line rule:line-length
        run: ./actions/.github/workflows/scripts/batch_job_poll_status.sh "$BATCH_JOB_ID"

  cleanup:
    # Destroy staging resources once a PR is closed; no other event triggers
    # this job
    # yamllint disable-line rule:line-length
    if: ${{ github.event_name == 'pull_request' && github.event.action == 'closed' }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo code
        uses: actions/checkout@v4

      # The cleanup action and the Terraform config are referenced by path
      # under ./actions
      - name: Checkout workflow scripts
        uses: actions/checkout@v4
        with:
          path: ./actions/
          repository: ccao-data/actions
          ref: ${{ inputs.ref }}

      - name: Cleanup Terraform
        uses: ./actions/cleanup-terraform
        with:
          working-directory: ${{ env.TF_WORKDIR }}
          role-to-assume: ${{ secrets.AWS_IAM_ROLE_TO_ASSUME_ARN }}
          aws-account-id: ${{ secrets.AWS_ACCOUNT_ID }}