# Skip to content

# dispatch job

# dispatch job #310

# Workflow file for this run

# Manually dispatched workflow; the inputs below are read by the jobs via
# `github.event.inputs.<name>`.
name: dispatch job
on:
  workflow_dispatch:
    inputs:
      repo:
        description: 'The https github url for the recipe feedstock'
        required: true
      ref:
        description: 'The tag or branch to target in your recipe repo'
        required: true
        default: 'main'
      feedstock_subdir:
        description: 'The subdir of the feedstock directory in the repo'
        required: true
        default: 'feedstock'
      parallelism:
        description: 'Number of partitions to divide the Spark RDD into (usually equals [num-of-executors]*[num-of-vcpus])'
        required: true
        # Quoted so the default stays a string rather than being typed as an int.
        default: '1280'
      job_name:
        description: 'Name the EMR job'
        required: true
permissions:
  id-token: write  # This is required for requesting the JWT
  contents: read  # This is required for actions/checkout
jobs:
  # Derives a short repository name from the `repo` input (basename of the
  # URL with any `.git` suffix removed) and exposes it as `repo_name`.
  name-job:
    runs-on: ubuntu-latest
    outputs:
      repo_name: ${{ steps.string_manipulation.outputs.result }}
    steps:
      - name: manipulate strings
        id: string_manipulation
        # The input is handed over via the environment instead of being
        # expanded into the script text, so its content cannot be executed
        # as shell code.
        env:
          REPO: ${{ github.event.inputs.repo }}
        run: |
          # `--` keeps a value starting with `-` from being read as an option.
          repo_name=$(basename -s .git -- "$REPO")
          echo "result=$repo_name" >> "$GITHUB_OUTPUT"

  # Submits the Spark job to EMR Serverless. Only runs for allow-listed actors.
  run-job:
    # fromJSON() turns the allow-list into an array so contains() performs an
    # exact element match; against a plain string it is a substring match.
    if: contains(fromJSON('["ranchodeluxe", "abarciauskas-bgse", "norlandrhagen", "sharkinsspatial", "moradology", "thodson-usgs"]'), github.actor)
    name: kickoff job ${{ needs.name-job.outputs.repo_name }}@${{ github.event.inputs.ref }}
    needs: name-job
    environment: veda-smce
    # NOTE(review): the `report_ids` step that feeds these outputs is currently
    # commented out further down, so both outputs are empty. They are kept so
    # downstream consumers of this job's outputs keep resolving.
    outputs:
      job_name: ${{ steps.report_ids.outputs.job_name }}
      job_id: ${{ steps.report_ids.outputs.job_id }}
    runs-on: ubuntu-latest
    steps:
      - name: checkout repository
        uses: actions/checkout@v3

      - name: configure aws credentials
        uses: aws-actions/configure-aws-credentials@v3
        with:
          role-to-assume: arn:aws:iam::444055461661:role/github-actions-role-eodc
          role-session-name: veda-pforge-run-job
          role-duration-seconds: 3600
          aws-region: us-west-2

      - name: set up python 3.10
        uses: actions/setup-python@v3
        with:
          # Quoted: an unquoted 3.10 would be parsed as the float 3.1.
          python-version: '3.10'

      - name: echo inputs to user
        env:
          REPO: ${{ github.event.inputs.repo }}
          REF: ${{ github.event.inputs.ref }}
          FEEDSTOCK_SUBDIR: ${{ github.event.inputs.feedstock_subdir }}
          PARALLELISM_OPTION: ${{ github.event.inputs.parallelism }}
        run: |
          echo "Manually triggered workflow": \
            "$REPO" \
            "$REF" \
            "$FEEDSTOCK_SUBDIR" \
            "$PARALLELISM_OPTION"

      - name: submit job to EMR serverless
        id: executejob
        # Failure is tolerated here so the next step can react to it and then
        # fail the job explicitly.
        continue-on-error: true
        env:
          REPO: ${{ github.event.inputs.repo }}
          REF: ${{ github.event.inputs.ref }}
          FEEDSTOCK_SUBDIR: ${{ github.event.inputs.feedstock_subdir }}
          PARALLELISM_OPTION: ${{ github.event.inputs.parallelism }}
          JOB_NAME: ${{ github.event.inputs.job_name }}
        run: |
          # TODO: make submit_spark_job.py or some other config.py checkout, build env and package on s3
          # before submission

          # The parallelism value is spliced into the Spark parameter string,
          # so reject anything that is not a plain non-negative integer.
          case "$PARALLELISM_OPTION" in
            ''|*[!0-9]*)
              echo "parallelism must be an integer, got: '$PARALLELISM_OPTION'" >&2
              exit 1
              ;;
          esac

          spark_params="--conf spark.executor.cores=16"
          spark_params+=" --conf spark.executor.memory=60G"
          spark_params+=" --conf spark.executor.memoryOverhead=60G"
          spark_params+=" --conf spark.driver.memory=10G"
          spark_params+=" --conf spark.driver.memoryOverhead=4G"
          spark_params+=" --conf spark.shuffle.file.buffer=64k"
          # Uses the `parallelism` input (previously the `ref` input was
          # passed here by mistake).
          spark_params+=" --conf spark.default.parallelism=$PARALLELISM_OPTION"
          spark_params+=" --conf spark.emr-serverless.executor.disk=200G"

          python submit_spark_job.py \
            --name="$JOB_NAME" \
            --application-id="00firgpmjusj5e0l" \
            --execution-role-arn="arn:aws:iam::444055461661:role/veda-data-reader-dev" \
            --entry-point="s3://veda-pforge-emr-input-scripts-v3/runwrapper.py" \
            --entry-point-arguments="$REPO $REF $FEEDSTOCK_SUBDIR" \
            --spark-submit-parameters="$spark_params"

      - name: cleanup if submission failed
        if: steps.executejob.outcome == 'failure'
        run: |
          echo "The previous command failed. Running cleanup logic..."
          # force GH action to show failed result
          exit 128
#
# - name: echo JobID, JobName, FlinkDashboard to user
# id: report_ids
# run: |
# # TODO: we also need to report historyserver URL and flink dashboard URL
# # but this also requires us to think how we're going to have a thin
# # layer of authentication around these services so they aren't totally public
# echo '############ RECIPE JOB NAME ################'
# echo $RECIPE_JOB_NAME
# echo '############ FLINK JOB NAME ################'
# echo $JOB_NAME
# echo "job_name=$JOB_NAME" >> $GITHUB_OUTPUT
# echo '############ JOB ID ################'
# echo $JOB_ID
# echo "job_id=$JOB_ID" >> $GITHUB_OUTPUT
# echo '############ FLINK DASHBOARD ################'
# echo $FLINK_DASH
# echo "flink_dash=$FLINK_DASH" >> $GITHUB_OUTPUT
# monitor-job:
# runs-on: ubuntu-latest
# name: monitor job ${{ needs.name-job.outputs.repo_name }}@${{ github.event.inputs.ref }}
# environment: veda-smce
# needs: [name-job, run-job]
# steps:
# - name: Configure AWS credentials
# uses: aws-actions/configure-aws-credentials@v3
# with:
# role-to-assume: arn:aws:iam::444055461661:role/github-actions-role-eodc
# role-session-name: veda-pforge-monitor-job
# role-duration-seconds: 43200 # note this has to match our timeout-minutes below for monitoring
# aws-region: us-west-2