Skip to content

Commit

Permalink
rebase on main
Browse files Browse the repository at this point in the history
  • Loading branch information
ranchodeluxe committed Mar 12, 2024
2 parents 91baa0b + 0ad5e3a commit 817c222
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 13 deletions.
18 changes: 9 additions & 9 deletions .github/workflows/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,10 @@ def calc_task_manager_resources(task_manager_process_memory):

resource_profile_choice = os.environ.get("RESOURCE_PROFILE")
task_manager_process_memory_map = {
"small": 7168,
"medium": 10240,
"large": 15360,
"xlarge": 20480,
"small": 7824,
"medium": 9824,
"large": 11824,
"xlarge": 13824,
}
if resource_profile_choice not in list(task_manager_process_memory_map.keys()):
raise ValueError(
Expand All @@ -98,9 +98,9 @@ def calc_task_manager_resources(task_manager_process_memory):
c.Bake.feedstock_subdir = os.environ.get("FEEDSTOCK_SUBDIR")

c.FlinkOperatorBakery.parallelism = int(os.environ.get("PARALLELISM_OPTION"))
c.FlinkOperatorBakery.enable_job_archiving = True
c.FlinkOperatorBakery.enable_job_archiving = False
c.FlinkOperatorBakery.flink_version = "1.16"
c.FlinkOperatorBakery.job_manager_resources = {"memory": "1536m", "cpu": 0.3}
c.FlinkOperatorBakery.job_manager_resources = {"memory": "1280m", "cpu": 0.3}
c.FlinkOperatorBakery.task_manager_resources = {
"memory": f"{task_manager_process_memory_map[resource_profile_choice]}m",
"cpu": 0.3
Expand All @@ -121,6 +121,6 @@ def calc_task_manager_resources(task_manager_process_memory):
"client_kwargs": {"region_name": "us-west-2"},
}

c.InputCacheStorage.fsspec_class = c.TargetStorage.fsspec_class
c.InputCacheStorage.fsspec_args = c.TargetStorage.fsspec_args
c.InputCacheStorage.root_path = f"{BUCKET_PREFIX}/cache/"
# c.InputCacheStorage.fsspec_class = c.TargetStorage.fsspec_class
# c.InputCacheStorage.fsspec_args = c.TargetStorage.fsspec_args
# c.InputCacheStorage.root_path = f"{BUCKET_PREFIX}/cache/"
16 changes: 12 additions & 4 deletions .github/workflows/job-runner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,11 @@ on:
description: 'What auth mode (edl or iamrole) to use when accessing files.'
required: false
default: 'iamrole'
job_name:
description: 'A unique job name (no other existing Flink deployment can have it) so we can inspect metrics more easily in Grafana.'
required: true
resource_profile:
description: 'jobs have different memory requirements so choose (small[7168M], medium[10240M], large[15360M], xlarge[20480M])'
description: 'jobs have different memory requirements so choose (small[7824_MiB], medium[9824_MiB], large[11824_MiB], xlarge[13824_MiB])'
required: false
default: 'small'

Expand Down Expand Up @@ -61,10 +64,11 @@ jobs:
uses: actions/checkout@v3

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v2
uses: aws-actions/configure-aws-credentials@v3
with:
role-to-assume: arn:aws:iam::444055461661:role/github-actions-role-eodc
role-session-name: veda-pforge-run-job
role-duration-seconds: 3600
aws-region: us-west-2

- name: set up python 3.10
Expand Down Expand Up @@ -106,6 +110,7 @@ jobs:
bake \
--repo=${{ github.event.inputs.repo }} \
--ref=${{ github.event.inputs.ref }} \
--Bake.job_name="${{ github.event.inputs.job_name }}" \
-f .github/workflows/config.py
# we just use the `inspect` branch to dump runner errors and fail fast
Expand Down Expand Up @@ -133,6 +138,7 @@ jobs:
PARALLELISM_OPTION: ${{ github.event.inputs.parallelism }}
OUTPUT_BUCKET: ${{ vars.OUTPUT_BUCKET }}
AUTH_MODE: ${{ github.event.inputs.auth_mode }}
AWS_ROLE_ARN: ${{ vars.AWS_ROLE_ARN }}
RESOURCE_PROFILE: ${{ github.event.inputs.resource_profile }}

- name: cleanup if "pangeo-forge-runner bake" failed
Expand Down Expand Up @@ -195,13 +201,15 @@ jobs:
monitor-job:
runs-on: ubuntu-latest
name: monitor job ${{ needs.name-job.outputs.repo_name }}@${{ github.event.inputs.ref }}
environment: veda-smce
needs: [name-job, run-job]
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v2
uses: aws-actions/configure-aws-credentials@v3
with:
role-to-assume: arn:aws:iam::444055461661:role/github-actions-role-eodc
role-session-name: veda-pforge-monitor-job
role-duration-seconds: 43200 # note this has to match our timeout-minutes below for monitoring
aws-region: us-west-2

- name: install kubectl
Expand All @@ -219,7 +227,7 @@ jobs:
#
- name: monitor logs of job manager and report final status
id: monitorjob
timeout-minutes: 240
timeout-minutes: 720
continue-on-error: true
run: |
# TODO: this needs to not check the logs but the historyserver status
Expand Down

0 comments on commit 817c222

Please sign in to comment.