From 418cd734b1d4883a22e0a2f36e2fb7ba913309fd Mon Sep 17 00:00:00 2001
From: ranchodeluxe
Date: Wed, 6 Mar 2024 08:37:54 -0800
Subject: [PATCH 01/15] disable flink history server

---
 .github/workflows/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/config.py b/.github/workflows/config.py
index ce5584d..f394408 100644
--- a/.github/workflows/config.py
+++ b/.github/workflows/config.py
@@ -98,7 +98,7 @@ def calc_task_manager_resources(task_manager_process_memory):
 c.Bake.feedstock_subdir = os.environ.get("FEEDSTOCK_SUBDIR")
 
 c.FlinkOperatorBakery.parallelism = int(os.environ.get("PARALLELISM_OPTION"))
-c.FlinkOperatorBakery.enable_job_archiving = True
+c.FlinkOperatorBakery.enable_job_archiving = False
 c.FlinkOperatorBakery.flink_version = "1.16"
 c.FlinkOperatorBakery.job_manager_resources = {"memory": "1536m", "cpu": 0.3}
 c.FlinkOperatorBakery.task_manager_resources = {

From e99e83e7a222b45ae5a54ae135d29512749a37f2 Mon Sep 17 00:00:00 2001
From: ranchodeluxe
Date: Wed, 6 Mar 2024 10:42:18 -0800
Subject: [PATCH 02/15] misc fixes

---
 .github/workflows/config.py       | 2 +-
 .github/workflows/job-runner.yaml | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/config.py b/.github/workflows/config.py
index f394408..ec74154 100644
--- a/.github/workflows/config.py
+++ b/.github/workflows/config.py
@@ -76,7 +76,7 @@ def calc_task_manager_resources(task_manager_process_memory):
 
 resource_profile_choice = os.environ.get("RESOURCE_PROFILE")
 task_manager_process_memory_map = {
-    "small": 7168,
+    "small": 5632,
     "medium": 10240,
     "large": 15360,
     "xlarge": 20480,

diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml
index e938877..8bfe690 100644
--- a/.github/workflows/job-runner.yaml
+++ b/.github/workflows/job-runner.yaml
@@ -26,8 +26,11 @@ on:
         description: 'What auth mode (edl or iamrole) to use when accessing files.'
         required: false
         default: 'iamrole'
+      job_name:
+        description: 'A unique job name (no other existing flink deployment can have it) so we can inspect metrics more easily in Grafana.'
+        required: true
       resource_profile:
-        description: 'jobs have different memory requirements so choose (small[7168M], medium[10240M], large[15360M], xlarge[20480M])'
+        description: 'jobs have different memory requirements so choose (small[5632M], medium[10240M], large[15360M], xlarge[20480M])'
         required: false
         default: 'small'
 
@@ -106,6 +109,7 @@ jobs:
           bake \
           --repo=${{ github.event.inputs.repo }} \
           --ref=${{ github.event.inputs.ref }} \
+          --Bake.job_name=${{ github.event.inputs.repo }} \
           -f .github/workflows/config.py > execute.log
 
           # export all the valuable information from the logs
@@ -192,6 +196,7 @@ jobs:
   monitor-job:
     runs-on: ubuntu-latest
     name: monitor job ${{ needs.name-job.outputs.repo_name }}@${{ github.event.inputs.ref }}
+    environment: veda-smce
     needs: [name-job, run-job]
     steps:
       - name: Configure AWS credentials
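For context on the two patches above: the RESOURCE_PROFILE workflow input is looked up in config.py's task_manager_process_memory_map, and an unrecognized profile fails fast with a ValueError. A minimal sketch of that lookup, using the values as of PATCH 02 (the real config.py may differ in details):

    import os

    # task-manager process memory per profile, in MiB
    # ("small" shrunk from 7168 to 5632 by PATCH 02)
    task_manager_process_memory_map = {
        "small": 5632,
        "medium": 10240,
        "large": 15360,
        "xlarge": 20480,
    }

    def resolve_profile(choice):
        # fail fast on an unknown profile, mirroring the ValueError in config.py
        if choice not in task_manager_process_memory_map:
            raise ValueError(f"unknown RESOURCE_PROFILE: {choice!r}")
        return task_manager_process_memory_map[choice]

    print(resolve_profile(os.environ.get("RESOURCE_PROFILE", "small")))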
From 0ba48b8412d23c0429aa02b0cf705a0bcfa0354f Mon Sep 17 00:00:00 2001
From: ranchodeluxe
Date: Wed, 6 Mar 2024 10:52:54 -0800
Subject: [PATCH 03/15] comment it out and see what gives

---
 .github/workflows/config.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/config.py b/.github/workflows/config.py
index ec74154..3ce21e4 100644
--- a/.github/workflows/config.py
+++ b/.github/workflows/config.py
@@ -121,6 +121,6 @@ def calc_task_manager_resources(task_manager_process_memory):
     "client_kwargs": {"region_name": "us-west-2"},
 }
 
-c.InputCacheStorage.fsspec_class = c.TargetStorage.fsspec_class
-c.InputCacheStorage.fsspec_args = c.TargetStorage.fsspec_args
-c.InputCacheStorage.root_path = f"{BUCKET_PREFIX}/cache/"
+# c.InputCacheStorage.fsspec_class = c.TargetStorage.fsspec_class
+# c.InputCacheStorage.fsspec_args = c.TargetStorage.fsspec_args
+# c.InputCacheStorage.root_path = f"{BUCKET_PREFIX}/cache/"

From 7045f6f8f0cbb1164783fab0960ad2a229c0920b Mon Sep 17 00:00:00 2001
From: ranchodeluxe
Date: Wed, 6 Mar 2024 10:58:13 -0800
Subject: [PATCH 04/15] job_name

---
 .github/workflows/job-runner.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml
index 8bfe690..bdb983a 100644
--- a/.github/workflows/job-runner.yaml
+++ b/.github/workflows/job-runner.yaml
@@ -109,7 +109,7 @@ jobs:
           bake \
           --repo=${{ github.event.inputs.repo }} \
           --ref=${{ github.event.inputs.ref }} \
-          --Bake.job_name=${{ github.event.inputs.repo }} \
+          --Bake.job_name="${{ github.event.inputs.job_name }}" \
           -f .github/workflows/config.py > execute.log
 
           # export all the valuable information from the logs

From 2d93bf2b3c4ded10a8c970d7e5a16271860d82e3 Mon Sep 17 00:00:00 2001
From: ranchodeluxe
Date: Wed, 6 Mar 2024 13:45:30 -0800
Subject: [PATCH 05/15] more tweaks

---
 .github/workflows/job-runner.yaml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml
index bdb983a..bbaee18 100644
--- a/.github/workflows/job-runner.yaml
+++ b/.github/workflows/job-runner.yaml
@@ -64,7 +64,7 @@ jobs:
         uses: actions/checkout@v3
 
       - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v2
+        uses: aws-actions/configure-aws-credentials@v3
         with:
           role-to-assume: arn:aws:iam::444055461661:role/github-actions-role-eodc
           role-session-name: veda-pforge-run-job
@@ -200,10 +200,11 @@ jobs:
     needs: [name-job, run-job]
     steps:
       - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v2
+        uses: aws-actions/configure-aws-credentials@v3
         with:
          role-to-assume: arn:aws:iam::444055461661:role/github-actions-role-eodc
          role-session-name: veda-pforge-monitor-job
+         role-duration-seconds: 21600 # note this has to match our timeout-minutes below for monitoring
          aws-region: us-west-2
 
       - name: install kubectl
 
       #
       - name: monitor logs of job manager and report final status
         id: monitorjob
-        timeout-minutes: 240
+        timeout-minutes: 360
         continue-on-error: true
         run: |
           # TODO: this needs to not check the logs but the historyserver status
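The inline note on role-duration-seconds in PATCH 05 encodes an invariant worth making explicit: the monitor job's AWS session must last at least as long as its timeout-minutes, or kubectl calls start failing mid-watch. A hypothetical pre-flight check using the values from that patch:

    # 21600 s == 360 min, so the session exactly covers the monitoring window
    ROLE_DURATION_SECONDS = 21600
    MONITOR_TIMEOUT_MINUTES = 360

    assert ROLE_DURATION_SECONDS >= MONITOR_TIMEOUT_MINUTES * 60, (
        "monitor step could outlive its AWS credentials"
    )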
From 10e3586b2f80e72bc009fd731e628c2946adbb07 Mon Sep 17 00:00:00 2001
From: ranchodeluxe
Date: Wed, 6 Mar 2024 14:32:14 -0800
Subject: [PATCH 06/15] resource scaling

---
 .github/workflows/config.py       | 10 +++++-----
 .github/workflows/job-runner.yaml |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/config.py b/.github/workflows/config.py
index 3ce21e4..d796cc6 100644
--- a/.github/workflows/config.py
+++ b/.github/workflows/config.py
@@ -76,10 +76,10 @@ def calc_task_manager_resources(task_manager_process_memory):
 
 resource_profile_choice = os.environ.get("RESOURCE_PROFILE")
 task_manager_process_memory_map = {
-    "small": 5632,
-    "medium": 10240,
-    "large": 15360,
-    "xlarge": 20480,
+    "small": 7824,
+    "medium": 9824,
+    "large": 11824,
+    "xlarge": 13824,
 }
 if resource_profile_choice not in list(task_manager_process_memory_map.keys()):
     raise ValueError(
@@ -100,7 +100,7 @@ def calc_task_manager_resources(task_manager_process_memory):
 c.FlinkOperatorBakery.parallelism = int(os.environ.get("PARALLELISM_OPTION"))
 c.FlinkOperatorBakery.enable_job_archiving = False
 c.FlinkOperatorBakery.flink_version = "1.16"
-c.FlinkOperatorBakery.job_manager_resources = {"memory": "1536m", "cpu": 0.3}
+c.FlinkOperatorBakery.job_manager_resources = {"memory": "1280m", "cpu": 0.3}
 c.FlinkOperatorBakery.task_manager_resources = {
     "memory": f"{task_manager_process_memory_map[resource_profile_choice]}m",
     "cpu": 0.3

diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml
index bbaee18..e0e7dd7 100644
--- a/.github/workflows/job-runner.yaml
+++ b/.github/workflows/job-runner.yaml
@@ -30,7 +30,7 @@ on:
         description: 'A unique job name (no other existing flink deployment can have it) so we can inspect metrics more easily in Grafana.'
         required: true
       resource_profile:
-        description: 'jobs have different memory requirements so choose (small[5632M], medium[10240M], large[15360M], xlarge[20480M])'
+        description: 'jobs have different memory requirements so choose (small[7824_MiB], medium[9824_MiB], large[11824_MiB], xlarge[13824_MiB])'
         required: false
         default: 'small'
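The profile numbers in PATCH 06 are total Flink process memory, and what recipe tasks actually get is what remains after the JVM takes its share. A rough sketch of that arithmetic, assuming Flink 1.16's default fractions (10% JVM overhead clamped to [192 MiB, 1024 MiB], 256 MiB metaspace); the real calc_task_manager_resources in config.py may slice it differently:

    def flink_memory_breakdown(process_memory_mib):
        # taskmanager.memory.jvm-overhead: fraction 0.1, min 192m, max 1g
        jvm_overhead = min(max(int(process_memory_mib * 0.1), 192), 1024)
        # taskmanager.memory.jvm-metaspace default
        jvm_metaspace = 256
        # what remains for network buffers, managed memory, and task heap
        total_flink_memory = process_memory_mib - jvm_overhead - jvm_metaspace
        return {
            "process_mib": process_memory_mib,
            "jvm_overhead_mib": jvm_overhead,
            "jvm_metaspace_mib": jvm_metaspace,
            "total_flink_mib": total_flink_memory,
        }

    print(flink_memory_breakdown(7824))  # the new "small" profile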
From b4bff4ddef025443aab4ce06a454c61ab27a4b5f Mon Sep 17 00:00:00 2001
From: ranchodeluxe
Date: Wed, 6 Mar 2024 16:54:51 -0800
Subject: [PATCH 07/15] re-enable config

---
 .github/workflows/config.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/config.py b/.github/workflows/config.py
index d796cc6..e3b0103 100644
--- a/.github/workflows/config.py
+++ b/.github/workflows/config.py
@@ -121,6 +121,6 @@ def calc_task_manager_resources(task_manager_process_memory):
     "client_kwargs": {"region_name": "us-west-2"},
 }
 
-# c.InputCacheStorage.fsspec_class = c.TargetStorage.fsspec_class
-# c.InputCacheStorage.fsspec_args = c.TargetStorage.fsspec_args
-# c.InputCacheStorage.root_path = f"{BUCKET_PREFIX}/cache/"
+c.InputCacheStorage.fsspec_class = c.TargetStorage.fsspec_class
+c.InputCacheStorage.fsspec_args = c.TargetStorage.fsspec_args
+c.InputCacheStorage.root_path = f"{BUCKET_PREFIX}/cache/"

From f9ee41769dab34a5e73432b8c5150ac48cde256c Mon Sep 17 00:00:00 2001
From: ranchodeluxe
Date: Thu, 7 Mar 2024 06:21:39 -0800
Subject: [PATCH 08/15] try this again

---
 .github/workflows/config.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/config.py b/.github/workflows/config.py
index e3b0103..d796cc6 100644
--- a/.github/workflows/config.py
+++ b/.github/workflows/config.py
@@ -121,6 +121,6 @@ def calc_task_manager_resources(task_manager_process_memory):
     "client_kwargs": {"region_name": "us-west-2"},
 }
 
-c.InputCacheStorage.fsspec_class = c.TargetStorage.fsspec_class
-c.InputCacheStorage.fsspec_args = c.TargetStorage.fsspec_args
-c.InputCacheStorage.root_path = f"{BUCKET_PREFIX}/cache/"
+# c.InputCacheStorage.fsspec_class = c.TargetStorage.fsspec_class
+# c.InputCacheStorage.fsspec_args = c.TargetStorage.fsspec_args
+# c.InputCacheStorage.root_path = f"{BUCKET_PREFIX}/cache/"

From c0eb4e596ef900a81a90540620fdccb9ff3baa5e Mon Sep 17 00:00:00 2001
From: Aimee Barciauskas
Date: Thu, 7 Mar 2024 12:22:09 -0800
Subject: [PATCH 09/15] Update job-runner.yaml

---
 .github/workflows/job-runner.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml
index e0e7dd7..ad47aed 100644
--- a/.github/workflows/job-runner.yaml
+++ b/.github/workflows/job-runner.yaml
@@ -134,6 +134,7 @@ jobs:
           PARALLELISM_OPTION: ${{ github.event.inputs.parallelism }}
           OUTPUT_BUCKET: ${{ vars.OUTPUT_BUCKET }}
           AUTH_MODE: ${{ github.event.inputs.auth_mode }}
+          AWS_ROLE_ARN: ${{ vars.AWS_ROLE_ARN }}
           RESOURCE_PROFILE: ${{ github.event.inputs.resource_profile }}
 
       - name: cleanup if "pangeo-forge-runner bake" failed
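PATCH 09 works because step-level env: entries become ordinary process environment variables for the run: shell, so anything exported there is visible to pangeo-forge-runner and to config.py when it is evaluated. A hypothetical guard showing how config.py could consume the new variable (nothing in config.py reads AWS_ROLE_ARN yet at this point in the series):

    import os

    aws_role_arn = os.environ.get("AWS_ROLE_ARN")
    if aws_role_arn is None:
        # the vars.AWS_ROLE_ARN repository variable was not exported to this step
        raise RuntimeError("AWS_ROLE_ARN is not set")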
From 030c571bf15f88fff4d3aad8a559ff24edbe31c6 Mon Sep 17 00:00:00 2001
From: ranchodeluxe
Date: Thu, 7 Mar 2024 12:57:15 -0800
Subject: [PATCH 10/15] more assume role shenanigans

---
 .github/workflows/config.py       | 20 ++++++++++++++++++++
 .github/workflows/job-runner.yaml |  5 +++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/config.py b/.github/workflows/config.py
index e3b0103..b035d3a 100644
--- a/.github/workflows/config.py
+++ b/.github/workflows/config.py
@@ -116,7 +116,27 @@ def calc_task_manager_resources(task_manager_process_memory):
 BUCKET_PREFIX = os.environ.get("OUTPUT_BUCKET")
 c.TargetStorage.fsspec_class = "s3fs.S3FileSystem"
 c.TargetStorage.root_path = f"{BUCKET_PREFIX}/{{job_name}}/output"
+#
+# NOTE: lots of explanation needed here:
+#
+# 1) we NEED to always have `fsspec_args` set up with credentials b/c some pangeo-forge-recipes
+# transforms/subroutines don't get explicit dep injection, so fsspec credentials have to be
+# forwarded to those transforms/subroutines via these passed args
+#
+# 2) the os env vars we are sourcing here come from GH actions `aws-actions/configure-aws-credentials`.
+# the goal of that assume role is to allow GH to talk with the EKS cluster and kick off pangeo jobs
+# but this role also has s3 permissions. Because our s3 pangeo-forge output bucket lives in SMCE,
+# this role also sneakily gives us a session token to be able to write to the bucket during jobs
+# with the correct extended token duration. This is in addition to our Flink cluster having a ServiceAccount
+# tied to another IAM Role that also grants permissions to write to s3
+#
+# TODO: in the future we should probably have a dedicated role (it could be the Flink one possibly)
+# and our runner workflow would need to not only assume the GH actions role but then another role
+# that has permissions to s3. Doing this for now b/c we're short on time
 c.TargetStorage.fsspec_args = {
+    "key": os.environ.get("AWS_ACCESS_KEY_ID"),
+    "secret": os.environ.get("AWS_SECRET_ACCESS_KEY"),
+    "token": os.environ.get("AWS_SESSION_TOKEN"),
     "anon": False,
     "client_kwargs": {"region_name": "us-west-2"},
 }

diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml
index e0e7dd7..29ae800 100644
--- a/.github/workflows/job-runner.yaml
+++ b/.github/workflows/job-runner.yaml
@@ -68,6 +68,7 @@ jobs:
         with:
           role-to-assume: arn:aws:iam::444055461661:role/github-actions-role-eodc
           role-session-name: veda-pforge-run-job
+          role-duration-seconds: 43200
           aws-region: us-west-2
 
       - name: set up python 3.10
@@ -204,7 +205,7 @@ jobs:
         with:
          role-to-assume: arn:aws:iam::444055461661:role/github-actions-role-eodc
          role-session-name: veda-pforge-monitor-job
-         role-duration-seconds: 21600 # note this has to match our timeout-minutes below for monitoring
+         role-duration-seconds: 43200 # note this has to match our timeout-minutes below for monitoring
          aws-region: us-west-2
 
       - name: install kubectl
 
       #
       - name: monitor logs of job manager and report final status
         id: monitorjob
-        timeout-minutes: 360
+        timeout-minutes: 720
         continue-on-error: true
         run: |
           # TODO: this needs to not check the logs but the historyserver status
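The fsspec_args added in PATCH 10 are handed to s3fs, so the bake effectively constructs a filesystem like the sketch below; key/secret/token are the short-lived values that aws-actions/configure-aws-credentials exports into the step's environment (placeholders here, not real credentials):

    import s3fs

    fs = s3fs.S3FileSystem(
        key="ASIA...",      # placeholder access key id
        secret="...",       # placeholder secret key
        token="...",        # placeholder session token
        anon=False,
        client_kwargs={"region_name": "us-west-2"},
    )
    # fs.ls("s3://<OUTPUT_BUCKET>/")  # would authenticate with the session token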
From 3a4b83319ba6f9e02a1381cbc97427f5c1876b07 Mon Sep 17 00:00:00 2001
From: ranchodeluxe
Date: Thu, 7 Mar 2024 13:32:49 -0800
Subject: [PATCH 11/15] try again

---
 .github/workflows/job-runner.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml
index 29ae800..61188e7 100644
--- a/.github/workflows/job-runner.yaml
+++ b/.github/workflows/job-runner.yaml
@@ -87,7 +87,7 @@ jobs:
       - name: install deps
         run: |
           python -m pip install --upgrade pip
-          pip install pangeo-forge-runner>=0.10.0
+          pip install boto3 pangeo-forge-runner>=0.10.0
 
       - name: install kubectl
         run: |
@@ -136,6 +136,9 @@ jobs:
           OUTPUT_BUCKET: ${{ vars.OUTPUT_BUCKET }}
           AUTH_MODE: ${{ github.event.inputs.auth_mode }}
           RESOURCE_PROFILE: ${{ github.event.inputs.resource_profile }}
+          AWS_ACCESS_KEY_ID: ${{ env.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ env.AWS_SECRET_ACCESS_KEY }}
+          AWS_SESSION_TOKEN: ${{ env.AWS_SESSION_TOKEN }}
 
       - name: cleanup if "pangeo-forge-runner bake" failed
         if: steps.executejob.outcome == 'failure'

From a1a0bfc4bf48d425d30688e1a809799367c588db Mon Sep 17 00:00:00 2001
From: ranchodeluxe
Date: Thu, 7 Mar 2024 13:56:30 -0800
Subject: [PATCH 12/15] try again

---
 .github/workflows/config.py       | 17 ++++++++++++++---
 .github/workflows/job-runner.yaml |  3 ---
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/config.py b/.github/workflows/config.py
index b035d3a..3fe6fce 100644
--- a/.github/workflows/config.py
+++ b/.github/workflows/config.py
@@ -1,4 +1,15 @@
 import os
+import boto3
+
+
+# assume our designated pangeo-runner dep injected role for s3 write access
+sts_client = boto3.client('sts')
+assumed_role = sts_client.assume_role(
+    RoleArn="arn:aws:iam::444055461661:role/test-pangeo-forge-runner-s3-write-role",
+    RoleSessionName="veda-pforge-s3-dep-injection"
+)
+tmp_credentials = assumed_role['Credentials']
+
 
 def calc_task_manager_resources(task_manager_process_memory):
     """
@@ -134,9 +145,9 @@ def calc_task_manager_resources(task_manager_process_memory):
 # and our runner workflow would need to not only assume the GH actions role but then another role
 # that has permissions to s3. Doing this for now b/c we're short on time
 c.TargetStorage.fsspec_args = {
-    "key": os.environ.get("AWS_ACCESS_KEY_ID"),
-    "secret": os.environ.get("AWS_SECRET_ACCESS_KEY"),
-    "token": os.environ.get("AWS_SESSION_TOKEN"),
+    "key": tmp_credentials['AccessKeyId'],
+    "secret": tmp_credentials['SecretAccessKey'],
+    "token": tmp_credentials['SessionToken'],
     "anon": False,
     "client_kwargs": {"region_name": "us-west-2"},
 }

diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml
index 61188e7..74d74a5 100644
--- a/.github/workflows/job-runner.yaml
+++ b/.github/workflows/job-runner.yaml
@@ -136,9 +136,6 @@ jobs:
           OUTPUT_BUCKET: ${{ vars.OUTPUT_BUCKET }}
           AUTH_MODE: ${{ github.event.inputs.auth_mode }}
           RESOURCE_PROFILE: ${{ github.event.inputs.resource_profile }}
-          AWS_ACCESS_KEY_ID: ${{ env.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ env.AWS_SECRET_ACCESS_KEY }}
-          AWS_SESSION_TOKEN: ${{ env.AWS_SESSION_TOKEN }}
 
       - name: cleanup if "pangeo-forge-runner bake" failed
         if: steps.executejob.outcome == 'failure'
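One caveat with assuming the role at config-load time, as PATCH 12 does: assume_role returns static credentials, not an auto-refreshing provider, so a bake that runs longer than the session silently loses write access. A sketch of inspecting the lifetime (role ARN from the patch; the session name is arbitrary):

    import boto3

    sts = boto3.client("sts")
    resp = sts.assume_role(
        RoleArn="arn:aws:iam::444055461661:role/test-pangeo-forge-runner-s3-write-role",
        RoleSessionName="veda-pforge-s3-dep-injection",
    )
    # default DurationSeconds is 3600, i.e. one hour of validity
    print(resp["Credentials"]["Expiration"])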
From a308b658645f9357c3db1b0ca289ffb4b0e031e1 Mon Sep 17 00:00:00 2001
From: ranchodeluxe
Date: Thu, 7 Mar 2024 15:00:25 -0800
Subject: [PATCH 13/15] moar

---
 .github/workflows/config.py       | 4 ++--
 .github/workflows/job-runner.yaml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/config.py b/.github/workflows/config.py
index 3fe6fce..9c21e9b 100644
--- a/.github/workflows/config.py
+++ b/.github/workflows/config.py
@@ -1,12 +1,12 @@
 import os
 import boto3
 
-
+print(f"[ WAT??????????????? ]: FAKE{os.environ['AWS_ACCESS_KEY_ID']}FAKE")
 # assume our designated pangeo-runner dep injected role for s3 write access
 sts_client = boto3.client('sts')
 assumed_role = sts_client.assume_role(
     RoleArn="arn:aws:iam::444055461661:role/test-pangeo-forge-runner-s3-write-role",
-    RoleSessionName="veda-pforge-s3-dep-injection"
+    RoleSessionName="veda-pforge-s3-dep-injection",
 )
 tmp_credentials = assumed_role['Credentials']
 

diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml
index 74d74a5..5134e69 100644
--- a/.github/workflows/job-runner.yaml
+++ b/.github/workflows/job-runner.yaml
@@ -68,7 +68,7 @@ jobs:
         with:
           role-to-assume: arn:aws:iam::444055461661:role/github-actions-role-eodc
           role-session-name: veda-pforge-run-job
-          role-duration-seconds: 43200
+          role-duration-seconds: 3600
           aws-region: us-west-2
 
       - name: set up python 3.10
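The drop from 43200 to 3600 in PATCH 13 is consistent with an STS limit: when a role is assumed using credentials that themselves came from an assumed role (role chaining, which is what config.py now does on top of the GitHub Actions session), the chained session is capped at one hour, and asking for more is rejected. A sketch of that failure mode (same role ARN as in the patches):

    import boto3
    from botocore.exceptions import ClientError

    sts = boto3.client("sts")
    try:
        sts.assume_role(
            RoleArn="arn:aws:iam::444055461661:role/test-pangeo-forge-runner-s3-write-role",
            RoleSessionName="veda-pforge-s3-dep-injection",
            DurationSeconds=43200,  # over the 1-hour role-chaining cap
        )
    except ClientError as err:
        print(err.response["Error"]["Code"])  # e.g. ValidationError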
From 1a4eb01900a8cdfae8f33ac353a807e1728bef5d Mon Sep 17 00:00:00 2001
From: ranchodeluxe
Date: Thu, 7 Mar 2024 16:17:23 -0800
Subject: [PATCH 14/15] remove second assume role

---
 .github/workflows/config.py       | 13 -------------
 .github/workflows/job-runner.yaml |  2 +-
 2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/.github/workflows/config.py b/.github/workflows/config.py
index 9c21e9b..6c9937f 100644
--- a/.github/workflows/config.py
+++ b/.github/workflows/config.py
@@ -1,14 +1,4 @@
 import os
-import boto3
-
-print(f"[ WAT??????????????? ]: FAKE{os.environ['AWS_ACCESS_KEY_ID']}FAKE")
-# assume our designated pangeo-runner dep injected role for s3 write access
-sts_client = boto3.client('sts')
-assumed_role = sts_client.assume_role(
-    RoleArn="arn:aws:iam::444055461661:role/test-pangeo-forge-runner-s3-write-role",
-    RoleSessionName="veda-pforge-s3-dep-injection",
-)
-tmp_credentials = assumed_role['Credentials']
 
 
 def calc_task_manager_resources(task_manager_process_memory):
@@ -145,9 +135,6 @@ def calc_task_manager_resources(task_manager_process_memory):
 # and our runner workflow would need to not only assume the GH actions role but then another role
 # that has permissions to s3. Doing this for now b/c we're short on time
 c.TargetStorage.fsspec_args = {
-    "key": tmp_credentials['AccessKeyId'],
-    "secret": tmp_credentials['SecretAccessKey'],
-    "token": tmp_credentials['SessionToken'],
     "anon": False,
     "client_kwargs": {"region_name": "us-west-2"},
 }

diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml
index 5134e69..311fac3 100644
--- a/.github/workflows/job-runner.yaml
+++ b/.github/workflows/job-runner.yaml
@@ -87,7 +87,7 @@ jobs:
       - name: install deps
         run: |
           python -m pip install --upgrade pip
-          pip install boto3 pangeo-forge-runner>=0.10.0
+          pip install pangeo-forge-runner>=0.10.0
 
       - name: install kubectl

From 634b269b34f3e0908923bc7f6287be6becba6676 Mon Sep 17 00:00:00 2001
From: ranchodeluxe
Date: Thu, 7 Mar 2024 16:48:15 -0800
Subject: [PATCH 15/15] yuck fsspec

---
 .github/workflows/config.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/.github/workflows/config.py b/.github/workflows/config.py
index 6c9937f..e3b0103 100644
--- a/.github/workflows/config.py
+++ b/.github/workflows/config.py
@@ -1,6 +1,5 @@
 import os
-
 
 def calc_task_manager_resources(task_manager_process_memory):
     """
     illustration of Flink memory model:
@@ -117,23 +116,6 @@ def calc_task_manager_resources(task_manager_process_memory):
 BUCKET_PREFIX = os.environ.get("OUTPUT_BUCKET")
 c.TargetStorage.fsspec_class = "s3fs.S3FileSystem"
 c.TargetStorage.root_path = f"{BUCKET_PREFIX}/{{job_name}}/output"
-#
-# NOTE: lots of explanation needed here:
-#
-# 1) we NEED to always have `fsspec_args` set up with credentials b/c some pangeo-forge-recipes
-# transforms/subroutines don't get explicit dep injection, so fsspec credentials have to be
-# forwarded to those transforms/subroutines via these passed args
-#
-# 2) the os env vars we are sourcing here come from GH actions `aws-actions/configure-aws-credentials`.
-# the goal of that assume role is to allow GH to talk with the EKS cluster and kick off pangeo jobs
-# but this role also has s3 permissions. Because our s3 pangeo-forge output bucket lives in SMCE,
-# this role also sneakily gives us a session token to be able to write to the bucket during jobs
-# with the correct extended token duration. This is in addition to our Flink cluster having a ServiceAccount
-# tied to another IAM Role that also grants permissions to write to s3
-#
-# TODO: in the future we should probably have a dedicated role (it could be the Flink one possibly)
-# and our runner workflow would need to not only assume the GH actions role but then another role
-# that has permissions to s3. Doing this for now b/c we're short on time
 c.TargetStorage.fsspec_args = {
     "anon": False,
     "client_kwargs": {"region_name": "us-west-2"},
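With PATCH 15 the series lands on the simplest arrangement: no credentials in fsspec_args at all. s3fs then falls back to botocore's default credential chain, so whatever principal the process runs as (the GitHub Actions role on the runner, the ServiceAccount-linked IAM role inside the Flink pods) supplies S3 access implicitly. A sketch of the final state (bucket path hypothetical):

    import s3fs

    fs = s3fs.S3FileSystem(
        anon=False,
        client_kwargs={"region_name": "us-west-2"},
    )
    # fs.ls("s3://<OUTPUT_BUCKET>/")  # works wherever ambient AWS credentials exist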