diff --git a/.github/workflows/config.py b/.github/workflows/config.py index ce5584d..d796cc6 100644 --- a/.github/workflows/config.py +++ b/.github/workflows/config.py @@ -76,10 +76,10 @@ def calc_task_manager_resources(task_manager_process_memory): resource_profile_choice = os.environ.get("RESOURCE_PROFILE") task_manager_process_memory_map = { - "small": 7168, - "medium": 10240, - "large": 15360, - "xlarge": 20480, + "small": 7824, + "medium": 9824, + "large": 11824, + "xlarge": 13824, } if resource_profile_choice not in list(task_manager_process_memory_map.keys()): raise ValueError( @@ -98,9 +98,9 @@ def calc_task_manager_resources(task_manager_process_memory): c.Bake.feedstock_subdir = os.environ.get("FEEDSTOCK_SUBDIR") c.FlinkOperatorBakery.parallelism = int(os.environ.get("PARALLELISM_OPTION")) -c.FlinkOperatorBakery.enable_job_archiving = True +c.FlinkOperatorBakery.enable_job_archiving = False c.FlinkOperatorBakery.flink_version = "1.16" -c.FlinkOperatorBakery.job_manager_resources = {"memory": "1536m", "cpu": 0.3} +c.FlinkOperatorBakery.job_manager_resources = {"memory": "1280m", "cpu": 0.3} c.FlinkOperatorBakery.task_manager_resources = { "memory": f"{task_manager_process_memory_map[resource_profile_choice]}m", "cpu": 0.3 @@ -121,6 +121,6 @@ def calc_task_manager_resources(task_manager_process_memory): "client_kwargs": {"region_name": "us-west-2"}, } -c.InputCacheStorage.fsspec_class = c.TargetStorage.fsspec_class -c.InputCacheStorage.fsspec_args = c.TargetStorage.fsspec_args -c.InputCacheStorage.root_path = f"{BUCKET_PREFIX}/cache/" +# c.InputCacheStorage.fsspec_class = c.TargetStorage.fsspec_class +# c.InputCacheStorage.fsspec_args = c.TargetStorage.fsspec_args +# c.InputCacheStorage.root_path = f"{BUCKET_PREFIX}/cache/" diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml index 07eb5d2..c4f8f29 100644 --- a/.github/workflows/job-runner.yaml +++ b/.github/workflows/job-runner.yaml @@ -26,8 +26,11 @@ on: description: 'What 
auth mode (edl or iamrole) to use when accessing files.' required: false default: 'iamrole' + job_name: + description: 'A unique job name (no other existing flink deployment can have it) so we can inspect metrics easier in Grafana.' + required: true resource_profile: - description: 'jobs have different memory requirements so choose (small[7168M], medium[10240M], large[15360M], xlarge[20480M])' + description: 'jobs have different memory requirements so choose (small[7824_MiB], medium[9824_MiB], large[11824_MiB], xlarge[13824_MiB])' required: false default: 'small' @@ -61,10 +64,11 @@ jobs: uses: actions/checkout@v3 - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v2 + uses: aws-actions/configure-aws-credentials@v3 with: role-to-assume: arn:aws:iam::444055461661:role/github-actions-role-eodc role-session-name: veda-pforge-run-job + role-duration-seconds: 3600 aws-region: us-west-2 - name: set up python 3.10 @@ -106,6 +110,7 @@ jobs: bake \ --repo=${{ github.event.inputs.repo }} \ --ref=${{ github.event.inputs.ref }} \ + --Bake.job_name="${{ github.event.inputs.job_name }}" \ -f .github/workflows/config.py # we just use the `inspect` branch to dump runner errors and fail fast @@ -133,6 +138,7 @@ jobs: PARALLELISM_OPTION: ${{ github.event.inputs.parallelism }} OUTPUT_BUCKET: ${{ vars.OUTPUT_BUCKET }} AUTH_MODE: ${{ github.event.inputs.auth_mode }} + AWS_ROLE_ARN: ${{ vars.AWS_ROLE_ARN }} RESOURCE_PROFILE: ${{ github.event.inputs.resource_profile }} - name: cleanup if "pangeo-forge-runner bake" failed @@ -195,13 +201,15 @@ jobs: monitor-job: runs-on: ubuntu-latest name: monitor job ${{ needs.name-job.outputs.repo_name }}@${{ github.event.inputs.ref }} + environment: veda-smce needs: [name-job, run-job] steps: - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v2 + uses: aws-actions/configure-aws-credentials@v3 with: role-to-assume: arn:aws:iam::444055461661:role/github-actions-role-eodc role-session-name: 
veda-pforge-monitor-job + role-duration-seconds: 43200 # note this has to match our timeout-minutes below for monitoring aws-region: us-west-2 - name: install kubectl @@ -219,7 +227,7 @@ jobs: # - name: monitor logs of job manager and report final status id: monitorjob - timeout-minutes: 240 + timeout-minutes: 720 continue-on-error: true run: | # TODO: this needs to not check the logs but the historyserver status