From eb5e76de0f5e69666c11a64df7a59242601a3d1f Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 26 Mar 2024 16:59:59 -0700 Subject: [PATCH] ingest-to-phylogenetic: Use cache to check new data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Uses GitHub Actions cache to store a file that contains the `Metadata.sha256sum` of the ingest files on S3 and use the `hashFiles` function to create a unique cache key. Then the existence of the cache key is an indicator that the ingest file contents have not been updated since a previous run on GH Actions. This does come with a big caveat that GH will remove any cache entries that have not been accessed in over 7 days.¹ If the workflow is not being automatically run within 7 days, then it will always run the phylogenetic job. If this works well, then we may want to consider moving this within the `pathogen-repo-build` reusable workflow to have the same functionality across pathogen automation workflows. ¹ https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#usage-limits-and-eviction-policy --- .github/workflows/ingest-to-phylogenetic.yaml | 44 +++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ingest-to-phylogenetic.yaml b/.github/workflows/ingest-to-phylogenetic.yaml index 03d65b3..980a399 100644 --- a/.github/workflows/ingest-to-phylogenetic.yaml +++ b/.github/workflows/ingest-to-phylogenetic.yaml @@ -39,11 +39,49 @@ jobs: ingest/logs/ ingest/.snakemake/log/ - # TKTK check if ingest results include new data - # potentially use actions/cache to store Metadata.sha256sum of S3 files + # Check if ingest results include new data by checking for the cache + # of the file with the results' Metadata.sha256sum (which should have been added within upload-to-s3) + # GitHub will remove any cache entries that have not been accessed in over 7 days, + # so if the workflow has not been run in over 7 days, then it 
will trigger phylogenetic. + check-new-data: + needs: [ingest] + runs-on: ubuntu-latest + outputs: + cache-hit: ${{ steps.check-cache.outputs.cache-hit }} + steps: + - name: Get sha256sum + id: get-sha256sum + run: | + s3_urls=( + "s3://nextstrain-data/files/workflows/zika/metadata.tsv.zst" + "s3://nextstrain-data/files/workflows/zika/sequences.fasta.zst" + ) + + # Code below is modified from ingest/upload-to-s3 + # https://github.com/nextstrain/ingest/blob/c0b4c6bb5e6ccbba86374d2c09b42077768aac23/upload-to-s3#L23-L29 + + no_hash=0000000000000000000000000000000000000000000000000000000000000000 + + for s3_url in "${s3_urls[@]}"; do + s3path="${s3_url#s3://}" + bucket="${s3path%%/*}" + key="${s3path#*/}" + + s3_hash="$(aws s3api head-object --no-sign-request --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text 2>/dev/null || echo "$no_hash")" + echo "${s3_hash}" >> ingest-output-sha256sum + done + + - name: Check cache + id: check-cache + uses: actions/cache@v4 + with: + path: ingest-output-sha256sum + key: ingest-output-sha256sum-${{ hashFiles('ingest-output-sha256sum') }} + lookup-only: true phylogenetic: - needs: [ingest] + needs: [check-new-data] + if: ${{ needs.check-new-data.outputs.cache-hit != 'true' }} permissions: id-token: write uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master