diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml new file mode 100644 index 000000000..04fe7beb5 --- /dev/null +++ b/.github/actions/build-container/action.yml @@ -0,0 +1,212 @@ +name: Build container + +description: "Builds a Docker container image for JAX-based projects using NVIDIA's Mealkit and uploads it to GitHub Container Registry." + +inputs: + ARCHITECTURE: + description: 'CPU architecture to build the image for, e.g. amd64, arm64' + required: true + BASE_IMAGE: + description: 'Base docker image that provides JAX' + required: false + default: ghcr.io/nvidia/jax:mealkit + BUILD_DATE: + description: "Build date in YYYY-MM-DD format" + required: false + default: 'NOT SPECIFIED' + ARTIFACT_NAME: + description: 'Name of the artifact zip file, e.g. artifact-t5x-build' + required: true + BADGE_FILENAME: + description: 'Name of the endpoint JSON file for shields.io badge, e.g. badge-t5x-build' + required: true + CONTAINER_NAME: + description: "Container name, e.g. upstream-t5x" + required: true + DOCKERFILE: + description: "Dockerfile to use, e.g. .github/container/Dockerfile.t5x" + required: true + DOCKER_CONTEXT: + description: "Dockerfile context to build" + default: '.github/container' + required: false + RUNNER_SIZE: + description: "Size of the runner to use" + required: false + default: small + EXTRA_BUILD_ARGS: + description: "Extra build arguments to pass to the Docker build" + required: false + default: "" + ssh-private-key: + description: "SSH private key to use for building the image" + required: true + default: "" + ssh-known-hosts: + description: "SSH known hosts entries to use for building the image" + required: true + default: "" + github-token: + description: "GitHub token to use for authentication" + required: true + default: "" + bazel-remote-cache-url: + description: "URL of the Bazel remote cache to use for building the image" + required: true + default: "" + +outputs: + DOCKER_TAG_MEALKIT: + description: "Tags of the 'mealkit' image built" + value: ${{ steps.export.outputs.DOCKER_TAG_MEALKIT }} + DOCKER_TAG_FINAL: + description: "Tags of the complete image built" + value: ${{ steps.export.outputs.DOCKER_TAG_FINAL }} + +runs: + using: 'composite' + steps: + - name: Set up environment variables + shell: bash + id: set-env + run: | + echo 'UPLD_IMAGE=ghcr.io/nvidia/jax-toolbox-internal' >> $GITHUB_ENV + echo "BADGE_FILENAME_FULL=${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json" >> $GITHUB_ENV + + - name: Setup SSH + id: setup-ssh + uses: ./.github/actions/setup-ssh + with: + ssh-private-key: ${{ inputs.ssh-private-key }} + ssh-known-hosts: ${{ inputs.ssh-known-hosts }} + + - name: Login to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ inputs.github-token }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + image=moby/buildkit:v0.12.1 + + # MEALKIT BUILD + - name: Set docker metadata - mealkit + id: mealkit-metadata + uses: docker/metadata-action@v5 + with: + images: | + ${{ env.UPLD_IMAGE }} + flavor: | + latest=false + tags: | + type=raw,value=${{ github.run_id }}-${{ inputs.CONTAINER_NAME }}-${{ inputs.ARCHITECTURE }}-mealkit + labels: + org.opencontainers.image.created=${{ inputs.BUILD_DATE }} + + - name: Build mealkit image + id: mealkit-build + uses: docker/build-push-action@v5 + with: + context: ${{ inputs.DOCKER_CONTEXT }} + push: true + file: ${{ inputs.DOCKERFILE }} + platforms: linux/${{ inputs.ARCHITECTURE }} + target: mealkit + tags: ${{ steps.mealkit-metadata.outputs.tags }} + labels: ${{ steps.mealkit-metadata.outputs.labels }} + ssh: default + secret-files: | + "SSH_KNOWN_HOSTS=${{ steps.setup-ssh.outputs.known-hosts-file }}" + build-args: | + BASE_IMAGE=${{ inputs.BASE_IMAGE }} + BAZEL_CACHE=${{ inputs.bazel-remote-cache-url }} + BUILD_DATE=${{ inputs.BUILD_DATE }} + ${{ inputs.EXTRA_BUILD_ARGS }} + # FINAL IMAGE BUILD + - name: Set docker metadata - final + id: final-metadata + uses: docker/metadata-action@v5 + with: + images: | + ${{ env.UPLD_IMAGE }} + flavor: | + latest=false + tags: | + type=raw,value=${{ github.run_id }}-${{ inputs.CONTAINER_NAME }}-${{ inputs.ARCHITECTURE }} + labels: + org.opencontainers.image.created=${{ inputs.BUILD_DATE }} + + - name: Build final image + id: final-build + uses: docker/build-push-action@v5 + with: + context: ${{ inputs.DOCKER_CONTEXT }} + push: true + file: ${{ inputs.DOCKERFILE }} + platforms: linux/${{ inputs.ARCHITECTURE }} + tags: ${{ steps.final-metadata.outputs.tags }} + labels: ${{ steps.final-metadata.outputs.labels }} + target: final + ssh: default + secret-files: | + "SSH_KNOWN_HOSTS=${{ steps.setup-ssh.outputs.known-hosts-file }}" + build-args: | + BASE_IMAGE=${{ inputs.BASE_IMAGE }} + BAZEL_CACHE=${{ inputs.bazel-remote-cache-url }} + BUILD_DATE=${{ inputs.BUILD_DATE }} + ${{ inputs.EXTRA_BUILD_ARGS }} + + # SITREP GENERATION + - name: Generate sitrep + if: "!cancelled()" + shell: bash -x -e {0} + run: | + # bring in utility functions + source .github/workflows/scripts/to_json.sh + + badge_label='${{ inputs.CONTAINER_NAME }} ${{ inputs.ARCHITECTURE }} build' + tags="${{ steps.final-metadata.outputs.tags }}" + digest="${{ steps.final-build.outputs.digest }}" + outcome="${{ steps.final-build.outcome }}" + + if [[ ${outcome} == "success" ]]; then + badge_message="pass" + badge_color=brightgreen + summary="${{ inputs.CONTAINER_NAME }} build on ${{ inputs.ARCHITECTURE }}: $badge_message" + else + badge_message="fail" + badge_color=red + summary="${{ inputs.CONTAINER_NAME }} build on ${{ inputs.ARCHITECTURE }}: $badge_message" + fi + + to_json \ + summary \ + badge_label tags digest outcome \ + > sitrep.json + + schemaVersion=1 \ + label="${badge_label}" \ + message="${badge_message}" \ + color="${badge_color}" \ + to_json schemaVersion label message color \ + > ${{ env.BADGE_FILENAME_FULL }} + + - name: Upload sitrep and badge + if: "!cancelled()" + uses: actions/upload-artifact@v4 + with: + name: ${{ inputs.ARTIFACT_NAME }}-${{ inputs.ARCHITECTURE }} + path: | + sitrep.json + ${{ env.BADGE_FILENAME_FULL }} + + - name: Export outputs + id: export + shell: bash + run: | + echo "DOCKER_TAG_MEALKIT=${{ steps.mealkit-metadata.outputs.tags }}" >> "$GITHUB_OUTPUT" + echo "DOCKER_TAG_FINAL=${{ steps.final-metadata.outputs.tags }}" >> "$GITHUB_OUTPUT" diff --git a/.github/workflows/_build.yaml b/.github/workflows/_build.yaml deleted file mode 100644 index 076382f2b..000000000 --- a/.github/workflows/_build.yaml +++ /dev/null @@ -1,210 +0,0 @@ -name: ~build container - -on: - workflow_call: - inputs: - ARCHITECTURE: - type: string - description: 'CPU architecture to build the image for, e.g. amd64, arm64' - required: true - BASE_IMAGE: - type: string - description: 'Base docker image that provides JAX' - required: false - default: ghcr.io/nvidia/jax:mealkit - BUILD_DATE: - type: string - description: "Build date in YYYY-MM-DD format" - required: false - default: 'NOT SPECIFIED' - ARTIFACT_NAME: - type: string - description: 'Name of the artifact zip file, e.g. artifact-t5x-build' - required: true - BADGE_FILENAME: - type: string - description: 'Name of the endpoint JSON file for shields.io badge, e.g. badge-t5x-build' - required: true - CONTAINER_NAME: - type: string - description: "Container name, e.g. upstream-t5x" - required: true - DOCKERFILE: - type: string - description: "Dockerfile to use, e.g. .github/container/Dockerfile.t5x" - required: true - DOCKER_CONTEXT: - type: string - description: "Dockerfile context to build" - default: '.github/container' - required: false - RUNNER_SIZE: - type: string - description: "Size of the runner to use" - required: false - default: small - EXTRA_BUILD_ARGS: - type: string - description: "Extra build arguments to pass to the Docker build" - required: false - default: "" - outputs: - DOCKER_TAG_MEALKIT: - description: "Tags of the 'mealkit' image built" - value: ${{ jobs.build.outputs.DOCKER_TAG_MEALKIT }} - DOCKER_TAG_FINAL: - description: "Tags of the complete image built" - value: ${{ jobs.build.outputs.DOCKER_TAG_FINAL }} - -env: - UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container - -jobs: - build: - name: build-${{ inputs.CONTAINER_NAME }} - runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "${{ inputs.RUNNER_SIZE }}"] - env: - BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json - outputs: - DOCKER_TAG_MEALKIT: ${{ steps.mealkit-build.outcome == 'success' && steps.mealkit-metadata.outputs.tags || '' }} - DOCKER_TAG_FINAL: ${{ steps.final-build.outcome == 'success' && steps.final-metadata.outputs.tags || '' }} - steps: - - name: Print environment variables - run: env - - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - - name: Setup SSH - id: setup-ssh - uses: ./.github/actions/setup-ssh - with: - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - with: - driver-opts: | - image=moby/buildkit:v0.12.1 - - - name: Set docker metadata - mealkit - id: mealkit-metadata - uses: docker/metadata-action@v5 - with: - images: | - ${{ env.UPLD_IMAGE }} - flavor: | - latest=false - tags: | - type=raw,value=${{ github.run_id }}-${{ inputs.CONTAINER_NAME }}-${{ inputs.ARCHITECTURE }}-mealkit - labels: - org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - - name: Build mealkit image - id: mealkit-build - uses: docker/build-push-action@v5 - with: - context: ${{ inputs.DOCKER_CONTEXT }} - push: true - file: ${{ inputs.DOCKERFILE }} - platforms: linux/${{ inputs.ARCHITECTURE }} - target: mealkit - tags: ${{ steps.mealkit-metadata.outputs.tags }} - labels: ${{ steps.mealkit-metadata.outputs.labels }} - ssh: default - secret-files: | - "SSH_KNOWN_HOSTS=${{ steps.setup-ssh.outputs.known-hosts-file }}" - build-args: | - BASE_IMAGE=${{ inputs.BASE_IMAGE }} - BAZEL_CACHE=${{ vars.BAZEL_REMOTE_CACHE_URL }} - BUILD_DATE=${{ inputs.BUILD_DATE }} - ${{ inputs.EXTRA_BUILD_ARGS }} - - - name: Set docker metadata - final - id: final-metadata - uses: docker/metadata-action@v5 - with: - images: | - ${{ env.UPLD_IMAGE }} - flavor: | - latest=false - tags: | - type=raw,value=${{ github.run_id }}-${{ inputs.CONTAINER_NAME }}-${{ inputs.ARCHITECTURE }} - labels: - org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - - name: Build final image - id: final-build - uses: docker/build-push-action@v5 - with: - context: ${{ inputs.DOCKER_CONTEXT }} - push: true - file: ${{ inputs.DOCKERFILE }} - platforms: linux/${{ inputs.ARCHITECTURE }} - tags: ${{ steps.final-metadata.outputs.tags }} - labels: ${{ steps.final-metadata.outputs.labels }} - target: final - ssh: default - secret-files: | - "SSH_KNOWN_HOSTS=${{ steps.setup-ssh.outputs.known-hosts-file }}" - build-args: | - BASE_IMAGE=${{ inputs.BASE_IMAGE }} - BAZEL_CACHE=${{ vars.BAZEL_REMOTE_CACHE_URL }} - BUILD_DATE=${{ inputs.BUILD_DATE }} - ${{ inputs.EXTRA_BUILD_ARGS }} - - - name: Generate sitrep - if: "!cancelled()" - shell: bash -x -e {0} - run: | - # bring in utility functions - source .github/workflows/scripts/to_json.sh - - badge_label='${{ inputs.CONTAINER_NAME }} ${{ inputs.ARCHITECTURE }} build' - tags="${{ steps.final-metadata.outputs.tags }}" - digest="${{ steps.final-build.outputs.digest }}" - outcome="${{ steps.final-build.outcome }}" - - if [[ ${outcome} == "success" ]]; then - badge_message="pass" - badge_color=brightgreen - summary="${{ inputs.CONTAINER_NAME }} build on ${{ inputs.ARCHITECTURE }}: $badge_message" - else - badge_message="fail" - badge_color=red - summary="${{ inputs.CONTAINER_NAME }} build on ${{ inputs.ARCHITECTURE }}: $badge_message" - fi - - to_json \ - summary \ - badge_label tags digest outcome \ - > sitrep.json - - schemaVersion=1 \ - label="${badge_label}" \ - message="${badge_message}" \ - color="${badge_color}" \ - to_json schemaVersion label message color \ - > ${{ env.BADGE_FILENAME_FULL }} - - - name: Upload sitrep and badge - if: "!cancelled()" - uses: actions/upload-artifact@v4 - with: - name: ${{ inputs.ARTIFACT_NAME }}-${{ inputs.ARCHITECTURE }} - path: | - sitrep.json - ${{ env.BADGE_FILENAME_FULL }} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index b1f0307c6..abe7d7dae 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -41,6 +41,7 @@ permissions: actions: write # to cancel previous workflows packages: write # to upload container + jobs: build-base: @@ -52,71 +53,159 @@ jobs: MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }} secrets: inherit - build-jax: + test-nccl: + if: inputs.ARCHITECTURE == 'amd64' # build only amd64 needs: build-base - uses: ./.github/workflows/_build.yaml + uses: ./.github/workflows/_test_nccl.yaml with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-jax-build - BADGE_FILENAME: badge-jax-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} - CONTAINER_NAME: jax - DOCKERFILE: .github/container/Dockerfile.jax - RUNNER_SIZE: large - EXTRA_BUILD_ARGS: | - URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }} - URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }} - URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }} - URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} + CONTAINER: ${{ needs.build-base.outputs.DOCKER_TAG }} secrets: inherit + build-jax: + needs: build-base + runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Build JAX container + id: build-jax + uses: ./.github/actions/build-container + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-jax-build + BADGE_FILENAME: badge-jax-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} + CONTAINER_NAME: jax + DOCKERFILE: .github/container/Dockerfile.jax + RUNNER_SIZE: large + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + github-token: ${{ secrets.GITHUB_TOKEN }} + bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + EXTRA_BUILD_ARGS: | + URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }} + URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }} + URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }} + URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} + outputs: + DOCKER_TAG_MEALKIT: ${{ steps.build-jax.outputs.DOCKER_TAG_MEALKIT }} + DOCKER_TAG_FINAL: ${{ steps.build-jax.outputs.DOCKER_TAG_FINAL }} + build-equinox: needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-equinox-build - BADGE_FILENAME: badge-equinox-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: equinox - DOCKERFILE: .github/container/Dockerfile.equinox - EXTRA_BUILD_ARGS: | - URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - secrets: inherit + runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] + outputs: + DOCKER_TAG_MEALKIT: ${{ steps.build-equinox.outputs.DOCKER_TAG_MEALKIT }} + DOCKER_TAG_FINAL: ${{ steps.build-equinox.outputs.DOCKER_TAG_FINAL }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Build Equinox container + id: build-equinox + uses: ./.github/actions/build-container + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-equinox-build + BADGE_FILENAME: badge-equinox-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: equinox + DOCKERFILE: .github/container/Dockerfile.equinox + RUNNER_SIZE: small + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + github-token: ${{ secrets.GITHUB_TOKEN }} + bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + EXTRA_BUILD_ARGS: | + URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} build-maxtext: needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-maxtext-build - BADGE_FILENAME: badge-maxtext-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: maxtext - DOCKERFILE: .github/container/Dockerfile.maxtext - EXTRA_BUILD_ARGS: | - URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - URLREF_JETSTREAM=${{ fromJson(inputs.SOURCE_URLREFS).GOOGLE_JETSTREAM }} - secrets: inherit + runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] + outputs: + DOCKER_TAG_MEALKIT: ${{ steps.build-maxtext.outputs.DOCKER_TAG_MEALKIT }} + DOCKER_TAG_FINAL: ${{ steps.build-maxtext.outputs.DOCKER_TAG_FINAL }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Build MaxText container + id: build-maxtext + uses: ./.github/actions/build-container + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-maxtext-build + BADGE_FILENAME: badge-maxtext-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: maxtext + DOCKERFILE: .github/container/Dockerfile.maxtext + RUNNER_SIZE: small + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + github-token: ${{ secrets.GITHUB_TOKEN }} + bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + EXTRA_BUILD_ARGS: | + URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + URLREF_JETSTREAM=${{ fromJson(inputs.SOURCE_URLREFS).GOOGLE_JETSTREAM }} build-upstream-t5x: needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-t5x-build" - BADGE_FILENAME: "badge-t5x-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-t5x - DOCKERFILE: .github/container/Dockerfile.t5x - EXTRA_BUILD_ARGS: | - URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - secrets: inherit + runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] + outputs: + DOCKER_TAG_MEALKIT: ${{ steps.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + DOCKER_TAG_FINAL: ${{ steps.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + steps: + - id: checkout + name: Checkout repository + uses: actions/checkout@v4 + - name: Build Upstream T5X container + id: build-upstream-t5x + uses: ./.github/actions/build-container + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-t5x-build + BADGE_FILENAME: badge-t5x-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: upstream-t5x + DOCKERFILE: .github/container/Dockerfile.t5x + RUNNER_SIZE: small + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + github-token: ${{ secrets.GITHUB_TOKEN }} + bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + EXTRA_BUILD_ARGS: | + URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + + build-axlearn: + needs: build-jax + runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"] + outputs: + DOCKER_TAG_MEALKIT: ${{ steps.build-axlearn.outputs.DOCKER_TAG_MEALKIT }} + DOCKER_TAG_FINAL: ${{ steps.build-axlearn.outputs.DOCKER_TAG_FINAL }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Build AxLearn container + id: build-axlearn + uses: ./.github/actions/build-container + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-axlearn-build + BADGE_FILENAME: badge-axlearn-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: axlearn + DOCKERFILE: .github/container/Dockerfile.axlearn + RUNNER_SIZE: large + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + github-token: ${{ secrets.GITHUB_TOKEN }} + bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + EXTRA_BUILD_ARGS: | + URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }} build-rosetta-t5x: needs: build-upstream-t5x @@ -128,22 +217,6 @@ jobs: BASE_LIBRARY: t5x secrets: inherit - build-axlearn: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-axlearn-build - BADGE_FILENAME: badge-axlearn-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: axlearn - DOCKERFILE: .github/container/Dockerfile.axlearn - RUNNER_SIZE: large - EXTRA_BUILD_ARGS: | - URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }} - secrets: inherit - collect-docker-tags: runs-on: ubuntu-22.04 if: ${{ !cancelled() }} @@ -153,8 +226,8 @@ jobs: - build-equinox - build-maxtext - build-upstream-t5x - - build-rosetta-t5x - build-axlearn + - build-rosetta-t5x outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} steps: @@ -450,30 +523,6 @@ jobs: job-config-file: .github/eks-workflow-files/post-process-job.yml job-name: ${{ env.POSTPROCESS_JOB_NAME }} - # test-equinox: - # needs: build-equinox - # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_unit.yaml - # with: - # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - # TEST_NAME: equinox - # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - # bash -exc -o pipefail \ - # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log - # STATISTICS_SCRIPT: | - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-equinox.log - # secrets: inherit - test-te-h100: needs: build-jax if: >- @@ -721,7 +770,7 @@ jobs: .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml - - name: Submit & delete axlearn test + - name: Submit & delete axlearn fuji model test uses: ./.github/actions/submit-delete-k8s-job with: job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml" diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index a274963fc..b0436b562 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -3,12 +3,9 @@ name: ~run NCCL tests on: workflow_call: inputs: - # Note that cuda-dl-base installs the NCCL tests, while the vanilla nvidia/cuda - # images do not; when JAX-Toolbox moves to using cuda-dl-base this workflow ought - # to be modified to test one of the JAX-Toolbox containers. CONTAINER: type: string - description: CUDA image to use as base, e.g. nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04 + description: Container to test; assumed to already contain {all_gather,all_reduce,broadcast,reduce_scatter}_perf_mpi in $PATH required: true permissions: @@ -18,17 +15,31 @@ permissions: jobs: build-mpi-operator-compatible-base: - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: amd64 - ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build - BADGE_FILENAME: badge-mpi-operator-compatible-base-build - BUILD_DATE: 0000-00-00 # not important; this image is never published - BASE_IMAGE: ${{ inputs.CONTAINER }} - CONTAINER_NAME: mpi-operator-compatible-base - DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base - RUNNER_SIZE: small - secrets: inherit + runs-on: [self-hosted, "amd64", "large"] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Build MPI operator compatible base container + id: build + uses: ./.github/actions/build-container + with: + ARCHITECTURE: amd64 + ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build + BADGE_FILENAME: badge-mpi-operator-compatible-base-build + BUILD_DATE: 0000-00-00 # not important; this image is never published + BASE_IMAGE: ${{ inputs.CONTAINER }} + CONTAINER_NAME: mpi-operator-compatible-base + DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base + RUNNER_SIZE: small + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + github-token: ${{ secrets.GITHUB_TOKEN }} + bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + outputs: + DOCKER_TAG_MEALKIT: ${{ steps.build.outputs.DOCKER_TAG_MEALKIT }} + DOCKER_TAG_FINAL: ${{ steps.build.outputs.DOCKER_TAG_FINAL }} + + nccl-test: needs: build-mpi-operator-compatible-base strategy: @@ -39,7 +50,7 @@ jobs: BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }} TEST_NAME: ${{ matrix.test }} steps: - - name: Check out the repository + - name: Checkout repository uses: actions/checkout@v4 - name: Login to GitHub Container Registry uses: docker/login-action@v3 diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index be3187f7e..1a6f53ec4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -31,7 +31,7 @@ on: required: false CUDA_IMAGE: type: string - description: CUDA image to use as base, e.g. nvcr.io/nvidia/cuda-dl-base:25.01-cuda12.8-devel-ubuntu24.04 + description: CUDA image, e.g. nvcr.io/nvidia/cuda-dl-base:25.01-cuda12.8-devel-ubuntu24.04 or latest. default: 'latest' required: false SOURCE_OVERRIDES: @@ -133,7 +133,7 @@ jobs: exit 1 fi - - name: Determine CUDA image to use + - name: Determine CUDA image tag to use id: cuda-image shell: bash -x -e {0} run: | diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml index c58395292..bcfd6de76 100644 --- a/.github/workflows/nccl-k8s.yaml +++ b/.github/workflows/nccl-k8s.yaml @@ -1,26 +1,12 @@ name: NCCL on Kubernetes on: - schedule: - - cron: '30 8 * * *' - pull_request: - types: - - opened - - reopened - - ready_for_review - - synchronize - paths-ignore: - - '**.md' - - '.github/triage/**' workflow_dispatch: inputs: - # Note that cuda-dl-base installs the NCCL tests, while the vanilla nvidia/cuda - # images do not; when JAX-Toolbox moves to using cuda-dl-base this workflow ought - # to be modified to test one of the JAX-Toolbox containers. CONTAINER: type: string - description: Container to test, this is assumed to already contain the NCCL tests e.g. cuda-dl-base or derived - default: '' - required: false + default: ghcr.io/nvidia/jax:base + description: Container to test, this is assumed to already contain the NCCL tests + required: true concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -35,5 +21,5 @@ jobs: nccl-tests: uses: ./.github/workflows/_test_nccl.yaml with: - CONTAINER: ${{ inputs.CONTAINER || 'nvcr.io/nvidia/cuda-dl-base:25.02-cuda12.8-devel-ubuntu24.04' }} + CONTAINER: ${{ inputs.CONTAINER }} secrets: inherit