From 720b18e74e07d4026bbe908b7a3a399797aa51c3 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Thu, 9 Jan 2025 13:56:28 -0800 Subject: [PATCH 1/7] Remove test runs on V100 --- .github/workflows/_test_rosetta.yaml | 97 ---------------------------- .github/workflows/_test_unit.yaml | 2 +- README.md | 25 ------- 3 files changed, 1 insertion(+), 123 deletions(-) delete mode 100644 .github/workflows/_test_rosetta.yaml diff --git a/.github/workflows/_test_rosetta.yaml b/.github/workflows/_test_rosetta.yaml deleted file mode 100644 index 017662ea3..000000000 --- a/.github/workflows/_test_rosetta.yaml +++ /dev/null @@ -1,97 +0,0 @@ -name: ~test Rosetta - -on: - workflow_call: - inputs: - ROSETTA_IMAGE: - type: string - description: 'Rosetta image build by NVIDIA/JAX-Toolbox' - required: true - default: 'ghcr.io/nvidia/t5x:latest' - outputs: - TEST_ARTIFACT_NAME: - description: 'Name of the unit test artifact for downstream workflows' - value: ${{ jobs.rosetta-unit-tests.outputs.TEST_ARTIFACT_NAME }} - TEST_STATUS: - description: 'Summary pass/fail value indicating if results from tests are acceptable' - value: ${{ jobs.publish-test.outputs.STATUS }} - -env: - TEST_ARTIFACT_NAME: rosetta-test-logs - TEST_LOG_LOCAL_PATH: /log/unit-report.jsonl - -jobs: - rosetta-unit-tests: - runs-on: [self-hosted, V100] - outputs: - TEST_ARTIFACT_NAME: ${{ env.TEST_ARTIFACT_NAME }} - steps: - - name: Print environment variables - run: | - env - - - name: Print GPU information - run: nvidia-smi - - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Pull Rosetta image - shell: bash -x -e {0} - run: | - docker pull ${{ inputs.ROSETTA_IMAGE }} - docker tag ${{ inputs.ROSETTA_IMAGE }} rosetta:latest - - - name: Run Rosetta tests w/ docker - shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log rosetta:latest bash -x -e /cmd.sh - run: | - ROSETTA_PATH=$(dirname $(python -c "import rosetta; print(*rosetta.__path__)")) - pip install "${ROSETTA_PATH}[test]" pytest-reportlog - pytest --report-log=${{ env.TEST_LOG_LOCAL_PATH }} ${ROSETTA_PATH} || true - - - name: Upload unit test json logs - uses: actions/upload-artifact@v4 - with: - name: ${{ env.TEST_ARTIFACT_NAME }} - path: ${{ env.TEST_LOG_LOCAL_PATH }} - - publish-test: - needs: rosetta-unit-tests - uses: ./.github/workflows/_publish_badge.yaml - if: ( always() ) - secrets: inherit - with: - ENDPOINT_FILENAME: 'rosetta-unit-test-status.json' - PUBLISH: false - SCRIPT: | - ARTIFACTS="${{ needs.rosetta-unit-tests.outputs.TEST_ARTIFACT_NAME }}/*.jsonl" - all_outcomes() { - cat $ARTIFACTS | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome' - } - cnt_type() { - cat $ARTIFACTS | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l - } - SKIPPED_TESTS=$(cnt_type skipped) - FAILED_TESTS=$(cnt_type failed) - PASSED_TESTS=$(cnt_type passed) - TOTAL_TESTS=$(all_outcomes | wc -l) - echo "## Unit/Integration test breakdown" | tee -a $GITHUB_STEP_SUMMARY - all_outcomes | sort | uniq -c | tee -a $GITHUB_STEP_SUMMARY - if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]]; then - BADGE_COLOR=brightgreen - echo "STATUS=success" >> $GITHUB_OUTPUT - else - echo "STATUS=failure" >> $GITHUB_OUTPUT - if [[ $PASSED_TESTS -eq 0 ]]; then - BADGE_COLOR=red - else - BADGE_COLOR=yellow - fi - fi - echo "LABEL='V100 Unit'" >> $GITHUB_OUTPUT - echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT - echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT diff --git a/.github/workflows/_test_unit.yaml b/.github/workflows/_test_unit.yaml index fa29557e0..d820eb348 100644 --- a/.github/workflows/_test_unit.yaml +++ b/.github/workflows/_test_unit.yaml @@ -37,7 +37,7 @@ jobs: strategy: fail-fast: false matrix: - GPU_ARCH: [V100, A100] + GPU_ARCH: [A100] include: - EXTRA_LABEL: "self-hosted" # ensures A100 job lands on dedicated runner for this particular job diff --git a/README.md b/README.md index 4438f7efc..491ac32df 100644 --- a/README.md +++ b/README.md @@ -71,18 +71,11 @@ We support and test the following JAX frameworks and model architectures. More d - - -

- - - -
@@ -91,10 +84,6 @@ We support and test the following JAX frameworks and model architectures. More d
- - - -
@@ -119,10 +108,6 @@ We support and test the following JAX frameworks and model architectures. More d - - - -
@@ -148,8 +133,6 @@ We support and test the following JAX frameworks and model architectures. More d [tests disabled] - @@ -168,10 +151,6 @@ We support and test the following JAX frameworks and model architectures. More d - - - -
@@ -313,10 +292,6 @@ We support and test the following JAX frameworks and model architectures. More d - - - -
From 5685614312b6576c4cdf0006e081e1abc68c53ba Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Thu, 9 Jan 2025 14:47:52 -0800 Subject: [PATCH 2/7] Unit tests run only on A100 --- .github/workflows/_test_unit.yaml | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/.github/workflows/_test_unit.yaml b/.github/workflows/_test_unit.yaml index d820eb348..0fa15bbc3 100644 --- a/.github/workflows/_test_unit.yaml +++ b/.github/workflows/_test_unit.yaml @@ -34,23 +34,14 @@ jobs: secrets: inherit run-unit-test: - strategy: - fail-fast: false - matrix: - GPU_ARCH: [A100] - include: - - EXTRA_LABEL: "self-hosted" - # ensures A100 job lands on dedicated runner for this particular job - - GPU_ARCH: A100 - EXTRA_LABEL: "${{ github.run_id }}" - name: ${{ inputs.TEST_NAME }}-${{ matrix.GPU_ARCH }}-unit-test + name: ${{ inputs.TEST_NAME }}-A100-unit-test runs-on: - self-hosted - - "${{ matrix.GPU_ARCH }}" - - "${{ matrix.EXTRA_LABEL }}" + - A100 + - "${{ github.run_id }}" env: - ARTIFACT_NAME_FULL: ${{ inputs.TEST_NAME }}-unit-test-${{ matrix.GPU_ARCH }} - BADGE_FILENAME_FULL: badge-${{ inputs.TEST_NAME }}-unit-test-${{ matrix.GPU_ARCH }}.json + ARTIFACT_NAME_FULL: ${{ inputs.TEST_NAME }}-unit-test-A100 + BADGE_FILENAME_FULL: badge-${{ inputs.TEST_NAME }}-unit-test-A100.json steps: - name: Print environment variables run: env @@ -94,7 +85,7 @@ jobs: if [[ ${errors} > 0 ]] || [[ ${total_tests} == 0 ]]; then echo "badge_color=red" >> $GITHUB_OUTPUT echo "badge_message=error" >> $GITHUB_OUTPUT - echo "summary=${{ inputs.TEST_NAME }} unit test on ${{ matrix.GPU_ARCH }} did not complete due to errors." >> $GITHUB_OUTPUT + echo "summary=${{ inputs.TEST_NAME }} unit test on A100 did not complete due to errors." >> $GITHUB_OUTPUT exit 1 else if [[ ${failed_tests} == 0 ]]; then @@ -103,7 +94,7 @@ jobs: echo "badge_color=yellow" >> $GITHUB_OUTPUT fi echo "badge_message=${passed_tests}/${total_tests} passed" >> $GITHUB_OUTPUT - echo "summary=${{ inputs.TEST_NAME }} unit test on ${{ matrix.GPU_ARCH }}: ${total_tests} total tests, ${errors} errors, ${passed_tests} passed, ${failed_tests} failed." >> $GITHUB_OUTPUT + echo "summary=${{ inputs.TEST_NAME }} unit test on A100: ${total_tests} total tests, ${errors} errors, ${passed_tests} passed, ${failed_tests} failed." >> $GITHUB_OUTPUT fi - name: Generate sitrep @@ -114,7 +105,7 @@ jobs: # bring in utility functions source .github/workflows/scripts/to_json.sh - badge_label='${{ inputs.TEST_NAME }} ${{ matrix.GPU_ARCH }} Unit' + badge_label='${{ inputs.TEST_NAME }} A100 Unit' total_tests=${{ steps.test-stats.outputs.TOTAL_TESTS }} \ errors=${{ steps.test-stats.outputs.ERRORS }} \ From 318786ddc0d6c741c5362ce04e4112f0dfc31fb3 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Fri, 10 Jan 2025 13:57:05 -0800 Subject: [PATCH 3/7] Bring _test_rosetta workflow back --- .github/workflows/_test_rosetta.yaml | 97 ++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 .github/workflows/_test_rosetta.yaml diff --git a/.github/workflows/_test_rosetta.yaml b/.github/workflows/_test_rosetta.yaml new file mode 100644 index 000000000..06c5365bb --- /dev/null +++ b/.github/workflows/_test_rosetta.yaml @@ -0,0 +1,97 @@ +name: ~test Rosetta + +on: + workflow_call: + inputs: + ROSETTA_IMAGE: + type: string + description: 'Rosetta image build by NVIDIA/JAX-Toolbox' + required: true + default: 'ghcr.io/nvidia/t5x:latest' + outputs: + TEST_ARTIFACT_NAME: + description: 'Name of the unit test artifact for downstream workflows' + value: ${{ jobs.rosetta-unit-tests.outputs.TEST_ARTIFACT_NAME }} + TEST_STATUS: + description: 'Summary pass/fail value indicating if results from tests are acceptable' + value: ${{ jobs.publish-test.outputs.STATUS }} + +env: + TEST_ARTIFACT_NAME: rosetta-test-logs + TEST_LOG_LOCAL_PATH: /log/unit-report.jsonl + +jobs: + rosetta-unit-tests: + runs-on: [self-hosted, V100] + outputs: + TEST_ARTIFACT_NAME: ${{ env.TEST_ARTIFACT_NAME }} + steps: + - name: Print environment variables + run: | + env + + - name: Print GPU information + run: nvidia-smi + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Pull Rosetta image + shell: bash -x -e {0} + run: | + docker pull ${{ inputs.ROSETTA_IMAGE }} + docker tag ${{ inputs.ROSETTA_IMAGE }} rosetta:latest + + - name: Run Rosetta tests w/ docker + shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log rosetta:latest bash -x -e /cmd.sh + run: | + ROSETTA_PATH=$(dirname $(python -c "import rosetta; print(*rosetta.__path__)")) + pip install "${ROSETTA_PATH}[test]" pytest-reportlog + pytest --report-log=${{ env.TEST_LOG_LOCAL_PATH }} ${ROSETTA_PATH} || true + + - name: Upload unit test json logs + uses: actions/upload-artifact@v4 + with: + name: ${{ env.TEST_ARTIFACT_NAME }} + path: ${{ env.TEST_LOG_LOCAL_PATH }} + + publish-test: + needs: rosetta-unit-tests + uses: ./.github/workflows/_publish_badge.yaml + if: ( always() ) + secrets: inherit + with: + ENDPOINT_FILENAME: 'rosetta-unit-test-status.json' + PUBLISH: false + SCRIPT: | + ARTIFACTS="${{ needs.rosetta-unit-tests.outputs.TEST_ARTIFACT_NAME }}/*.jsonl" + all_outcomes() { + cat $ARTIFACTS | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome' + } + cnt_type() { + cat $ARTIFACTS | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l + } + SKIPPED_TESTS=$(cnt_type skipped) + FAILED_TESTS=$(cnt_type failed) + PASSED_TESTS=$(cnt_type passed) + TOTAL_TESTS=$(all_outcomes | wc -l) + echo "## Unit/Integration test breakdown" | tee -a $GITHUB_STEP_SUMMARY + all_outcomes | sort | uniq -c | tee -a $GITHUB_STEP_SUMMARY + if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]]; then + BADGE_COLOR=brightgreen + echo "STATUS=success" >> $GITHUB_OUTPUT + else + echo "STATUS=failure" >> $GITHUB_OUTPUT + if [[ $PASSED_TESTS -eq 0 ]]; then + BADGE_COLOR=red + else + BADGE_COLOR=yellow + fi + fi + echo "LABEL='V100 Unit'" >> $GITHUB_OUTPUT + echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT + echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT \ No newline at end of file From 05073cd7eb1d8413155c4fc0b233b52301c1e38f Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Fri, 10 Jan 2025 13:57:28 -0800 Subject: [PATCH 4/7] Bring _test_rosetta workflow back --- .github/workflows/_test_rosetta.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_test_rosetta.yaml b/.github/workflows/_test_rosetta.yaml index 06c5365bb..05f7d3867 100644 --- a/.github/workflows/_test_rosetta.yaml +++ b/.github/workflows/_test_rosetta.yaml @@ -94,4 +94,5 @@ jobs: fi echo "LABEL='V100 Unit'" >> $GITHUB_OUTPUT echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT - echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT \ No newline at end of file + echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT + \ No newline at end of file From 4151c27844be33aaab03c57a9e44c60241cdbc92 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Fri, 10 Jan 2025 13:57:42 -0800 Subject: [PATCH 5/7] Bring _test_rosetta workflow back --- .github/workflows/_test_rosetta.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/_test_rosetta.yaml b/.github/workflows/_test_rosetta.yaml index 05f7d3867..06c5365bb 100644 --- a/.github/workflows/_test_rosetta.yaml +++ b/.github/workflows/_test_rosetta.yaml @@ -94,5 +94,4 @@ jobs: fi echo "LABEL='V100 Unit'" >> $GITHUB_OUTPUT echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT - echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT - \ No newline at end of file + echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT \ No newline at end of file From 6d8a6a5d4c40e6da7a65e5150e8df9090e0c9886 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Fri, 10 Jan 2025 14:29:57 -0800 Subject: [PATCH 6/7] _test_rosetta workflow to run on A100 --- .github/workflows/_test_rosetta.yaml | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_test_rosetta.yaml b/.github/workflows/_test_rosetta.yaml index 06c5365bb..172d0cc02 100644 --- a/.github/workflows/_test_rosetta.yaml +++ b/.github/workflows/_test_rosetta.yaml @@ -8,6 +8,10 @@ on: description: 'Rosetta image build by NVIDIA/JAX-Toolbox' required: true default: 'ghcr.io/nvidia/t5x:latest' + TIMEOUT_MINUTES: + type: number + description: 'Maximum test runtime, in minutes' + default: 60 outputs: TEST_ARTIFACT_NAME: description: 'Name of the unit test artifact for downstream workflows' @@ -21,8 +25,19 @@ env: TEST_LOG_LOCAL_PATH: /log/unit-report.jsonl jobs: + runner: + uses: ./.github/workflows/_runner_ondemand_slurm.yaml + with: + NAME: "A100" + LABELS: "A100,${{ github.run_id }}" + TIME: "${{ inputs.TIMEOUT_MINUTES }}:00" + secrets: inherit + rosetta-unit-tests: - runs-on: [self-hosted, V100] + runs-on: + - self-hosted + - A100 + - "${{ github.run_id }}" outputs: TEST_ARTIFACT_NAME: ${{ env.TEST_ARTIFACT_NAME }} steps: @@ -92,6 +107,6 @@ jobs: BADGE_COLOR=yellow fi fi - echo "LABEL='V100 Unit'" >> $GITHUB_OUTPUT + echo "LABEL='A100 Unit'" >> $GITHUB_OUTPUT echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT \ No newline at end of file From 9a350b6d21e591816b6a629a11aa10f29a95cc2d Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Mon, 13 Jan 2025 11:45:06 -0800 Subject: [PATCH 7/7] Vemove V100 from _test_unit workflow, remaining the matrix structure for future HW --- .github/workflows/_test_unit.yaml | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/.github/workflows/_test_unit.yaml b/.github/workflows/_test_unit.yaml index 0fa15bbc3..d820eb348 100644 --- a/.github/workflows/_test_unit.yaml +++ b/.github/workflows/_test_unit.yaml @@ -34,14 +34,23 @@ jobs: secrets: inherit run-unit-test: - name: ${{ inputs.TEST_NAME }}-A100-unit-test + strategy: + fail-fast: false + matrix: + GPU_ARCH: [A100] + include: + - EXTRA_LABEL: "self-hosted" + # ensures A100 job lands on dedicated runner for this particular job + - GPU_ARCH: A100 + EXTRA_LABEL: "${{ github.run_id }}" + name: ${{ inputs.TEST_NAME }}-${{ matrix.GPU_ARCH }}-unit-test runs-on: - self-hosted - - A100 - - "${{ github.run_id }}" + - "${{ matrix.GPU_ARCH }}" + - "${{ matrix.EXTRA_LABEL }}" env: - ARTIFACT_NAME_FULL: ${{ inputs.TEST_NAME }}-unit-test-A100 - BADGE_FILENAME_FULL: badge-${{ inputs.TEST_NAME }}-unit-test-A100.json + ARTIFACT_NAME_FULL: ${{ inputs.TEST_NAME }}-unit-test-${{ matrix.GPU_ARCH }} + BADGE_FILENAME_FULL: badge-${{ inputs.TEST_NAME }}-unit-test-${{ matrix.GPU_ARCH }}.json steps: - name: Print environment variables run: env @@ -85,7 +94,7 @@ jobs: if [[ ${errors} > 0 ]] || [[ ${total_tests} == 0 ]]; then echo "badge_color=red" >> $GITHUB_OUTPUT echo "badge_message=error" >> $GITHUB_OUTPUT - echo "summary=${{ inputs.TEST_NAME }} unit test on A100 did not complete due to errors." >> $GITHUB_OUTPUT + echo "summary=${{ inputs.TEST_NAME }} unit test on ${{ matrix.GPU_ARCH }} did not complete due to errors." >> $GITHUB_OUTPUT exit 1 else if [[ ${failed_tests} == 0 ]]; then @@ -94,7 +103,7 @@ jobs: echo "badge_color=yellow" >> $GITHUB_OUTPUT fi echo "badge_message=${passed_tests}/${total_tests} passed" >> $GITHUB_OUTPUT - echo "summary=${{ inputs.TEST_NAME }} unit test on A100: ${total_tests} total tests, ${errors} errors, ${passed_tests} passed, ${failed_tests} failed." >> $GITHUB_OUTPUT + echo "summary=${{ inputs.TEST_NAME }} unit test on ${{ matrix.GPU_ARCH }}: ${total_tests} total tests, ${errors} errors, ${passed_tests} passed, ${failed_tests} failed." >> $GITHUB_OUTPUT fi - name: Generate sitrep @@ -105,7 +114,7 @@ jobs: # bring in utility functions source .github/workflows/scripts/to_json.sh - badge_label='${{ inputs.TEST_NAME }} A100 Unit' + badge_label='${{ inputs.TEST_NAME }} ${{ matrix.GPU_ARCH }} Unit' total_tests=${{ steps.test-stats.outputs.TOTAL_TESTS }} \ errors=${{ steps.test-stats.outputs.ERRORS }} \