diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs
index 521cfa811..3ca460cd2 100755
--- a/.ci/scripts/run-docs
+++ b/.ci/scripts/run-docs
@@ -129,7 +129,8 @@ fi
 
 if [ "$1" == "distributed" ]; then
   echo "::group::Create script to run distributed"
-  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md > ./run-distributed.sh
+  # Swap llama3.1 for the small stories110M model (and -l 3 for -l 2) so the CI run stays lightweight
+  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --replace 'llama3.1:stories110M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
   # for good measure, if something happened to updown processor,
   # and it did not error out, fail with an exit 1
   echo "exit 1" >> ./run-distributed.sh
diff --git a/.github/workflows/run-readme-pr.yml b/.github/workflows/run-readme-pr.yml
index f32473435..37c27822b 100644
--- a/.github/workflows/run-readme-pr.yml
+++ b/.github/workflows/run-readme-pr.yml
@@ -306,3 +306,25 @@ jobs:
         echo "::endgroup::"
 
         TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs native
+
+  test-distributed-cuda:
+    permissions:
+      id-token: write
+      contents: read
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.4"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        .ci/scripts/run-docs distributed
+
+        echo "::group::Completion"
+        echo "tests complete"
+        echo "*******************************************"
+        echo "::endgroup::"
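
Note on the updown.py flags used above: --suppress drops the huggingface-cli and HF_TOKEN steps from the generated script, and --replace rewrites the commands extracted from docs/distributed.md. Below is a minimal bash sketch of how a --replace value of this shape could be expanded; the comma-separated FROM:TO pair format is an assumption read off the flag value itself, not taken from updown.py's implementation, and the sample command line is hypothetical.

#!/usr/bin/env bash
# Expand comma-separated FROM:TO substitution pairs over one command line
# (assumed semantics of --replace; the sample line below is hypothetical).
replace='llama3.1:stories110M,-l 3:-l 2'
line='python3 torchchat.py generate llama3.1 -l 3'

IFS=',' read -ra pairs <<< "$replace"
for pair in "${pairs[@]}"; do
  from=${pair%%:*}            # text before the first ':'
  to=${pair#*:}               # text after the first ':'
  line=${line//"$from"/"$to"} # replace every occurrence
done
echo "$line"  # -> python3 torchchat.py generate stories110M -l 2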
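
Note on the appended "exit 1": it acts as a tripwire for truncated generator output. Assuming updown.py closes a complete script with its own exit statement (which the "for good measure" comment implies), the guard line is unreachable on a good run and only becomes the final command when generation was cut short. A hypothetical generated ./run-distributed.sh illustrating both cases:

# Complete run: updown.py emitted everything, including its own exit.
echo "docs step 1"
echo "docs step 2"
exit 0   # assumed closing statement emitted by updown.py
exit 1   # guard appended by run-docs; never reached on this path

# Truncated run: updown.py stopped mid-stream without erroring out.
echo "docs step 1"
exit 1   # the appended guard is now the last line, so the job fails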