From 618e72ec7cbde4360591e1f8a5412736f92fe8da Mon Sep 17 00:00:00 2001
From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com>
Date: Mon, 20 Jan 2025 00:30:36 -0800
Subject: [PATCH 1/4] Add distributed tests to run-readme-pr.yml

Need to ensure this is the right runner, @lessw2020 can you please have a
look -- torchchat uses the same runners as pytorch.
---
 .github/workflows/run-readme-pr.yml | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/.github/workflows/run-readme-pr.yml b/.github/workflows/run-readme-pr.yml
index f32473435..37c27822b 100644
--- a/.github/workflows/run-readme-pr.yml
+++ b/.github/workflows/run-readme-pr.yml
@@ -306,3 +306,25 @@ jobs:
         echo "::endgroup::"
 
         TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs native
+
+  test-distributed-cuda:
+    permissions:
+      id-token: write
+      contents: read
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.4"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        .ci/scripts/run-docs distributed
+
+        echo "::group::Completion"
+        echo "tests complete"
+        echo "*******************************************"
+        echo "::endgroup::"

From 3d6b563e777574e76a52ca570e62902e550feac7 Mon Sep 17 00:00:00 2001
From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com>
Date: Thu, 23 Jan 2025 15:43:12 -0800
Subject: [PATCH 2/4] Update run-docs

Remove HF login because tokens not available as git secret
---
 .ci/scripts/run-docs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs
index 521cfa811..017ce362f 100755
--- a/.ci/scripts/run-docs
+++ b/.ci/scripts/run-docs
@@ -129,7 +129,7 @@ fi
 
 if [ "$1" == "distributed" ]; then
   echo "::group::Create script to run distributed"
-  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md > ./run-distributed.sh
+  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
   # for good measure, if something happened to updown processor,
   # and it did not error out, fail with an exit 1
   echo "exit 1" >> ./run-distributed.sh

From f6e92d21c7fde10a87534fc2ebb2fcab27dd556c Mon Sep 17 00:00:00 2001
From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com>
Date: Thu, 23 Jan 2025 15:48:07 -0800
Subject: [PATCH 3/4] Update run-docs

Replace llama3.1 with open-llama to avoid need for token. If this turns
out running too long, then we can switch to stories110M
---
 .ci/scripts/run-docs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs
index 017ce362f..b2d6c1abb 100755
--- a/.ci/scripts/run-docs
+++ b/.ci/scripts/run-docs
@@ -129,6 +129,7 @@ fi
 
 if [ "$1" == "distributed" ]; then
   echo "::group::Create script to run distributed"
+  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --replace 'llama3.1:open-llama,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
   python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
   # for good measure, if something happened to updown processor,
   # and it did not error out, fail with an exit 1

From 64e5b591665bf7e8abf16ed45d2bf7221795964c Mon Sep 17 00:00:00 2001
From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com>
Date: Thu, 23 Jan 2025 18:12:01 -0800
Subject: [PATCH 4/4] Update run-docs

open-llama -> stories.
---
 .ci/scripts/run-docs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

NOTE(review): the original revision of this patch only rewrote the --replace
line, leaving the no-replace invocation added in PATCH 2 in place; that second
command re-generates ./run-distributed.sh with a plain ">" redirect and
silently discards the model substitution. This revision also deletes the stale
duplicate line (hunk header and diffstat updated to match; the post-image blob
hash in the index line is now advisory only). Text here, between the "---"
separator and the diff, is ignored by "git am".

diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs
index b2d6c1abb..3ca460cd2 100755
--- a/.ci/scripts/run-docs
+++ b/.ci/scripts/run-docs
@@ -129,7 +129,6 @@ fi
 
 if [ "$1" == "distributed" ]; then
   echo "::group::Create script to run distributed"
-  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --replace 'llama3.1:open-llama,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
-  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
+  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --replace 'llama3.1:stories110M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
   # for good measure, if something happened to updown processor,
   # and it did not error out, fail with an exit 1