From 618e72ec7cbde4360591e1f8a5412736f92fe8da Mon Sep 17 00:00:00 2001
From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com>
Date: Mon, 20 Jan 2025 00:30:36 -0800
Subject: [PATCH 1/4] Add distributed tests to run-readme-pr.yml

Need to ensure this is the right runner, @lessw2020 can you please have a
look -- torchchat uses the same runners as pytorch.
---
 .github/workflows/run-readme-pr.yml | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/.github/workflows/run-readme-pr.yml b/.github/workflows/run-readme-pr.yml
index f32473435..37c27822b 100644
--- a/.github/workflows/run-readme-pr.yml
+++ b/.github/workflows/run-readme-pr.yml
@@ -306,3 +306,25 @@ jobs:
         echo "::endgroup::"
 
         TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs native
+
+  test-distributed-cuda:
+    permissions:
+      id-token: write
+      contents: read
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.4"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        .ci/scripts/run-docs distributed
+
+        echo "::group::Completion"
+        echo "tests complete"
+        echo "*******************************************"
+        echo "::endgroup::"

From 3d6b563e777574e76a52ca570e62902e550feac7 Mon Sep 17 00:00:00 2001
From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com>
Date: Thu, 23 Jan 2025 15:43:12 -0800
Subject: [PATCH 2/4] Update run-docs

Remove HF login because tokens not available as git secret
---
 .ci/scripts/run-docs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs
index 521cfa811..017ce362f 100755
--- a/.ci/scripts/run-docs
+++ b/.ci/scripts/run-docs
@@ -129,7 +129,7 @@ fi
 
 if [ "$1" == "distributed" ]; then
   echo "::group::Create script to run distributed"
-  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md > ./run-distributed.sh
+  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
   # for good measure, if something happened to updown processor,
   # and it did not error out, fail with an exit 1
   echo "exit 1" >> ./run-distributed.sh

From f6e92d21c7fde10a87534fc2ebb2fcab27dd556c Mon Sep 17 00:00:00 2001
From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com>
Date: Thu, 23 Jan 2025 15:48:07 -0800
Subject: [PATCH 3/4] Update run-docs

Replace llama3.1 with open-llama to avoid need for token. If this turns
out running too long, then we can switch to stories110M
---
 .ci/scripts/run-docs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs
index 017ce362f..b2d6c1abb 100755
--- a/.ci/scripts/run-docs
+++ b/.ci/scripts/run-docs
@@ -129,6 +129,7 @@ fi
 
 if [ "$1" == "distributed" ]; then
   echo "::group::Create script to run distributed"
+  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --replace 'llama3.1:open-llama,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
   python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
   # for good measure, if something happened to updown processor,
   # and it did not error out, fail with an exit 1

From 64e5b591665bf7e8abf16ed45d2bf7221795964c Mon Sep 17 00:00:00 2001
From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com>
Date: Thu, 23 Jan 2025 18:12:01 -0800
Subject: [PATCH 4/4] Update run-docs

open-llama -> stories.
---
 .ci/scripts/run-docs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

NOTE(review): the original revision of this patch only rewrote the --replace
line, leaving the no-replace invocation added in PATCH 2 in place; that second
command re-generates ./run-distributed.sh with a plain ">" redirect and
silently discards the model substitution. This revision also deletes the stale
duplicate line (hunk header and diffstat updated to match; the post-image blob
hash in the index line is now advisory only). Text here, between the "---"
separator and the diff, is ignored by "git am".

diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs
index b2d6c1abb..3ca460cd2 100755
--- a/.ci/scripts/run-docs
+++ b/.ci/scripts/run-docs
@@ -129,7 +129,6 @@ fi
 
 if [ "$1" == "distributed" ]; then
   echo "::group::Create script to run distributed"
-  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --replace 'llama3.1:open-llama,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
-  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
+  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --replace 'llama3.1:stories110M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
   # for good measure, if something happened to updown processor,
   # and it did not error out, fail with an exit 1