diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs
index 521cfa811..3ca460cd2 100755
--- a/.ci/scripts/run-docs
+++ b/.ci/scripts/run-docs
@@ -129,7 +129,8 @@ fi
 
 if [ "$1" == "distributed" ]; then
   echo "::group::Create script to run distributed"
-  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md > ./run-distributed.sh
+  # Swap llama3.1 for the small stories110M model (and -l 3 for -l 2) so the CI run stays lightweight
+  python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --replace 'llama3.1:stories110M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
   # for good measure, if something happened to updown processor,
   # and it did not error out, fail with an exit 1
   echo "exit 1" >> ./run-distributed.sh
diff --git a/.github/workflows/run-readme-pr.yml b/.github/workflows/run-readme-pr.yml
index f32473435..37c27822b 100644
--- a/.github/workflows/run-readme-pr.yml
+++ b/.github/workflows/run-readme-pr.yml
@@ -306,3 +306,25 @@ jobs:
         echo "::endgroup::"
 
         TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs native
+
+  test-distributed-cuda:
+    permissions:
+      id-token: write
+      contents: read
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.4"
+      timeout: 60
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        .ci/scripts/run-docs distributed
+
+        echo "::group::Completion"
+        echo "tests complete"
+        echo "*******************************************"
+        echo "::endgroup::"
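
Note on the updown.py flags used above: --suppress drops the huggingface-cli and HF_TOKEN steps from the generated script, and --replace rewrites the commands extracted from docs/distributed.md. Below is a minimal bash sketch of how a --replace value of this shape could be expanded; the comma-separated FROM:TO pair format is an assumption read off the flag value itself, not taken from updown.py's implementation, and the sample command line is hypothetical.

#!/usr/bin/env bash
# Expand comma-separated FROM:TO substitution pairs over one command line
# (assumed semantics of --replace; the sample line below is hypothetical).
replace='llama3.1:stories110M,-l 3:-l 2'
line='python3 torchchat.py generate llama3.1 -l 3'

IFS=',' read -ra pairs <<< "$replace"
for pair in "${pairs[@]}"; do
  from=${pair%%:*}            # text before the first ':'
  to=${pair#*:}               # text after the first ':'
  line=${line//"$from"/"$to"} # replace every occurrence
done
echo "$line"  # -> python3 torchchat.py generate stories110M -l 2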
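
Note on the appended "exit 1": it acts as a tripwire for truncated generator output. Assuming updown.py closes a complete script with its own exit statement (which the "for good measure" comment implies), the guard line is unreachable on a good run and only becomes the final command when generation was cut short. A hypothetical generated ./run-distributed.sh illustrating both cases:

# Complete run: updown.py emitted everything, including its own exit.
echo "docs step 1"
echo "docs step 2"
exit 0   # assumed closing statement emitted by updown.py
exit 1   # guard appended by run-docs; never reached on this path

# Truncated run: updown.py stopped mid-stream without erroring out.
echo "docs step 1"
exit 1   # the appended guard is now the last line, so the job fails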