ci: separate Jetstream Pytorch test to its own workflow
The main workflow was failing due to an OS error, which I suspect is
related to a lack of disk space. Separating the Jetstream Pytorch test
into its own workflow will make the issue easier to analyse.
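One way to test the disk-space theory (a hedged sketch, not part of this commit) is to log free space at the start of the failing job with an extra `run:` step:

```sh
# Hypothetical diagnostic, not in this commit: report free disk space
# inside the CI container so an out-of-space failure is visible in the log.
df -h
df -h /tmp   # pip builds and model downloads commonly land under /tmp or a cache dir
```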
tengomucho committed Sep 18, 2024
1 parent d7b8978 commit 4b48358
Showing 3 changed files with 43 additions and 11 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/test-pytorch-xla-tpu-tgi-jetstream.yml
@@ -0,0 +1,32 @@
+name: Optimum TPU / Test TGI on TPU / Jetstream Pytorch
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - "text-generation-inference/**"
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "text-generation-inference/**"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  do-the-job:
+    name: Run TGI tests - Jetstream Pytorch
+    runs-on: optimum-tpu
+    container:
+      image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.4.0_3.10_tpuvm
+      options: --shm-size "16gb" --ipc host --privileged
+    env:
+      PJRT_DEVICE: TPU
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Build and test TGI server
+        run: |
+          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_TPU_CI }} make tgi_test_jetstream
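For local debugging, the job's test step can be reproduced on a TPU VM with the repository checked out (a sketch; `<your-hf-token>` is a placeholder, whereas CI injects the `HF_TOKEN_OPTIMUM_TPU_CI` secret):

```sh
# Run the Jetstream Pytorch TGI tests the same way the workflow does.
HF_TOKEN=<your-hf-token> make tgi_test_jetstream
```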
11 changes: 0 additions & 11 deletions .github/workflows/test-pytorch-xla-tpu-tgi.yml
@@ -19,7 +19,6 @@ jobs:
     name: Run TGI tests
     runs-on: optimum-tpu
     container:
-      # Use a nightly image that works with TPU (release was not working)
       image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.4.0_3.10_tpuvm
       options: --shm-size "16gb" --ipc host --privileged
     env:
@@ -31,13 +30,3 @@ jobs:
       - name: Build and test TGI server
         run: |
           HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_TPU_CI }} make tgi_test
-      # Use a different step to test the Jetstream Pytorch version, to avoid conflicts with torch-xla[tpu]
-      - name: Install and test TGI server (Jetstream Pytorch)
-        run: |
-          pip install -U .[jetstream-pt] \
-            -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html \
-            -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html \
-            -f https://storage.googleapis.com/libtpu-releases/index.html
-          JETSTREAM_PT=1 HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_TPU_CI }} python -m \
-            pytest -sv text-generation-inference/tests -k jetstream
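The removed step installed the `jetstream-pt` extras into the same environment as `torch-xla[tpu]`, which the deleted comment flags as conflict-prone. When debugging such clashes, a quick environment inspection can help (a sketch; the package names are the usual suspects, not a list confirmed by this commit):

```sh
# List the packages the two backends are most likely to disagree about.
python -m pip list | grep -Ei 'torch|jax|libtpu'
```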
11 changes: 11 additions & 0 deletions Makefile
@@ -87,6 +87,17 @@ tgi_server:
 	make -C text-generation-inference/server clean
 	VERSION=${VERSION} TGI_VERSION=${TGI_VERSION} make -C text-generation-inference/server gen-server
 
+jetstream_requirements:
+	python -m pip install .[jetstream-pt] \
+		-f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html \
+		-f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html \
+		-f https://storage.googleapis.com/libtpu-releases/index.html
+
+tgi_test_jetstream: test_installs jetstream_requirements tgi_server
+	find text-generation-inference -name "text_generation_server-$(VERSION)-py3-none-any.whl" \
+		-exec python -m pip install --force-reinstall {} \;
+	JETSTREAM_PT=1 python -m pytest -sv text-generation-inference/tests
+
 tgi_test: test_installs tgi_server
 	find text-generation-inference -name "text_generation_server-$(VERSION)-py3-none-any.whl" \
 		-exec python -m pip install --force-reinstall {} \;
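Both test targets locate the wheel produced by `tgi_server` and force-reinstall it before running pytest. For illustration only, assuming `VERSION=0.1.0` and a wheel under `text-generation-inference/server/dist/` (both hypothetical; the real values come from the build), the `find`/`-exec` line expands to roughly:

```sh
# Illustrative expansion of the Makefile's find/-exec line; the version
# number and the dist/ path are assumptions for this example.
python -m pip install --force-reinstall \
  text-generation-inference/server/dist/text_generation_server-0.1.0-py3-none-any.whl
```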
