Distributed trt-llm #275

Merged: 6 commits, Sep 23, 2024
Changes from all commits
8 changes: 6 additions & 2 deletions .github/workflows/test_cli_cuda_pytorch.yaml
@@ -74,6 +74,10 @@ jobs:
         run: |
           pip install -e .[testing,diffusers,timm,peft,deepspeed]

-      - name: Run tests
+      - name: Run tests (parallel)
         run: |
+          pytest tests/test_cli.py -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map)"
+
+      - name: Run tests (sequential)
+        run: |
-          FORCE_SERIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map or deepspeed)"
+          FORCE_SEQUENTIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and pytorch and (deepspeed_inference)"
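Not part of the diff, but useful context for the -k expressions used throughout: pytest evaluates them against each test's ID, and the parametrized config name becomes part of that ID. A minimal, hypothetical miniature of tests/test_cli.py:

import pytest

# Each config name below yields a test ID such as
# "test_cli_configs[cuda_inference_pytorch_ddp]", which is the string that
# -k 'cli and cuda and pytorch and (dp or ddp or device_map)' matches
# keywords against.
@pytest.mark.parametrize("config_name", ["cuda_inference_pytorch_ddp", "cpu_inference_pytorch"])
def test_cli_configs(config_name):
    assert config_name  # the real test shells out to optimum-benchmark
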
34 changes: 32 additions & 2 deletions .github/workflows/test_cli_cuda_tensorrt_llm.yaml
@@ -20,13 +20,14 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}

 jobs:
-  cli_cuda_tensorrt_llm_tests:
+  cli_cuda_tensorrt_llm_single_gpu_tests:
     if: ${{
       (github.event_name == 'push') ||
       (github.event_name == 'workflow_dispatch') ||
       contains( github.event.pull_request.labels.*.name, 'cli') ||
       contains( github.event.pull_request.labels.*.name, 'cuda') ||
       contains( github.event.pull_request.labels.*.name, 'tensorrt_llm') ||
+      contains( github.event.pull_request.labels.*.name, 'single_gpu') ||
       contains( github.event.pull_request.labels.*.name, 'cli_cuda_tensorrt_llm')
     }}

@@ -46,4 +47,33 @@ jobs:

       - name: Run tests
         run: |
-          pytest tests/test_cli.py -x -s -k "cli and cuda and tensorrt_llm"
+          pytest tests/test_cli.py -x -s -k "cli and cuda and tensorrt_llm and not (tp or pp)"
+
+  cli_cuda_tensorrt_llm_multi_gpu_tests:
+    if: ${{
+      (github.event_name == 'push') ||
+      (github.event_name == 'workflow_dispatch') ||
+      contains( github.event.pull_request.labels.*.name, 'cli') ||
+      contains( github.event.pull_request.labels.*.name, 'cuda') ||
+      contains( github.event.pull_request.labels.*.name, 'tensorrt_llm') ||
+      contains( github.event.pull_request.labels.*.name, 'multi_gpu') ||
+      contains( github.event.pull_request.labels.*.name, 'cli_cuda_tensorrt_llm_multi_gpu')
+    }}
+
+    runs-on: [multi-gpu, nvidia-gpu, 4-a10, ci]
+
+    container:
+      image: huggingface/optimum-nvidia:latest
+      options: --ipc host --gpus all
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          pip install -e .[testing]
+
+      - name: Run tests (sequential)
+        run: |
+          FORCE_SEQUENTIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and tensorrt_llm and (tp or pp)"
2 changes: 1 addition & 1 deletion .github/workflows/test_cli_cuda_torch_ort.yaml
@@ -78,4 +78,4 @@ jobs:

       - name: Run tests
         run: |
-          FORCE_SERIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and torch_ort and (dp or ddp or device_map) and not (peft)"
+          pytest tests/test_cli.py -x -s -k "cli and cuda and torch_ort and (dp or ddp or device_map)"
8 changes: 4 additions & 4 deletions .github/workflows/test_cli_cuda_vllm.yaml
@@ -45,9 +45,9 @@ jobs:
         run: |
           pip install -e .[testing]

-      - name: Run tests
+      - name: Run tests (sequential)
         run: |
-          FORCE_SERIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and vllm and not (tp or pp)"
+          FORCE_SEQUENTIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and vllm and not (tp or pp)"

   run_cli_cuda_vllm_multi_gpu_tests:
     if: ${{
@@ -74,6 +74,6 @@ jobs:
         run: |
           pip install -e .[testing]

-      - name: Run tests
+      - name: Run tests (sequential)
         run: |
-          FORCE_SERIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and vllm and (tp or pp)"
+          FORCE_SEQUENTIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and vllm and (tp or pp)"
3 changes: 2 additions & 1 deletion .github/workflows/test_cli_misc.yaml
@@ -61,4 +61,5 @@ jobs:
           pip install -e .[testing]

       - name: Run tests
-        run: pytest tests/test_cli.py -s -k "cli and not (cpu or cuda or rocm or mps)"
+        run: |
+          pytest tests/test_cli.py -s -k "cli and not (cpu or cuda or rocm or mps)"
8 changes: 6 additions & 2 deletions .github/workflows/test_cli_rocm_pytorch.yaml
@@ -82,6 +82,10 @@ jobs:
         run: |
           pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq,auto-gptq] "deepspeed<0.15"

-      - name: Run tests
+      - name: Run tests (parallel)
         run: |
+          pytest tests/test_cli.py -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map)"
+
+      - name: Run tests (sequential)
+        run: |
-          FORCE_SERIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map or deepspeed) and not bnb"
+          FORCE_SEQUENTIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and pytorch and (deepspeed_inference)"
5 changes: 5 additions & 0 deletions optimum_benchmark/launchers/torchrun/launcher.py
@@ -1,4 +1,5 @@
 import os
+import sys
 import traceback
 from contextlib import ExitStack
 from logging import Logger
@@ -156,6 +157,10 @@ def entrypoint(worker: Callable[..., BenchmarkReport], worker_args: List[Any], l
     else:
         setup_logging(level="ERROR", to_file=log_to_file, prefix=f"RANK-PROCESS-{rank}")

+    if sys.platform == "win32":
+        logger.info("\t+ Disabling libuv on Windows")
+        os.environ["USE_LIBUV"] = "0"
+
     if torch.cuda.is_available():
         logger.info(f"\t+ Setting torch.distributed cuda device to {rank}")
         device = torch.device("cuda", rank)
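For context: USE_LIBUV selects the TCPStore implementation torch.distributed uses during rendezvous, and the libuv backend is problematic on Windows. A minimal sketch of the ordering constraint (the helper and backend choice are assumptions, not part of this diff):

import os
import sys

import torch
import torch.distributed as dist


def init_worker(rank: int, world_size: int) -> None:
    # The env var is read when the rendezvous store is created, so it must
    # be set before init_process_group, not after.
    if sys.platform == "win32":
        os.environ["USE_LIBUV"] = "0"
    # Assumes torchrun-style MASTER_ADDR/MASTER_PORT in the environment.
    backend = "nccl" if torch.cuda.is_available() else "gloo"
    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
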
2 changes: 0 additions & 2 deletions tests/configs/_deepspeed_inference_.yaml
@@ -17,8 +17,6 @@ scenario:
   batch_size: 2

 hydra:
-  launcher:
-    n_jobs: 1
   job:
     env_set:
       LOG_ALL_RANKS: 1
6 changes: 6 additions & 0 deletions tests/configs/_tensorrt_llm_pp_.yaml
@@ -0,0 +1,6 @@
+backend:
+  model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+  gpus_per_node: 2
+  device_ids: 0,1
+  world_size: 2
+  pp: 2
6 changes: 6 additions & 0 deletions tests/configs/_tensorrt_llm_tp_.yaml
@@ -0,0 +1,6 @@
+backend:
+  model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+  gpus_per_node: 2
+  device_ids: 0,1
+  world_size: 2
+  tp: 2
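The two fragments differ only in how the two GPUs are split: tp: 2 shards each layer across both GPUs (tensor parallelism), while pp: 2 places different layers on each GPU (pipeline parallelism). Either way the degrees must multiply to the world size, which a small sanity check makes explicit (hypothetical helper, not part of the repo):

def check_parallelism(world_size: int, tp: int = 1, pp: int = 1) -> None:
    # One rank per GPU; tensor and pipeline parallelism compose
    # multiplicatively, so world_size must equal tp * pp.
    if world_size != tp * pp:
        raise ValueError(f"world_size={world_size} != tp*pp={tp * pp}")


check_parallelism(world_size=2, tp=2)  # _tensorrt_llm_tp_.yaml
check_parallelism(world_size=2, pp=2)  # _tensorrt_llm_pp_.yaml
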
10 changes: 10 additions & 0 deletions tests/configs/cuda_inference_tensorrt_llm_pp.yaml
@@ -0,0 +1,10 @@
+defaults:
+  # order of inheritance, last one overrides previous ones
+  - _base_ # inherits from base config
+  - _cuda_ # inherits from cuda config
+  - _inference_ # inherits from inference config
+  - _tensorrt_llm_pp_ # inherits from tensorrt_llm_pp config
+  - _self_ # hydra 1.1 compatibility
+  - override backend: tensorrt-llm
+
+name: cuda_inference_tensorrt_llm_pp
10 changes: 10 additions & 0 deletions tests/configs/cuda_inference_tensorrt_llm_tp.yaml
@@ -0,0 +1,10 @@
+defaults:
+  # order of inheritance, last one overrides previous ones
+  - _base_ # inherits from base config
+  - _cuda_ # inherits from cuda config
+  - _inference_ # inherits from inference config
+  - _tensorrt_llm_tp_ # inherits from tensorrt_llm_tp config
+  - _self_ # hydra 1.1 compatibility
+  - override backend: tensorrt-llm
+
+name: cuda_inference_tensorrt_llm_tp
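Both test configs lean on Hydra's defaults-list composition: entries later in the list override earlier ones, _self_ controls where the file's own keys are merged, and override backend: tensorrt-llm swaps in the backend group after the shared fragments are applied. A minimal sketch of composing one of these configs programmatically (the config_path and version_base values are assumptions):

from hydra import compose, initialize

# Compose cuda_inference_tensorrt_llm_tp the same way the CLI would,
# then inspect what the _tensorrt_llm_tp_ fragment contributed.
with initialize(version_base=None, config_path="tests/configs"):
    cfg = compose(config_name="cuda_inference_tensorrt_llm_tp")
    print(cfg.backend.tp, cfg.backend.world_size)  # expected: 2 2
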
27 changes: 14 additions & 13 deletions tests/test_cli.py
@@ -10,7 +10,8 @@
 LOGGER = getLogger("test-cli")


-FORCE_SERIAL = os.environ.get("FORCE_SERIAL", "0") == "1"
+FORCE_SEQUENTIAL = os.environ.get("FORCE_SEQUENTIAL", "0") == "1"
+
 TEST_CONFIG_DIR = Path(__file__).parent / "configs"
 TEST_CONFIG_NAMES = [
     config.split(".")[0]
@@ -30,16 +31,16 @@ def test_cli_configs(config_name):
         TEST_CONFIG_DIR,
         "--config-name",
         config_name,
-        # to run the tests faster
-        "hydra/launcher=joblib",
-        "hydra.launcher.batch_size=1",
-        "hydra.launcher.prefer=threads",
     ]

-    if FORCE_SERIAL:
-        args += ["hydra.launcher.n_jobs=1"]
-    else:
-        args += ["hydra.launcher.n_jobs=-1"]
+    if not FORCE_SEQUENTIAL:
+        args += [
+            # to run the tests faster
+            "hydra/launcher=joblib",
+            "hydra.launcher.n_jobs=-1",
+            "hydra.launcher.batch_size=1",
+            "hydra.launcher.prefer=threads",
+        ]

     if ROCR_VISIBLE_DEVICES is not None:
         args += [f'backend.device_ids="{ROCR_VISIBLE_DEVICES}"']
@@ -50,7 +51,7 @@ def test_cli_configs(config_name):
     assert popen.returncode == 0, f"Failed to run {config_name}"


-@pytest.mark.parametrize("launcher", ["inline", "process"])
+@pytest.mark.parametrize("launcher", ["inline", "process", "torchrun"])
 def test_cli_exit_code_0(launcher):
     args_0 = [
         "optimum-benchmark",
@@ -59,7 +60,7 @@ def test_cli_exit_code_0(launcher):
         "--config-name",
         "_base_",
         "name=test",
-        f"launcher={launcher}",
+        "launcher=" + launcher,
         # compatible task and model
         "backend.task=text-classification",
         "backend.model=bert-base-uncased",
@@ -79,7 +80,7 @@ def test_cli_exit_code_1(launcher):
         "--config-name",
         "_base_",
         "name=test",
-        f"launcher={launcher}",
+        "launcher=" + launcher,
         # incompatible task and model to trigger an error
         "backend.task=image-classification",
         "backend.model=bert-base-uncased",
@@ -102,7 +103,7 @@ def test_cli_numactl(launcher):
         "--config-name",
         "_base_",
         "name=test",
-        f"launcher={launcher}",
+        "launcher=" + launcher,
         "launcher.numactl=True",
         "backend.task=text-classification",
         "backend.model=bert-base-uncased",
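Taken together with the workflow changes, the CI pattern is: run the parallel-safe keyword subset normally (Hydra's joblib launcher fans the configs out across threads), then rerun the launcher-sensitive subset with FORCE_SEQUENTIAL=1 so each benchmark owns the GPUs alone. A hypothetical reproduction of that split outside CI:

import os
import subprocess

# Mirror the CI steps above: the same pytest command, with FORCE_SEQUENTIAL=1
# disabling the joblib launcher overrides so configs run one at a time.
def run_suite(keyword: str, sequential: bool = False) -> int:
    env = dict(os.environ, FORCE_SEQUENTIAL="1" if sequential else "0")
    cmd = ["pytest", "tests/test_cli.py", "-x", "-s", "-k", keyword]
    return subprocess.run(cmd, env=env).returncode


run_suite("cli and cuda and vllm and not (tp or pp)", sequential=True)
run_suite("cli and cuda and pytorch and (dp or ddp or device_map)")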