Removing barriers (#273)

huggingface · Sep 23, 2024 · 1c43fa2
1 parent 01e4e59
commit 1c43fa2
Show file tree

Hide file tree

Showing 23 changed files with 29 additions and 75 deletions.
diff --git a/.github/workflows/test_api_cpu.yaml b/.github/workflows/test_api_cpu.yaml
@@ -51,4 +51,4 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           PUSH_REPO_ID: optimum-benchmark/cpu
         run: |
-          pytest -s -k "api and cpu"
+          pytest tests/test_api.py -s -k "api and cpu"
diff --git a/.github/workflows/test_api_cuda.yaml b/.github/workflows/test_api_cuda.yaml
@@ -48,4 +48,4 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           PUSH_REPO_ID: optimum-benchmark/cuda
         run: |
-          pytest -s -x -k "api and cuda"
+          pytest tests/test_api.py -x -s -k "api and cuda"
diff --git a/.github/workflows/test_api_misc.yaml b/.github/workflows/test_api_misc.yaml
@@ -59,4 +59,4 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           PUSH_REPO_ID: optimum-benchmark/misc-${{ matrix.os }}-${{ matrix.python }}
         run: |
-          pytest -s -k "api and not (cpu or cuda or rocm or mps)"
+          pytest tests/test_api.py -s -k "api and not (cpu or cuda or rocm or mps)"
diff --git a/.github/workflows/test_api_rocm.yaml b/.github/workflows/test_api_rocm.yaml
@@ -54,4 +54,4 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           PUSH_REPO_ID: optimum-benchmark/rocm
         run: |
-          pytest -s -x -k "api and cuda"
+          pytest tests/test_api.py -x -s -k "api and cuda"
diff --git a/.github/workflows/test_cli_cpu_ipex.yaml b/.github/workflows/test_cli_cpu_ipex.yaml
@@ -48,4 +48,4 @@ jobs:
           pip install -e .[testing,ipex,diffusers,timm]
 
       - name: Run tests
-        run: pytest -s -k "cli and cpu and ipex"
+        run: pytest tests/test_cli.py -s -k "cli and cpu and ipex"
diff --git a/.github/workflows/test_cli_cpu_llama_cpp.yaml b/.github/workflows/test_cli_cpu_llama_cpp.yaml
@@ -48,4 +48,4 @@ jobs:
           pip install -e .[testing,llama-cpp]
 
       - name: Run tests
-        run: pytest -s -k "llama_cpp"
+        run: pytest tests/test_cli.py -s -k "llama_cpp"
diff --git a/.github/workflows/test_cli_cpu_neural_compressor.yaml b/.github/workflows/test_cli_cpu_neural_compressor.yaml
@@ -48,4 +48,4 @@ jobs:
           pip install -e .[testing,neural-compressor,diffusers,timm]
 
       - name: Run tests
-        run: pytest -s -k "cli and cpu and neural_compressor"
+        run: pytest tests/test_cli.py -s -k "cli and cpu and neural_compressor"
diff --git a/.github/workflows/test_cli_cpu_onnxruntime.yaml b/.github/workflows/test_cli_cpu_onnxruntime.yaml
@@ -48,4 +48,4 @@ jobs:
           pip install -e .[testing,onnxruntime,diffusers,timm]
 
       - name: Run tests
-        run: pytest -s -k "cli and cpu and onnxruntime"
+        run: pytest tests/test_cli.py -s -k "cli and cpu and onnxruntime"
diff --git a/.github/workflows/test_cli_cpu_openvino.yaml b/.github/workflows/test_cli_cpu_openvino.yaml
@@ -48,4 +48,4 @@ jobs:
           pip install -e .[testing,openvino,diffusers,timm]
 
       - name: Run tests
-        run: pytest -s -k "cli and cpu and openvino"
+        run: pytest tests/test_cli.py -s -k "cli and cpu and openvino"
diff --git a/.github/workflows/test_cli_cpu_py_txi.yaml b/.github/workflows/test_cli_cpu_py_txi.yaml
@@ -48,4 +48,4 @@ jobs:
           pip install -e .[testing,py-txi]
 
       - name: Run tests
-        run: pytest -s -k "cli and cpu and py_txi"
+        run: pytest tests/test_cli.py -s -k "cli and cpu and py_txi"
diff --git a/.github/workflows/test_cli_cpu_pytorch.yaml b/.github/workflows/test_cli_cpu_pytorch.yaml
@@ -48,4 +48,4 @@ jobs:
           pip install -e .[testing,diffusers,timm,peft]
 
       - name: Run tests
-        run: pytest -s -k "cli and cpu and pytorch"
+        run: pytest tests/test_cli.py -s -k "cli and cpu and pytorch"
diff --git a/.github/workflows/test_cli_cuda_onnxruntime.yaml b/.github/workflows/test_cli_cuda_onnxruntime.yaml
@@ -46,4 +46,4 @@ jobs:
 
       - name: Run tests
         run: |
-          pytest -x -s -k "cli and cuda and onnxruntime"
+          pytest tests/test_cli.py -x -s -k "cli and cuda and onnxruntime"
diff --git a/.github/workflows/test_cli_cuda_py_txi.yaml b/.github/workflows/test_cli_cuda_py_txi.yaml
@@ -47,4 +47,4 @@ jobs:
           pip install -e .[testing,py-txi]
 
       - name: Run tests
-        run: pytest -s -k "cli and cuda and py_txi"
+        run: pytest tests/test_cli.py -x -s -k "cli and cuda and py_txi"
diff --git a/.github/workflows/test_cli_cuda_pytorch.yaml b/.github/workflows/test_cli_cuda_pytorch.yaml
@@ -47,7 +47,7 @@ jobs:
 
       - name: Run tests
         run: |
-          pytest -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed)"
+          pytest tests/test_cli.py -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed)"
 
   run_cli_cuda_pytorch_multi_gpu_tests:
     if: ${{
@@ -76,4 +76,4 @@ jobs:
 
       - name: Run tests
         run: |
-          pytest -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map or deepspeed)"
+          FORCE_SERIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map or deepspeed)"
diff --git a/.github/workflows/test_cli_cuda_tensorrt_llm.yaml b/.github/workflows/test_cli_cuda_tensorrt_llm.yaml
@@ -46,4 +46,4 @@ jobs:
 
       - name: Run tests
         run: |
-          pytest -x -s -k "cli and cuda and tensorrt_llm"
+          pytest tests/test_cli.py -x -s -k "cli and cuda and tensorrt_llm"
diff --git a/.github/workflows/test_cli_cuda_torch_ort.yaml b/.github/workflows/test_cli_cuda_torch_ort.yaml
@@ -48,7 +48,7 @@ jobs:
 
       - name: Run tests
         run: |
-          pytest -x -s -k "cli and cuda and torch_ort and not (dp or ddp or device_map) and not (peft)"
+          pytest tests/test_cli.py -x -s -k "cli and cuda and torch_ort and not (dp or ddp or device_map) and not (peft)"
 
   run_cli_cuda_torch_ort_multi_gpu_tests:
     if: ${{
@@ -78,4 +78,4 @@ jobs:
 
       - name: Run tests
         run: |
-          pytest -x -s -k "cli and cuda and torch_ort and (dp or ddp or device_map) and not (peft)"
+          FORCE_SERIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and torch_ort and (dp or ddp or device_map) and not (peft)"
diff --git a/.github/workflows/test_cli_cuda_vllm.yaml b/.github/workflows/test_cli_cuda_vllm.yaml
@@ -47,7 +47,7 @@ jobs:
 
       - name: Run tests
         run: |
-          FORCE_SERIAL=1 pytest -x -s -k "cli and cuda and vllm and not (tp or pp)"
+          FORCE_SERIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and vllm and not (tp or pp)"
 
   run_cli_cuda_vllm_multi_gpu_tests:
     if: ${{
@@ -76,4 +76,4 @@ jobs:
 
       - name: Run tests
         run: |
-          FORCE_SERIAL=1 pytest -x -s -k "cli and cuda and vllm and (tp or pp)"
+          FORCE_SERIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and vllm and (tp or pp)"
diff --git a/.github/workflows/test_cli_misc.yaml b/.github/workflows/test_cli_misc.yaml
@@ -61,4 +61,4 @@ jobs:
           pip install -e .[testing]
 
       - name: Run tests
-        run: pytest -s -k "cli and not (cpu or cuda or rocm or mps)"
+        run: pytest tests/test_cli.py -s -k "cli and not (cpu or cuda or rocm or mps)"
diff --git a/.github/workflows/test_cli_rocm_pytorch.yaml b/.github/workflows/test_cli_rocm_pytorch.yaml
@@ -40,7 +40,6 @@ jobs:
         --device /dev/kfd
         --device /dev/dri
         --env ROCR_VISIBLE_DEVICES
-        --env HIP_VISIBLE_DEVICES=0
 
     steps:
       - name: Checkout code
@@ -52,7 +51,7 @@ jobs:
 
       - name: Run tests
         run: |
-          pytest -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not bnb"
+          pytest tests/test_cli.py -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not bnb"
 
   run_cli_rocm_pytorch_multi_gpu_tests:
     if: ${{
@@ -74,7 +73,6 @@ jobs:
         --device /dev/kfd
         --device /dev/dri
         --env ROCR_VISIBLE_DEVICES
-        --env HIP_VISIBLE_DEVICES=0,1
 
     steps:
       - name: Checkout code
@@ -86,4 +84,4 @@ jobs:
 
       - name: Run tests
         run: |
-          pytest -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map or deepspeed) and not bnb"
+          FORCE_SERIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map or deepspeed) and not bnb"
diff --git a/optimum_benchmark/trackers/energy.py b/optimum_benchmark/trackers/energy.py
@@ -5,14 +5,11 @@
 from logging import getLogger
 from typing import List, Literal, Optional, Union
 
-from ..import_utils import is_codecarbon_available, is_torch_available, is_torch_distributed_available
+from ..import_utils import is_codecarbon_available, is_torch_available
 
 if is_torch_available():
     import torch
 
-if is_torch_distributed_available():
-    import torch.distributed
-
 if is_codecarbon_available():
     from codecarbon import EmissionsTracker, OfflineEmissionsTracker
     from codecarbon.output import EmissionsData
@@ -115,9 +112,7 @@ def __init__(self, backend: str, device: str, device_ids: Optional[Union[str, in
         self.device_ids = device_ids
 
         self.is_gpu = self.device == "cuda"
-        self.is_engine = self.backend in ["vllm", "tensorrt-llm"]
         self.is_pytorch_cuda = (self.backend, self.device) == ("pytorch", "cuda")
-        self.is_distributed = is_torch_distributed_available() and torch.distributed.is_initialized()
 
         LOGGER.info("\t+ Tracking CPU and RAM energy")
 
@@ -188,19 +183,13 @@ def __init__(self, backend: str, device: str, device_ids: Optional[Union[str, in
 
     @contextmanager
     def track(self, file_prefix: str = "task"):
-        if not self.is_engine and self.is_distributed:
-            torch.distributed.barrier()
-
         if self.is_pytorch_cuda:
             torch.cuda.synchronize()
 
         self.emission_tracker.start_task()
 
         yield
 
-        if not self.is_engine and self.is_distributed:
-            torch.distributed.barrier()
-
         if self.is_pytorch_cuda:
             torch.cuda.synchronize()
 

diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py
@@ -4,11 +4,6 @@
 from logging import getLogger
 from typing import List, Literal, Optional, Union
 
-from ..import_utils import is_torch_distributed_available
-
-if is_torch_distributed_available():
-    import torch.distributed
-
 import numpy as np
 import torch
 from transformers import LogitsProcessor, TrainerCallback
@@ -123,9 +118,7 @@ def __init__(self, device: str, backend: str):
         self.device = device
         self.backend = backend
 
-        self.is_engine = self.backend in ["vllm", "tensorrt-llm"]
         self.is_pytorch_cuda = (self.backend, self.device) == ("pytorch", "cuda")
-        self.is_distributed = is_torch_distributed_available() and torch.distributed.is_initialized()
 
         if self.is_pytorch_cuda:
             LOGGER.info("\t+ Tracking latency using Pytorch CUDA events")
@@ -143,17 +136,11 @@ def reset(self):
 
     @contextmanager
     def track(self):
-        if not self.is_engine and self.is_distributed:
-            torch.distributed.barrier()
-
         if self.is_pytorch_cuda:
             yield from self._pytorch_cuda_latency()
         else:
             yield from self._cpu_latency()
 
-        if not self.is_engine and self.is_distributed:
-            torch.distributed.barrier()
-
     def _pytorch_cuda_latency(self):
         self.start_events.append(torch.cuda.Event(enable_timing=True))
         self.start_events[-1].record()
@@ -259,9 +246,7 @@ def __init__(self, device: str, backend: str):
         self.device = device
         self.backend = backend
 
-        self.is_engine = self.backend in ["vllm", "tensorrt-llm"]
         self.is_pytorch_cuda = (self.backend, self.device) == ("pytorch", "cuda")
-        self.is_distributed = is_torch_distributed_available() and torch.distributed.is_initialized()
 
         if self.is_pytorch_cuda:
             LOGGER.info("\t+ Tracking latency using Pytorch CUDA events")
@@ -292,17 +277,12 @@ def track(self):
         self.prefilled = False
         self.per_token_events.append([])
 
-        if not self.is_engine and self.is_distributed:
-            torch.distributed.barrier()
-
         if self.is_pytorch_cuda:
             self.prefill_start_events.append(torch.cuda.Event(enable_timing=True))
             self.prefill_start_events[-1].record()
         else:
             self.prefill_start_events.append(time.perf_counter())
 
-        # this is where generate is called,
-        # and for each decoded token, we record an event
         yield
 
         if self.is_pytorch_cuda:
@@ -311,9 +291,6 @@ def track(self):
         else:
             self.decode_end_events.append(time.perf_counter())
 
-        if not self.is_engine and self.is_distributed:
-            torch.distributed.barrier()
-
         self.prefilled = False
 
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):

diff --git a/optimum_benchmark/trackers/memory.py b/optimum_benchmark/trackers/memory.py
@@ -11,15 +11,12 @@
     is_pynvml_available,
     is_pyrsmi_available,
     is_torch_available,
-    is_torch_distributed_available,
 )
 from ..system_utils import is_nvidia_system, is_rocm_system
 
 if is_rocm_system() and is_pyrsmi_available():
     from pyrsmi import rocml
 
-if is_torch_distributed_available():
-    import torch.distributed
 
 if is_nvidia_system() and is_pynvml_available():
     import pynvml
@@ -102,9 +99,7 @@ def __init__(self, device: str, backend: str, device_ids: Optional[Union[str, in
         self.monitored_pid = os.getpid()
 
         self.is_gpu = device == "cuda"
-        self.is_engine = backend in ["vllm", "tensorrt-llm"]
         self.is_pytorch_cuda = (self.backend, self.device) == ("pytorch", "cuda")
-        self.is_distributed = is_torch_distributed_available() and torch.distributed.is_initialized()
 
         LOGGER.info(f"\t+ Tracking RAM memory of process [{self.monitored_pid}]")
 
@@ -147,31 +142,25 @@ def reset(self):
 
     @contextmanager
     def track(self):
-        if not self.is_engine and self.is_distributed:
-            torch.distributed.barrier()
-
         if self.is_pytorch_cuda:
             yield from self._cuda_pytorch_memory()
         elif self.is_gpu:
             yield from self._gpu_memory()
         else:
             yield from self._cpu_memory()
 
-        if not self.is_engine and self.is_distributed:
-            torch.distributed.barrier()
-
     def _cuda_pytorch_memory(self):
         self.max_allocated_memory = 0
         self.max_reserved_memory = 0
 
-        torch.cuda.synchronize()
-
         for device in range(self.num_pytorch_devices):
             try:
                 torch.cuda.reset_peak_memory_stats(device=device)
             except Exception as e:
                 LOGGER.warning(f"\t\t+ Could not reset max memory stats for device {device}: {e}")
 
+        torch.cuda.synchronize()
+
         yield from self._gpu_memory()
 
         torch.cuda.synchronize()

diff --git a/tests/configs/_inference_.yaml b/tests/configs/_inference_.yaml
@@ -11,10 +11,11 @@ scenario:
 
   input_shapes:
     batch_size: 1
+    sequence_length: 16
 
   generate_kwargs:
-    max_new_tokens: 5
-    min_new_tokens: 5
+    max_new_tokens: 16
+    min_new_tokens: 16
 
   call_kwargs:
     num_inference_steps: 2