Skip to content

Commit

Permalink
Removing barriers (#273)
Browse files: browse the repository at this point in the history
  • Loading branch information
IlyasMoutawwakil committed Sep 23, 2024
1 parent 01e4e59 commit 1c43fa2
Show file tree
Hide file tree
Showing 23 changed files with 29 additions and 75 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test_api_cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,4 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
PUSH_REPO_ID: optimum-benchmark/cpu
run: |
pytest -s -k "api and cpu"
pytest tests/test_api.py -s -k "api and cpu"
2 changes: 1 addition & 1 deletion .github/workflows/test_api_cuda.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,4 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
PUSH_REPO_ID: optimum-benchmark/cuda
run: |
pytest -s -x -k "api and cuda"
pytest tests/test_api.py -x -s -k "api and cuda"
2 changes: 1 addition & 1 deletion .github/workflows/test_api_misc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,4 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
PUSH_REPO_ID: optimum-benchmark/misc-${{ matrix.os }}-${{ matrix.python }}
run: |
pytest -s -k "api and not (cpu or cuda or rocm or mps)"
pytest tests/test_api.py -s -k "api and not (cpu or cuda or rocm or mps)"
2 changes: 1 addition & 1 deletion .github/workflows/test_api_rocm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,4 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
PUSH_REPO_ID: optimum-benchmark/rocm
run: |
pytest -s -x -k "api and cuda"
pytest tests/test_api.py -x -s -k "api and cuda"
2 changes: 1 addition & 1 deletion .github/workflows/test_cli_cpu_ipex.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,4 @@ jobs:
pip install -e .[testing,ipex,diffusers,timm]
- name: Run tests
run: pytest -s -k "cli and cpu and ipex"
run: pytest tests/test_cli.py -s -k "cli and cpu and ipex"
2 changes: 1 addition & 1 deletion .github/workflows/test_cli_cpu_llama_cpp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,4 @@ jobs:
pip install -e .[testing,llama-cpp]
- name: Run tests
run: pytest -s -k "llama_cpp"
run: pytest tests/test_cli.py -s -k "llama_cpp"
2 changes: 1 addition & 1 deletion .github/workflows/test_cli_cpu_neural_compressor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,4 @@ jobs:
pip install -e .[testing,neural-compressor,diffusers,timm]
- name: Run tests
run: pytest -s -k "cli and cpu and neural_compressor"
run: pytest tests/test_cli.py -s -k "cli and cpu and neural_compressor"
2 changes: 1 addition & 1 deletion .github/workflows/test_cli_cpu_onnxruntime.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,4 @@ jobs:
pip install -e .[testing,onnxruntime,diffusers,timm]
- name: Run tests
run: pytest -s -k "cli and cpu and onnxruntime"
run: pytest tests/test_cli.py -s -k "cli and cpu and onnxruntime"
2 changes: 1 addition & 1 deletion .github/workflows/test_cli_cpu_openvino.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,4 @@ jobs:
pip install -e .[testing,openvino,diffusers,timm]
- name: Run tests
run: pytest -s -k "cli and cpu and openvino"
run: pytest tests/test_cli.py -s -k "cli and cpu and openvino"
2 changes: 1 addition & 1 deletion .github/workflows/test_cli_cpu_py_txi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,4 @@ jobs:
pip install -e .[testing,py-txi]
- name: Run tests
run: pytest -s -k "cli and cpu and py_txi"
run: pytest tests/test_cli.py -s -k "cli and cpu and py_txi"
2 changes: 1 addition & 1 deletion .github/workflows/test_cli_cpu_pytorch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,4 @@ jobs:
pip install -e .[testing,diffusers,timm,peft]
- name: Run tests
run: pytest -s -k "cli and cpu and pytorch"
run: pytest tests/test_cli.py -s -k "cli and cpu and pytorch"
2 changes: 1 addition & 1 deletion .github/workflows/test_cli_cuda_onnxruntime.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,4 @@ jobs:
- name: Run tests
run: |
pytest -x -s -k "cli and cuda and onnxruntime"
pytest tests/test_cli.py -x -s -k "cli and cuda and onnxruntime"
2 changes: 1 addition & 1 deletion .github/workflows/test_cli_cuda_py_txi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,4 @@ jobs:
pip install -e .[testing,py-txi]
- name: Run tests
run: pytest -s -k "cli and cuda and py_txi"
run: pytest tests/test_cli.py -x -s -k "cli and cuda and py_txi"
4 changes: 2 additions & 2 deletions .github/workflows/test_cli_cuda_pytorch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
- name: Run tests
run: |
pytest -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed)"
pytest tests/test_cli.py -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed)"
run_cli_cuda_pytorch_multi_gpu_tests:
if: ${{
Expand Down Expand Up @@ -76,4 +76,4 @@ jobs:
- name: Run tests
run: |
pytest -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map or deepspeed)"
FORCE_SERIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map or deepspeed)"
2 changes: 1 addition & 1 deletion .github/workflows/test_cli_cuda_tensorrt_llm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,4 @@ jobs:
- name: Run tests
run: |
pytest -x -s -k "cli and cuda and tensorrt_llm"
pytest tests/test_cli.py -x -s -k "cli and cuda and tensorrt_llm"
4 changes: 2 additions & 2 deletions .github/workflows/test_cli_cuda_torch_ort.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
- name: Run tests
run: |
pytest -x -s -k "cli and cuda and torch_ort and not (dp or ddp or device_map) and not (peft)"
pytest tests/test_cli.py -x -s -k "cli and cuda and torch_ort and not (dp or ddp or device_map) and not (peft)"
run_cli_cuda_torch_ort_multi_gpu_tests:
if: ${{
Expand Down Expand Up @@ -78,4 +78,4 @@ jobs:
- name: Run tests
run: |
pytest -x -s -k "cli and cuda and torch_ort and (dp or ddp or device_map) and not (peft)"
FORCE_SERIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and torch_ort and (dp or ddp or device_map) and not (peft)"
4 changes: 2 additions & 2 deletions .github/workflows/test_cli_cuda_vllm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
- name: Run tests
run: |
FORCE_SERIAL=1 pytest -x -s -k "cli and cuda and vllm and not (tp or pp)"
FORCE_SERIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and vllm and not (tp or pp)"
run_cli_cuda_vllm_multi_gpu_tests:
if: ${{
Expand Down Expand Up @@ -76,4 +76,4 @@ jobs:
- name: Run tests
run: |
FORCE_SERIAL=1 pytest -x -s -k "cli and cuda and vllm and (tp or pp)"
FORCE_SERIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and vllm and (tp or pp)"
2 changes: 1 addition & 1 deletion .github/workflows/test_cli_misc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,4 +61,4 @@ jobs:
pip install -e .[testing]
- name: Run tests
run: pytest -s -k "cli and not (cpu or cuda or rocm or mps)"
run: pytest tests/test_cli.py -s -k "cli and not (cpu or cuda or rocm or mps)"
6 changes: 2 additions & 4 deletions .github/workflows/test_cli_rocm_pytorch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ jobs:
--device /dev/kfd
--device /dev/dri
--env ROCR_VISIBLE_DEVICES
--env HIP_VISIBLE_DEVICES=0

steps:
- name: Checkout code
Expand All @@ -52,7 +51,7 @@ jobs:
- name: Run tests
run: |
pytest -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not bnb"
pytest tests/test_cli.py -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not bnb"
run_cli_rocm_pytorch_multi_gpu_tests:
if: ${{
Expand All @@ -74,7 +73,6 @@ jobs:
--device /dev/kfd
--device /dev/dri
--env ROCR_VISIBLE_DEVICES
--env HIP_VISIBLE_DEVICES=0,1

steps:
- name: Checkout code
Expand All @@ -86,4 +84,4 @@ jobs:
- name: Run tests
run: |
pytest -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map or deepspeed) and not bnb"
FORCE_SERIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and pytorch and (dp or ddp or device_map or deepspeed) and not bnb"
13 changes: 1 addition & 12 deletions optimum_benchmark/trackers/energy.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,11 @@
from logging import getLogger
from typing import List, Literal, Optional, Union

from ..import_utils import is_codecarbon_available, is_torch_available, is_torch_distributed_available
from ..import_utils import is_codecarbon_available, is_torch_available

if is_torch_available():
import torch

if is_torch_distributed_available():
import torch.distributed

if is_codecarbon_available():
from codecarbon import EmissionsTracker, OfflineEmissionsTracker
from codecarbon.output import EmissionsData
Expand Down Expand Up @@ -115,9 +112,7 @@ def __init__(self, backend: str, device: str, device_ids: Optional[Union[str, in
self.device_ids = device_ids

self.is_gpu = self.device == "cuda"
self.is_engine = self.backend in ["vllm", "tensorrt-llm"]
self.is_pytorch_cuda = (self.backend, self.device) == ("pytorch", "cuda")
self.is_distributed = is_torch_distributed_available() and torch.distributed.is_initialized()

LOGGER.info("\t+ Tracking CPU and RAM energy")

Expand Down Expand Up @@ -188,19 +183,13 @@ def __init__(self, backend: str, device: str, device_ids: Optional[Union[str, in

@contextmanager
def track(self, file_prefix: str = "task"):
if not self.is_engine and self.is_distributed:
torch.distributed.barrier()

if self.is_pytorch_cuda:
torch.cuda.synchronize()

self.emission_tracker.start_task()

yield

if not self.is_engine and self.is_distributed:
torch.distributed.barrier()

if self.is_pytorch_cuda:
torch.cuda.synchronize()

Expand Down
23 changes: 0 additions & 23 deletions optimum_benchmark/trackers/latency.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,6 @@
from logging import getLogger
from typing import List, Literal, Optional, Union

from ..import_utils import is_torch_distributed_available

if is_torch_distributed_available():
import torch.distributed

import numpy as np
import torch
from transformers import LogitsProcessor, TrainerCallback
Expand Down Expand Up @@ -123,9 +118,7 @@ def __init__(self, device: str, backend: str):
self.device = device
self.backend = backend

self.is_engine = self.backend in ["vllm", "tensorrt-llm"]
self.is_pytorch_cuda = (self.backend, self.device) == ("pytorch", "cuda")
self.is_distributed = is_torch_distributed_available() and torch.distributed.is_initialized()

if self.is_pytorch_cuda:
LOGGER.info("\t+ Tracking latency using Pytorch CUDA events")
Expand All @@ -143,17 +136,11 @@ def reset(self):

@contextmanager
def track(self):
if not self.is_engine and self.is_distributed:
torch.distributed.barrier()

if self.is_pytorch_cuda:
yield from self._pytorch_cuda_latency()
else:
yield from self._cpu_latency()

if not self.is_engine and self.is_distributed:
torch.distributed.barrier()

def _pytorch_cuda_latency(self):
self.start_events.append(torch.cuda.Event(enable_timing=True))
self.start_events[-1].record()
Expand Down Expand Up @@ -259,9 +246,7 @@ def __init__(self, device: str, backend: str):
self.device = device
self.backend = backend

self.is_engine = self.backend in ["vllm", "tensorrt-llm"]
self.is_pytorch_cuda = (self.backend, self.device) == ("pytorch", "cuda")
self.is_distributed = is_torch_distributed_available() and torch.distributed.is_initialized()

if self.is_pytorch_cuda:
LOGGER.info("\t+ Tracking latency using Pytorch CUDA events")
Expand Down Expand Up @@ -292,17 +277,12 @@ def track(self):
self.prefilled = False
self.per_token_events.append([])

if not self.is_engine and self.is_distributed:
torch.distributed.barrier()

if self.is_pytorch_cuda:
self.prefill_start_events.append(torch.cuda.Event(enable_timing=True))
self.prefill_start_events[-1].record()
else:
self.prefill_start_events.append(time.perf_counter())

# this is where generate is called,
# and for each decoded token, we record an event
yield

if self.is_pytorch_cuda:
Expand All @@ -311,9 +291,6 @@ def track(self):
else:
self.decode_end_events.append(time.perf_counter())

if not self.is_engine and self.is_distributed:
torch.distributed.barrier()

self.prefilled = False

def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
Expand Down
15 changes: 2 additions & 13 deletions optimum_benchmark/trackers/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,12 @@
is_pynvml_available,
is_pyrsmi_available,
is_torch_available,
is_torch_distributed_available,
)
from ..system_utils import is_nvidia_system, is_rocm_system

if is_rocm_system() and is_pyrsmi_available():
from pyrsmi import rocml

if is_torch_distributed_available():
import torch.distributed

if is_nvidia_system() and is_pynvml_available():
import pynvml
Expand Down Expand Up @@ -102,9 +99,7 @@ def __init__(self, device: str, backend: str, device_ids: Optional[Union[str, in
self.monitored_pid = os.getpid()

self.is_gpu = device == "cuda"
self.is_engine = backend in ["vllm", "tensorrt-llm"]
self.is_pytorch_cuda = (self.backend, self.device) == ("pytorch", "cuda")
self.is_distributed = is_torch_distributed_available() and torch.distributed.is_initialized()

LOGGER.info(f"\t+ Tracking RAM memory of process [{self.monitored_pid}]")

Expand Down Expand Up @@ -147,31 +142,25 @@ def reset(self):

@contextmanager
def track(self):
if not self.is_engine and self.is_distributed:
torch.distributed.barrier()

if self.is_pytorch_cuda:
yield from self._cuda_pytorch_memory()
elif self.is_gpu:
yield from self._gpu_memory()
else:
yield from self._cpu_memory()

if not self.is_engine and self.is_distributed:
torch.distributed.barrier()

def _cuda_pytorch_memory(self):
self.max_allocated_memory = 0
self.max_reserved_memory = 0

torch.cuda.synchronize()

for device in range(self.num_pytorch_devices):
try:
torch.cuda.reset_peak_memory_stats(device=device)
except Exception as e:
LOGGER.warning(f"\t\t+ Could not reset max memory stats for device {device}: {e}")

torch.cuda.synchronize()

yield from self._gpu_memory()

torch.cuda.synchronize()
Expand Down
5 changes: 3 additions & 2 deletions tests/configs/_inference_.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@ scenario:

input_shapes:
batch_size: 1
sequence_length: 16

generate_kwargs:
max_new_tokens: 5
min_new_tokens: 5
max_new_tokens: 16
min_new_tokens: 16

call_kwargs:
num_inference_steps: 2

0 comments on commit 1c43fa2

Please sign in to comment.