
Commit 7217f43

Merge remote-tracking branch 'upstream/main' into main

2 parents: b7a5618 + 263d8ee

885 files changed: +86648 -24137 lines changed

.buildkite/check-wheel-size.py

Lines changed: 21 additions & 14 deletions
@@ -1,36 +1,43 @@
 import os
+import sys
 import zipfile
 
-MAX_SIZE_MB = 250
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))
 
 
 def print_top_10_largest_files(zip_file):
+    """Print the top 10 largest files in the given zip file."""
     with zipfile.ZipFile(zip_file, 'r') as z:
         file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
         file_sizes.sort(key=lambda x: x[1], reverse=True)
         for f, size in file_sizes[:10]:
-            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
+            print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.")
 
 
 def check_wheel_size(directory):
+    """Check the size of .whl files in the given directory."""
     for root, _, files in os.walk(directory):
-        for f in files:
-            if f.endswith(".whl"):
-                wheel_path = os.path.join(root, f)
-                wheel_size = os.path.getsize(wheel_path)
-                wheel_size_mb = wheel_size / (1024 * 1024)
-                if wheel_size_mb > MAX_SIZE_MB:
-                    print(
-                        f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
-                        f"compare to the allowed size ({MAX_SIZE_MB} MB).")
+        for file_name in files:
+            if file_name.endswith(".whl"):
+                wheel_path = os.path.join(root, file_name)
+                wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
+                if wheel_size_mb > VLLM_MAX_SIZE_MB:
+                    print(f"Not allowed: Wheel {wheel_path} is larger "
+                          f"({wheel_size_mb:.2f} MB) than the limit "
+                          f"({VLLM_MAX_SIZE_MB} MB).")
                     print_top_10_largest_files(wheel_path)
                     return 1
                 else:
                     print(f"Wheel {wheel_path} is within the allowed size "
-                          f"({wheel_size_mb} MB).")
+                          f"({wheel_size_mb:.2f} MB).")
     return 0
 
 
 if __name__ == "__main__":
-    import sys
-    sys.exit(check_wheel_size(sys.argv[1]))
+    if len(sys.argv) < 2:
+        print("Usage: python check-wheel-size.py <directory>")
+        sys.exit(1)
+
+    directory = sys.argv[1]
+    sys.exit(check_wheel_size(directory))
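Editor's note: a minimal sketch (not part of this commit) of how a build step could override the new size ceiling via the environment variable before running the check. The `dist/` directory and the 300 MB value are illustrative assumptions.

```python
# Hedged sketch: invoke the updated checker with a custom size ceiling.
# "dist/" and "300" are illustrative values, not taken from the commit.
import os
import subprocess

env = dict(os.environ, VLLM_MAX_SIZE_MB="300")  # overrides the 250 MB default
result = subprocess.run(
    ["python", ".buildkite/check-wheel-size.py", "dist/"],
    env=env,
)
print("wheel size check exit code:", result.returncode)
```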
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
+model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.764
+  - name: "exact_match,flexible-extract"
+    value: 0.764
+limit: 250
+num_fewshot: 5
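Editor's note: a rough sketch (not part of the diff) of reading a baseline config of this shape: a model name plus per-task ground-truth metric values, a sample limit, and a few-shot count. It assumes PyYAML is installed, and the file path is a placeholder.

```python
# Hedged sketch: load an lm-eval baseline config of the shape shown above.
# "configs/example-baseline.yaml" is a placeholder path, not a file in the commit.
import yaml

with open("configs/example-baseline.yaml") as f:
    cfg = yaml.safe_load(f)

print("model:", cfg["model_name"])
for task in cfg["tasks"]:
    for metric in task["metrics"]:
        print(f'{task["name"]} | {metric["name"]} -> expected {metric["value"]}')
print("limit:", cfg["limit"], "num_fewshot:", cfg["num_fewshot"])
```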

.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml

Lines changed: 2 additions & 2 deletions
@@ -4,8 +4,8 @@ tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.409
+    value: 0.419
   - name: "exact_match,flexible-extract"
-    value: 0.406
+    value: 0.416
 limit: 1000
 num_fewshot: 5

.buildkite/lm-eval-harness/configs/models-small.txt

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 Meta-Llama-3-8B-Instruct.yaml
-Meta-Llama-3-8B-Instruct-FP8.yaml
 Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
 Minitron-4B-Base-FP8.yaml

.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
+# pip install lm-eval==0.4.4
 
 usage() {
     echo``

.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.3
+# pip install lm-eval==0.4.4
 
 usage() {
     echo``

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 6 additions & 1 deletion
@@ -49,10 +49,15 @@ def test_lm_eval_correctness():
     results = launch_lm_eval(eval_config)
 
     # Confirm scores match ground truth.
+    success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
             ground_truth = metric["value"]
             measured_value = results["results"][task["name"]][metric["name"]]
             print(f'{task["name"]} | {metric["name"]}: '
                   f'ground_truth={ground_truth} | measured={measured_value}')
-            assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
+            success = success and numpy.isclose(
+                ground_truth, measured_value, rtol=RTOL)
+
+    # Assert at the end, print all scores even on failure for debugging.
+    assert success
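Editor's note: the change above swaps a per-metric `assert` for an accumulate-then-assert pattern, so every score gets printed before the test fails. A self-contained sketch of that pattern follows; the RTOL value and the example dictionaries are illustrative, not taken from the repo.

```python
# Hedged sketch of the accumulate-then-assert pattern used in this diff.
import numpy

RTOL = 0.05  # illustrative relative tolerance

def scores_match(eval_config: dict, results: dict) -> bool:
    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured = results["results"][task["name"]][metric["name"]]
            print(f'{task["name"]} | {metric["name"]}: '
                  f'ground_truth={ground_truth} | measured={measured}')
            # Keep iterating on mismatch; only remember that something failed.
            success = success and numpy.isclose(ground_truth, measured, rtol=RTOL)
    return success

eval_config = {"tasks": [{"name": "gsm8k",
                          "metrics": [{"name": "exact_match,strict-match",
                                       "value": 0.764}]}]}
results = {"results": {"gsm8k": {"exact_match,strict-match": 0.75}}}
# The assertion is evaluated only after every score has been printed
# (it passes here, since 0.75 is within RTOL of 0.764).
assert scores_match(eval_config, results)
```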

.buildkite/nightly-benchmarks/benchmark-pipeline.yaml

Lines changed: 1 addition & 2 deletions
@@ -8,8 +8,7 @@ steps:
           containers:
           - image: badouralix/curl-jq
             command:
-            - sh
-            - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
   - wait
   - label: "A100"
     agents:
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+
+## Description
+
+This file contains the downloading link for benchmarking results.
+
+- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
+- [benchmarking results](artifact://results.zip)
+- [benchmarking code](artifact://nightly-benchmarks.zip)
+
+Please download the visualization scripts in the post
+
+
+## Results reproduction
+
+- Find the docker we use in `benchmarking pipeline`
+- Deploy the docker, and inside the docker:
+  - Download `nightly-benchmarks.zip`.
+  - In the same folder, run the following code
+```
+export HF_TOKEN=<your HF token>
+apt update
+apt install -y git
+unzip nightly-benchmarks.zip
+VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+```
+
+And the results will be inside `./benchmarks/results`.
+
Lines changed: 36 additions & 42 deletions
@@ -1,45 +1,39 @@
 
 # Nightly benchmark
 
-The main goal of this benchmarking is two-fold:
-- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload.
-- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md]().
-
-
-## Docker images
-
-We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
-- vllm/vllm-openai:v0.5.0.post1
-- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-- openmmlab/lmdeploy:v0.5.0
-- ghcr.io/huggingface/text-generation-inference:2.1
-
-<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
-
-
-## Hardware
-
-One AWS node with 8x NVIDIA A100 GPUs.
-
-
-## Workload description
-
-We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
-
-- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed).
-- Output length: the corresponding output length of these 500 prompts.
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
-- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
-
-<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
-
-## Plots
-
-In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
-
-<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
-
-## Results
-
-{nightly_results_benchmarking_table}
+This benchmark aims to:
+- Provide performance clarity: clarify which engine (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in which workload.
+- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following the reproduction instructions.
+
+Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
+
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+
+
+## Setup
+
+- Docker images:
+  - vLLM: `vllm/vllm-openai:v0.6.2`
+  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
+  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
+  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
+    - *NOTE: we use r24.07 because the current implementation only works for this version. We are going to bump this up.*
+  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
+- Hardware
+  - 8x Nvidia A100 GPUs
+- Workload:
+  - Dataset
+    - ShareGPT dataset
+    - Prefill-heavy dataset (on average 462 input tokens, 16 output tokens)
+    - Decode-heavy dataset (on average 462 input tokens, 256 output tokens)
+    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of the datasets we use.
+  - Models: llama-3 8B, llama-3 70B.
+    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07 ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
+  - Average QPS (queries per second): 2, 4, 8, 16, 32 and inf. Arrival times are drawn as described below.
+    - Queries are randomly sampled, and arrival patterns are determined via a Poisson process, all with a fixed random seed.
+  - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
+
+# Known issues
+
+- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
+- TGI does not support `ignore-eos` flag.
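Editor's note: the workload description above says query arrivals follow a Poisson process with a fixed random seed. A minimal sketch of what that can look like; the QPS value and request count below are illustrative, not taken from the benchmark scripts.

```python
# Hedged sketch: Poisson arrivals at an average rate of `qps`, reproducible via a fixed seed.
import numpy as np

def poisson_arrival_times(num_requests: int, qps: float, seed: int = 0) -> np.ndarray:
    """Cumulative arrival times (in seconds) for a Poisson process with rate `qps`."""
    rng = np.random.default_rng(seed)
    # Inter-arrival gaps of a Poisson process are i.i.d. exponential with mean 1/qps.
    gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
    return np.cumsum(gaps)

print(poisson_arrival_times(num_requests=5, qps=4.0))
```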
