From de7ff8f6b1d7f54c234e263cdc80d8fd6aa49092 Mon Sep 17 00:00:00 2001
From: Yingge He
Date: Sun, 11 Aug 2024 17:53:44 -0700
Subject: [PATCH 1/6] Add histogram test

---
 .../metrics_test/vllm_metrics_test.py | 11 +++
 src/utils/metrics.py                  | 75 ++++++++++++++++++-
 2 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
index d2059057..fbe6675f 100644
--- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
+++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
@@ -125,6 +125,17 @@ def test_vllm_metrics(self):
         # vllm:generation_tokens_total
         self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48)

+        # vllm:time_to_first_token_seconds
+        self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_count"], 3)
+        self.assertTrue(
+            0 < metrics_dict["vllm:time_to_first_token_seconds_sum"] < 0.0005
+        )
+        # vllm:time_per_output_token_seconds
+        self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45)
+        self.assertTrue(
+            0 <= metrics_dict["vllm:time_per_output_token_seconds_sum"] <= 0.005
+        )
+
     def test_vllm_metrics_disabled(self):
         # Test vLLM metrics
         self.vllm_infer(
diff --git a/src/utils/metrics.py b/src/utils/metrics.py
index fc6e69bd..0374fa3b 100644
--- a/src/utils/metrics.py
+++ b/src/utils/metrics.py
@@ -24,7 +24,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-from typing import Dict, Union
+from typing import Dict, List, Union

 import triton_python_backend_utils as pb_utils
 from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase
@@ -46,6 +46,16 @@ def __init__(self, labels):
             description="Number of generation tokens processed.",
             kind=pb_utils.MetricFamily.COUNTER,
         )
+        self.histogram_time_to_first_token_family = pb_utils.MetricFamily(
+            name="vllm:time_to_first_token_seconds",
+            description="Histogram of time to first token in seconds.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
+        self.histogram_time_per_output_token_family = pb_utils.MetricFamily(
+            name="vllm:time_per_output_token_seconds",
+            description="Histogram of time per output token in seconds.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )

         # Initialize metrics
         # Iteration stats
@@ -55,6 +65,49 @@ def __init__(self, labels):
         self.counter_generation_tokens = self.counter_generation_tokens_family.Metric(
             labels=labels
         )
+        self.histogram_time_to_first_token = (
+            self.histogram_time_to_first_token_family.Metric(
+                labels=labels,
+                buckets=[
+                    0.001,
+                    0.005,
+                    0.01,
+                    0.02,
+                    0.04,
+                    0.06,
+                    0.08,
+                    0.1,
+                    0.25,
+                    0.5,
+                    0.75,
+                    1.0,
+                    2.5,
+                    5.0,
+                    7.5,
+                    10.0,
+                ],
+            )
+        )
+        self.histogram_time_per_output_token = (
+            self.histogram_time_per_output_token_family.Metric(
+                labels=labels,
+                buckets=[
+                    0.01,
+                    0.025,
+                    0.05,
+                    0.075,
+                    0.1,
+                    0.15,
+                    0.2,
+                    0.3,
+                    0.4,
+                    0.5,
+                    0.75,
+                    1.0,
+                    2.5,
+                ],
+            )
+        )


 class VllmStatLogger(VllmStatLoggerBase):
@@ -82,6 +135,19 @@ def _log_counter(self, counter, data: Union[int, float]) -> None:
         if data != 0:
             counter.increment(data)

+    def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None:
+        """Convenience function for logging a list of values to a histogram.
+
+        Args:
+            histogram: A histogram metric instance.
+            data: A list of int or float values to observe into the histogram metric.
+
+        Returns:
+            None
+        """
+        for datum in data:
+            histogram.observe(datum)
+
     def log(self, stats: VllmStats) -> None:
         """Report stats to Triton metrics server.

@@ -97,3 +163,10 @@ def log(self, stats: VllmStats) -> None:
         self._log_counter(
             self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter
         )
+        self._log_histogram(
+            self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter
+        )
+        self._log_histogram(
+            self.metrics.histogram_time_per_output_token,
+            stats.time_per_output_tokens_iter,
+        )

From 9534298678d8ef855e197fd48f35fea7045138ef Mon Sep 17 00:00:00 2001
From: Yingge He
Date: Wed, 14 Aug 2024 10:59:48 -0700
Subject: [PATCH 2/6] Longer time for A100

---
 ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
index fbe6675f..bea63ede 100644
--- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
+++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
@@ -127,14 +127,14 @@ def test_vllm_metrics(self):

         # vllm:time_to_first_token_seconds
         self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_count"], 3)
-        self.assertTrue(
-            0 < metrics_dict["vllm:time_to_first_token_seconds_sum"] < 0.0005
-        )
+        self.assertTrue(0 < metrics_dict["vllm:time_to_first_token_seconds_sum"] < 0.01)
+        self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_bucket"], 3)
         # vllm:time_per_output_token_seconds
         self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45)
         self.assertTrue(
-            0 <= metrics_dict["vllm:time_per_output_token_seconds_sum"] <= 0.005
+            0 < metrics_dict["vllm:time_per_output_token_seconds_sum"] < 0.1
         )
+        self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_bucket"], 45)

     def test_vllm_metrics_disabled(self):
         # Test vLLM metrics

From 38ac8d6435bd884d21f50b41f186c26c129bc6be Mon Sep 17 00:00:00 2001
From: Yingge He
Date: Thu, 15 Aug 2024 14:13:48 -0700
Subject: [PATCH 3/6] Update comment

---
 src/utils/metrics.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/utils/metrics.py b/src/utils/metrics.py
index 0374fa3b..5f007b02 100644
--- a/src/utils/metrics.py
+++ b/src/utils/metrics.py
@@ -65,6 +65,8 @@ def __init__(self, labels):
         self.counter_generation_tokens = self.counter_generation_tokens_family.Metric(
             labels=labels
         )
+        # Use the same bucket boundaries as vLLM's sample metrics.
+        # https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96
         self.histogram_time_to_first_token = (
             self.histogram_time_to_first_token_family.Metric(
                 labels=labels,

From ebdf14eefa07cd91b48f878299d149bab5e0e6de Mon Sep 17 00:00:00 2001
From: Yingge He
Date: Thu, 15 Aug 2024 15:51:05 -0700
Subject: [PATCH 4/6] Add histogram metrics to doc

---
 README.md | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/README.md b/README.md
index 4a7d2878..410205be 100644
--- a/README.md
+++ b/README.md
@@ -223,6 +223,45 @@ vllm:prompt_tokens_total{model="vllm_model",version="1"} 10
 # HELP vllm:generation_tokens_total Number of generation tokens processed.
 # TYPE vllm:generation_tokens_total counter
 vllm:generation_tokens_total{model="vllm_model",version="1"} 16
+# HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds.
+# TYPE vllm:time_to_first_token_seconds histogram
+vllm:time_to_first_token_seconds_count{model="vllm_model",version="1"} 1
+vllm:time_to_first_token_seconds_sum{model="vllm_model",version="1"} 0.03233122825622559
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.001"} 0
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.005"} 0
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.01"} 0
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.02"} 0
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.04"} 1
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.06"} 1
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.08"} 1
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.1"} 1
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.25"} 1
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.5"} 1
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.75"} 1
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="1"} 1
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="2.5"} 1
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="5"} 1
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="7.5"} 1
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="10"} 1
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:time_per_output_token_seconds Histogram of time per output token in seconds.
+# TYPE vllm:time_per_output_token_seconds histogram
+vllm:time_per_output_token_seconds_count{model="vllm_model",version="1"} 15
+vllm:time_per_output_token_seconds_sum{model="vllm_model",version="1"} 0.04501533508300781
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.01"} 14
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.025"} 15
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.05"} 15
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.075"} 15
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.1"} 15
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.15"} 15
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.2"} 15
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.3"} 15
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.4"} 15
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.5"} 15
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.75"} 15
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="1"} 15
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="2.5"} 15
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 15
 ```
 To enable the vLLM engine to collect metrics, the "disable_log_stats" option needs to be either false
 or left empty (false by default) in [model.json](https://github.com/triton-inference-server/vllm_backend/blob/main/samples/model_repository/vllm_model/1/model.json).
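A note on the sample output added above: these are standard Prometheus exposition lines, in which each histogram family contributes a `_count` series, a `_sum` series, and a set of cumulative `_bucket` series. The following is a minimal sketch of how a client could scrape the endpoint and fold those series into a flat dictionary, similar in spirit to the `metrics_dict` used by the CI test. It assumes a Triton server is already serving metrics on `localhost:8002`; `parse_vllm_metrics` is a hypothetical helper written for illustration, not code from this PR.

```python
# Minimal sketch: scrape Triton's metrics endpoint and collect the vLLM
# counter and histogram series documented above. Assumes a server is
# running on localhost:8002; parse_vllm_metrics is an illustrative helper.
import re
from urllib.request import urlopen


def parse_vllm_metrics(url="http://localhost:8002/metrics"):
    """Return a flat dict of vllm:* metric values from the endpoint."""
    metrics = {}
    text = urlopen(url).read().decode("utf-8")
    for line in text.splitlines():
        if not line.startswith("vllm:"):
            continue  # skip HELP/TYPE comments and non-vLLM families
        name_and_labels, value = line.rsplit(" ", 1)
        # Fold one family's histogram buckets into a {le: count} dict.
        match = re.match(r'(vllm:\w+_bucket)\{.*le="([^"]+)"\}', name_and_labels)
        if match:
            metrics.setdefault(match.group(1), {})[match.group(2)] = float(value)
        else:
            name = name_and_labels.split("{", 1)[0]
            metrics[name] = float(value)
    return metrics


if __name__ == "__main__":
    m = parse_vllm_metrics()
    ttft_count = m["vllm:time_to_first_token_seconds_count"]
    ttft_sum = m["vllm:time_to_first_token_seconds_sum"]
    print(f"requests observed: {ttft_count:.0f}")
    print(f"mean time to first token: {ttft_sum / ttft_count:.4f}s")
    # Buckets are cumulative, so the +Inf bucket always equals _count.
    assert m["vllm:time_to_first_token_seconds_bucket"]["+Inf"] == ttft_count
```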
From 0d67322d3df4959c63109bcad6c9aefb0168b536 Mon Sep 17 00:00:00 2001
From: Yingge He
Date: Fri, 16 Aug 2024 11:33:09 -0700
Subject: [PATCH 5/6] Update docs

---
 README.md | 45 ++++++++++++++++-----------------------------
 1 file changed, 16 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 410205be..a446cbd9 100644
--- a/README.md
+++ b/README.md
@@ -203,7 +203,7 @@ you need to specify a different `shm-region-prefix-name` for each server. See
 for more information.

 ## Triton Metrics
-Starting with the 24.08 release of Triton, users can now obtain partial
+Starting with the 24.08 release of Triton, users can now obtain specific
 vLLM metrics by querying the Triton metrics endpoint (see complete vLLM metrics
 [here](https://docs.vllm.ai/en/latest/serving/metrics.html)). This can be
 accomplished by launching a Triton server in any of the ways described above
@@ -213,9 +213,19 @@ the following:
 ```bash
 curl localhost:8002/metrics
 ```
-VLLM stats are reported by the metrics endpoint in fields that
-are prefixed with `vllm:`. Your output for these fields should look
-similar to the following:
+vLLM stats are reported by the metrics endpoint in fields that are prefixed with
+`vllm:`. Triton currently supports reporting of the following metrics from vLLM.
+```bash
+# Number of prefill tokens processed.
+counter_prompt_tokens
+# Number of generation tokens processed.
+counter_generation_tokens
+# Histogram of time to first token in seconds.
+histogram_time_to_first_token
+# Histogram of time per output token in seconds.
+histogram_time_per_output_token
+```
+Your output for these fields should look similar to the following:
 ```bash
 # HELP vllm:prompt_tokens_total Number of prefill tokens processed.
 # TYPE vllm:prompt_tokens_total counter
@@ -229,20 +239,7 @@ vllm:time_to_first_token_seconds_count{model="vllm_model",version="1"} 1
 vllm:time_to_first_token_seconds_sum{model="vllm_model",version="1"} 0.03233122825622559
 vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.001"} 0
 vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.005"} 0
-vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.01"} 0
-vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.02"} 0
-vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.04"} 1
-vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.06"} 1
-vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.08"} 1
-vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.1"} 1
-vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.25"} 1
-vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.5"} 1
-vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.75"} 1
-vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="1"} 1
-vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="2.5"} 1
-vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="5"} 1
-vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="7.5"} 1
-vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="10"} 1
+...
 vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
 # HELP vllm:time_per_output_token_seconds Histogram of time per output token in seconds.
 # TYPE vllm:time_per_output_token_seconds histogram
@@ -250,17 +247,4 @@ vllm:time_per_output_token_seconds_count{model="vllm_model",version="1"} 15
 vllm:time_per_output_token_seconds_sum{model="vllm_model",version="1"} 0.04501533508300781
 vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.01"} 14
 vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.025"} 15
-vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.05"} 15
-vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.075"} 15
-vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.1"} 15
-vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.15"} 15
-vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.2"} 15
-vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.3"} 15
-vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.4"} 15
-vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.5"} 15
-vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.75"} 15
-vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="1"} 15
-vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="2.5"} 15
+...
 vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 15
 ```
 To enable the vLLM engine to collect metrics, the "disable_log_stats" option needs to be either false

From 10d8a695c539297d10c156c1818042caa95583b0 Mon Sep 17 00:00:00 2001
From: Yingge He
Date: Fri, 16 Aug 2024 14:07:07 -0700
Subject: [PATCH 6/6] Make metrics test more robust

---
 ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
index bea63ede..dbb6124c 100644
--- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
+++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
@@ -127,13 +127,11 @@ def test_vllm_metrics(self):

         # vllm:time_to_first_token_seconds
         self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_count"], 3)
-        self.assertTrue(0 < metrics_dict["vllm:time_to_first_token_seconds_sum"] < 0.01)
+        self.assertTrue(metrics_dict["vllm:time_to_first_token_seconds_sum"] > 0)
         self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_bucket"], 3)
         # vllm:time_per_output_token_seconds
         self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45)
-        self.assertTrue(
-            0 < metrics_dict["vllm:time_per_output_token_seconds_sum"] < 0.1
-        )
+        self.assertTrue(metrics_dict["vllm:time_per_output_token_seconds_sum"] > 0)
         self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_bucket"], 45)

     def test_vllm_metrics_disabled(self):
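A closing note on the histogram semantics the patches above rely on: each value passed to `histogram.observe()` lands in every cumulative bucket whose upper bound `le` is at least the observed value, so the `le="+Inf"` bucket always equals the `_count` series, which is why the final test can expect the parsed bucket values to agree with the request and token counts. The sketch below is an explanatory model of that bookkeeping in plain Python, using the time-to-first-token bucket boundaries from patch 1; it is illustrative only, not the Triton or vLLM implementation.

```python
import bisect

# Upper bounds from patch 1's vllm:time_to_first_token_seconds histogram.
BUCKETS = [0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1,
           0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0]


class ToyHistogram:
    """Explanatory model of a Prometheus histogram as exposed by Triton."""

    def __init__(self, buckets):
        self.buckets = list(buckets)
        self.counts = [0] * (len(self.buckets) + 1)  # last slot is le="+Inf"
        self.total = 0    # the _count series
        self.sum = 0.0    # the _sum series

    def observe(self, value):
        # Buckets are cumulative (le means "less than or equal"), so one
        # observation increments its own bucket and every larger one.
        start = bisect.bisect_left(self.buckets, value)
        for i in range(start, len(self.counts)):
            self.counts[i] += 1
        self.total += 1
        self.sum += value


h = ToyHistogram(BUCKETS)
for ttft in [0.032, 0.011, 0.095]:  # illustrative observations, in seconds
    h.observe(ttft)

assert h.counts[-1] == h.total  # le="+Inf" always equals _count
print(f"_count {h.total}  _sum {h.sum:.3f}")
for le, count in zip(BUCKETS + [float("inf")], h.counts):
    print(f'_bucket{{le="{le}"}} {count}')
```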