diff --git a/README.md b/README.md
index 4a7d2878..a446cbd9 100644
--- a/README.md
+++ b/README.md
@@ -203,7 +203,7 @@ you need to specify a different `shm-region-prefix-name` for each server. See
 for more information.
 
 ## Triton Metrics
-Starting with the 24.08 release of Triton, users can now obtain partial
+Starting with the 24.08 release of Triton, users can now obtain specific
 vLLM metrics by querying the Triton metrics endpoint (see complete vLLM metrics
 [here](https://docs.vllm.ai/en/latest/serving/metrics.html)). This can be
 accomplished by launching a Triton server in any of the ways described above
@@ -213,9 +213,19 @@ the following:
 ```bash
 curl localhost:8002/metrics
 ```
-VLLM stats are reported by the metrics endpoint in fields that
-are prefixed with `vllm:`. Your output for these fields should look
-similar to the following:
+vLLM stats are reported by the metrics endpoint in fields that are prefixed with
+`vllm:`. Triton currently supports reporting the following metrics from vLLM:
+```bash
+# Number of prefill tokens processed.
+counter_prompt_tokens
+# Number of generation tokens processed.
+counter_generation_tokens
+# Histogram of time to first token in seconds.
+histogram_time_to_first_token
+# Histogram of time per output token in seconds.
+histogram_time_per_output_token
+```
+Your output for these fields should look similar to the following:
 ```bash
 # HELP vllm:prompt_tokens_total Number of prefill tokens processed.
 # TYPE vllm:prompt_tokens_total counter
@@ -223,6 +233,22 @@ vllm:prompt_tokens_total{model="vllm_model",version="1"} 10
 # HELP vllm:generation_tokens_total Number of generation tokens processed.
 # TYPE vllm:generation_tokens_total counter
 vllm:generation_tokens_total{model="vllm_model",version="1"} 16
+# HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds.
+# TYPE vllm:time_to_first_token_seconds histogram
+vllm:time_to_first_token_seconds_count{model="vllm_model",version="1"} 1
+vllm:time_to_first_token_seconds_sum{model="vllm_model",version="1"} 0.03233122825622559
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.001"} 0
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.005"} 0
+...
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:time_per_output_token_seconds Histogram of time per output token in seconds.
+# TYPE vllm:time_per_output_token_seconds histogram
+vllm:time_per_output_token_seconds_count{model="vllm_model",version="1"} 15
+vllm:time_per_output_token_seconds_sum{model="vllm_model",version="1"} 0.04501533508300781
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.01"} 14
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.025"} 15
+...
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 15
 ```
 To enable vLLM engine metrics collection, the "disable_log_stats" option needs to be either false or left empty (false by default) in [model.json](https://github.com/triton-inference-server/vllm_backend/blob/main/samples/model_repository/vllm_model/1/model.json).
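As a scripted alternative to the curl command in the README hunk above, here is a minimal Python sketch. It is an illustration only, not part of the patch; it assumes a server is already running with the default metrics port 8002 and that the third-party `requests` package is installed.

```python
# Illustration (not part of the patch): fetch Triton's Prometheus-format
# metrics and keep only the vLLM families. Assumes the server from the
# curl example is running with the default metrics port 8002 and that
# the third-party `requests` package is installed.
import requests

response = requests.get("http://localhost:8002/metrics", timeout=5)
response.raise_for_status()

vllm_lines = [
    line
    for line in response.text.splitlines()
    # keep sample lines plus their HELP/TYPE comment lines
    if line.startswith(("vllm:", "# HELP vllm:", "# TYPE vllm:"))
]
print("\n".join(vllm_lines))
```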
diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
index d2059057..dbb6124c 100644
--- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
+++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
@@ -125,6 +125,15 @@ def test_vllm_metrics(self):
         # vllm:generation_tokens_total
         self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48)
 
+        # vllm:time_to_first_token_seconds
+        self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_count"], 3)
+        self.assertGreater(metrics_dict["vllm:time_to_first_token_seconds_sum"], 0)
+        self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_bucket"], 3)
+        # vllm:time_per_output_token_seconds
+        self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45)
+        self.assertGreater(metrics_dict["vllm:time_per_output_token_seconds_sum"], 0)
+        self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_bucket"], 45)
+
     def test_vllm_metrics_disabled(self):
         # Test vLLM metrics
         self.vllm_infer(
diff --git a/src/utils/metrics.py b/src/utils/metrics.py
index fc6e69bd..5f007b02 100644
--- a/src/utils/metrics.py
+++ b/src/utils/metrics.py
@@ -24,7 +24,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from typing import Dict, Union
+from typing import Dict, List, Union
 
 import triton_python_backend_utils as pb_utils
 from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase
@@ -46,6 +46,16 @@ def __init__(self, labels):
             description="Number of generation tokens processed.",
             kind=pb_utils.MetricFamily.COUNTER,
         )
+        self.histogram_time_to_first_token_family = pb_utils.MetricFamily(
+            name="vllm:time_to_first_token_seconds",
+            description="Histogram of time to first token in seconds.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
+        self.histogram_time_per_output_token_family = pb_utils.MetricFamily(
+            name="vllm:time_per_output_token_seconds",
+            description="Histogram of time per output token in seconds.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
 
         # Initialize metrics
         # Iteration stats
@@ -55,6 +65,51 @@ def __init__(self, labels):
         self.counter_generation_tokens = self.counter_generation_tokens_family.Metric(
             labels=labels
         )
+        # Use the same bucket boundaries as vLLM's sample metrics.
+        # https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96
+        self.histogram_time_to_first_token = (
+            self.histogram_time_to_first_token_family.Metric(
+                labels=labels,
+                buckets=[
+                    0.001,
+                    0.005,
+                    0.01,
+                    0.02,
+                    0.04,
+                    0.06,
+                    0.08,
+                    0.1,
+                    0.25,
+                    0.5,
+                    0.75,
+                    1.0,
+                    2.5,
+                    5.0,
+                    7.5,
+                    10.0,
+                ],
+            )
+        )
+        self.histogram_time_per_output_token = (
+            self.histogram_time_per_output_token_family.Metric(
+                labels=labels,
+                buckets=[
+                    0.01,
+                    0.025,
+                    0.05,
+                    0.075,
+                    0.1,
+                    0.15,
+                    0.2,
+                    0.3,
+                    0.4,
+                    0.5,
+                    0.75,
+                    1.0,
+                    2.5,
+                ],
+            )
+        )
 
 
 class VllmStatLogger(VllmStatLoggerBase):
@@ -82,6 +137,19 @@ def _log_counter(self, counter, data: Union[int, float]) -> None:
         if data != 0:
             counter.increment(data)
 
+    def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None:
+        """Convenience function for logging a list of values to a histogram.
+
+        Args:
+            histogram: A histogram metric instance.
+            data: A list of int or float data to observe into the histogram metric.
+
+        Returns:
+            None
+        """
+        for datum in data:
+            histogram.observe(datum)
+
     def log(self, stats: VllmStats) -> None:
         """Report stats to Triton metrics server.
 
@@ -97,3 +165,10 @@ def log(self, stats: VllmStats) -> None:
         self._log_counter(
             self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter
         )
+        self._log_histogram(
+            self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter
+        )
+        self._log_histogram(
+            self.metrics.histogram_time_per_output_token,
+            stats.time_per_output_tokens_iter,
+        )
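To make the logging path concrete, here is a small self-contained sketch. `StubHistogram` and `log_histogram` are hypothetical stand-ins (the real histogram comes from `pb_utils` and the loop lives in `VllmStatLogger._log_histogram` above); it shows how one iteration's list of timings fans out into individual `observe` calls, which is what produces the `_count`, `_sum`, and `_bucket` series shown in the README output.

```python
# Illustration only: StubHistogram is a hypothetical stand-in for a
# pb_utils histogram Metric; the latency list mimics one iteration's
# stats.time_to_first_tokens_iter from vLLM.
from typing import List, Union


class StubHistogram:
    """Records observations the way a histogram metric would."""

    def __init__(self) -> None:
        self.observations: List[float] = []

    def observe(self, value: Union[int, float]) -> None:
        self.observations.append(float(value))


def log_histogram(histogram, data: Union[List[int], List[float]]) -> None:
    # Same loop as _log_histogram in src/utils/metrics.py above.
    for datum in data:
        histogram.observe(datum)


ttft = StubHistogram()
log_histogram(ttft, [0.032, 0.041, 0.029])  # three requests' first-token times
assert len(ttft.observations) == 3  # matches the _count series; their total is _sum
```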