From 52e3d69826c58cad57486ba2a83c24d16831c816 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin
Date: Wed, 29 Jan 2025 04:11:16 +0000
Subject: [PATCH] [V1][Metrics] Add TTFT and TPOT histograms (#12530)

Signed-off-by: Mark McLoughlin
---
 tests/entrypoints/openai/test_metrics.py |  6 ++++++
 vllm/v1/engine/output_processor.py       |  4 +++-
 vllm/v1/metrics/loggers.py               | 25 ++++++++++++++++++++++++
 vllm/v1/metrics/stats.py                 | 11 +++++++++++
 4 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 9a84c82b62fdf..901ba8e8e5ef3 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -208,6 +208,12 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:request_generation_tokens_sum",
     "vllm:request_generation_tokens_bucket",
     "vllm:request_generation_tokens_count",
+    "vllm:time_to_first_token_seconds_sum",
+    "vllm:time_to_first_token_seconds_bucket",
+    "vllm:time_to_first_token_seconds_count",
+    "vllm:time_per_output_token_seconds_sum",
+    "vllm:time_per_output_token_seconds_bucket",
+    "vllm:time_per_output_token_seconds_count",
 ]
 
 
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 39217b8090140..234ef8194ca93 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -27,6 +27,7 @@ def __init__(
         prompt: Optional[str],
         prompt_token_ids: List[int],
         detokenizer: IncrementalDetokenizer,
+        arrival_time: float,
         queue: Optional[asyncio.Queue[RequestOutput]],
     ):
         self.request_id = request_id
@@ -37,7 +38,7 @@ def __init__(
         self.is_prefilling = True
         self.queue = queue
 
-        self.stats = RequestStateStats()
+        self.stats = RequestStateStats(last_token_time=arrival_time)
 
     @classmethod
     def from_new_request(
@@ -54,6 +55,7 @@ def from_new_request(
                 tokenizer=tokenizer,
                 request=request,
             ),
+            arrival_time=request.arrival_time,
             queue=queue,
         )
 
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 87d9d63652c05..9bb24d1948651 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -121,6 +121,26 @@ def __init__(self, model_config: ModelConfig):
                 buckets=build_1_2_5_buckets(max_model_len),
                 labelnames=labelnames).labels(*labelvalues)
 
+        self.histogram_time_to_first_token = \
+            prometheus_client.Histogram(
+                name="vllm:time_to_first_token_seconds",
+                documentation="Histogram of time to first token in seconds.",
+                buckets=[
+                    0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
+                    0.75, 1.0, 2.5, 5.0, 7.5, 10.0
+                ],
+                labelnames=labelnames).labels(*labelvalues)
+
+        self.histogram_time_per_output_token = \
+            prometheus_client.Histogram(
+                name="vllm:time_per_output_token_seconds",
+                documentation="Histogram of time per output token in seconds.",
+                buckets=[
+                    0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5,
+                    0.75, 1.0, 2.5
+                ],
+                labelnames=labelnames).labels(*labelvalues)
+
     def log(self, scheduler_stats: SchedulerStats,
             iteration_stats: IterationStats):
         """Log to prometheus."""
@@ -137,6 +157,11 @@ def log(self, scheduler_stats: SchedulerStats,
             self.histogram_num_generation_tokens_request.observe(
                 finished_request.num_generation_tokens)
 
+        for ttft in iteration_stats.time_to_first_tokens_iter:
+            self.histogram_time_to_first_token.observe(ttft)
+        for tpot in iteration_stats.time_per_output_tokens_iter:
+            self.histogram_time_per_output_token.observe(tpot)
+
     @staticmethod
     def _unregister_vllm_metrics():
         # Unregister any existing vLLM collectors (for CI/CD
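A note on the histograms registered above: prometheus_client expands each
Histogram into the _sum, _bucket and _count series that the test additions
assert on. A minimal standalone sketch of inspecting those series, assuming
only prometheus_client (the "model_name" label and its "dummy-model" value
are illustrative stand-ins for the labelnames/labelvalues used in loggers.py):

    import prometheus_client

    # Same name and buckets as the TTFT histogram added in loggers.py above.
    ttft = prometheus_client.Histogram(
        name="vllm:time_to_first_token_seconds",
        documentation="Histogram of time to first token in seconds.",
        buckets=[
            0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
            0.75, 1.0, 2.5, 5.0, 7.5, 10.0
        ],
        labelnames=["model_name"]).labels("dummy-model")

    ttft.observe(0.042)  # one 42 ms time-to-first-token sample

    # Prints vllm:time_to_first_token_seconds_sum/_bucket/_count in the
    # exposition format served by the /metrics endpoint.
    print(prometheus_client.generate_latest().decode())
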
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 55d85a7992cc5..f4c276f0b6902 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -1,3 +1,4 @@
+import time
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, List
 
@@ -22,6 +23,7 @@ class RequestStateStats:
     """Stats that need to be tracked across delta updates."""
 
     num_generation_tokens: int = 0
+    last_token_time: float = 0.0
 
 
 @dataclass
@@ -40,6 +42,8 @@ def __init__(self, log_stats: bool):
         self.num_generation_tokens = 0
         self.num_prompt_tokens = 0
         self.finished_requests: List[FinishedRequestStats] = []
+        self.time_to_first_tokens_iter: List[float] = []
+        self.time_per_output_tokens_iter: List[float] = []
 
     def update_from_output(self, output: "EngineCoreOutput",
                            is_prefilling: bool, prompt_len: int,
@@ -48,6 +52,8 @@ def update_from_output(self, output: "EngineCoreOutput",
             return
 
         num_new_generation_tokens = len(output.new_token_ids)
+        now = time.time()
+        last_token_latency = now - request_state_stats.last_token_time
 
         self.num_generation_tokens += num_new_generation_tokens
         if is_prefilling:
@@ -58,7 +64,12 @@ def update_from_output(self, output: "EngineCoreOutput",
             assert (num_new_generation_tokens > 0)
             self.num_prompt_tokens += prompt_len
 
+            self.time_to_first_tokens_iter.append(last_token_latency)
+        else:
+            self.time_per_output_tokens_iter.append(last_token_latency)
+
         request_state_stats.num_generation_tokens += num_new_generation_tokens
+        request_state_stats.last_token_time = now
 
     def update_from_finished_request(self, request_output: "RequestOutput",
                                      request_state_stats: RequestStateStats):
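For reference, a self-contained sketch (illustrative, not vLLM code) of the
timing scheme the stats changes implement: last_token_time is seeded with the
request's arrival time, so the first measured gap is the time to first token
(TTFT) and every later gap is a time-per-output-token (TPOT) sample:

    import time

    class RequestTimer:
        """Mirrors the RequestStateStats bookkeeping in the patch."""

        def __init__(self, arrival_time: float):
            # Seeding with arrival_time makes the first measured gap
            # span queueing plus prefill, i.e. the TTFT.
            self.last_token_time = arrival_time
            self.is_prefilling = True
            self.ttfts: list = []
            self.tpots: list = []

        def on_new_tokens(self) -> None:
            now = time.time()
            latency = now - self.last_token_time
            if self.is_prefilling:
                self.ttfts.append(latency)  # arrival -> first token
                self.is_prefilling = False
            else:
                self.tpots.append(latency)  # gap between decode steps
            self.last_token_time = now

    timer = RequestTimer(arrival_time=time.time())
    timer.on_new_tokens()  # after prefill: records a TTFT sample
    timer.on_new_tokens()  # after each decode step: records a TPOT sample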