[V1][Metrics] Add TTFT and TPOT histograms (vllm-project#12530)
Signed-off-by: Mark McLoughlin <[email protected]>
markmc authored and rasmith committed Jan 30, 2025
1 parent 90f2a75 commit 52e3d69
Showing 4 changed files with 45 additions and 1 deletion.
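This commit adds two latency histograms to the V1 engine's Prometheus metrics: `vllm:time_to_first_token_seconds` (TTFT, the delay between a request's arrival and its first generated token) and `vllm:time_per_output_token_seconds` (TPOT, the gap between successive generation outputs). As a quick illustration of how the new series can be consumed — assuming a vLLM server exposing `/metrics` on `localhost:8000`, which is not part of this commit — the mean TTFT can be recovered from the exported `_sum` and `_count` samples:

```python
# A minimal sketch (hypothetical endpoint) that scrapes a running vLLM
# server and computes the mean TTFT from the new histogram's samples.
import urllib.request

from prometheus_client.parser import text_string_to_metric_families

text = urllib.request.urlopen("http://localhost:8000/metrics").read().decode()

ttft_sum = ttft_count = 0.0
for family in text_string_to_metric_families(text):
    if family.name == "vllm:time_to_first_token_seconds":
        for sample in family.samples:
            if sample.name.endswith("_sum"):
                ttft_sum += sample.value
            elif sample.name.endswith("_count"):
                ttft_count += sample.value

if ttft_count:
    print(f"mean TTFT: {ttft_sum / ttft_count:.3f}s")
```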
6 changes: 6 additions & 0 deletions tests/entrypoints/openai/test_metrics.py
@@ -208,6 +208,12 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:request_generation_tokens_sum",
     "vllm:request_generation_tokens_bucket",
     "vllm:request_generation_tokens_count",
+    "vllm:time_to_first_token_seconds_sum",
+    "vllm:time_to_first_token_seconds_bucket",
+    "vllm:time_to_first_token_seconds_count",
+    "vllm:time_per_output_token_seconds_sum",
+    "vllm:time_per_output_token_seconds_bucket",
+    "vllm:time_per_output_token_seconds_count",
 ]


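The new entries extend the list of series names that `test_metrics_counts` expects to find in the server's `/metrics` output (the assertion itself is outside this hunk). A standalone equivalent of that presence check might look like the following sketch; `EXPECTED` and `assert_metrics_exposed` are illustrative names, not the test's actual helpers:

```python
# Sketch of the presence check: every expected series name should occur
# somewhere in the scraped Prometheus text payload. Names here mirror the
# entries added to the test's expected-metrics list above.
EXPECTED = [
    "vllm:time_to_first_token_seconds_sum",
    "vllm:time_to_first_token_seconds_bucket",
    "vllm:time_to_first_token_seconds_count",
    "vllm:time_per_output_token_seconds_sum",
    "vllm:time_per_output_token_seconds_bucket",
    "vllm:time_per_output_token_seconds_count",
]

def assert_metrics_exposed(metrics_text: str) -> None:
    for name in EXPECTED:
        assert name in metrics_text, f"missing metric: {name}"
```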
4 changes: 3 additions & 1 deletion vllm/v1/engine/output_processor.py
@@ -27,6 +27,7 @@ def __init__(
         prompt: Optional[str],
         prompt_token_ids: List[int],
         detokenizer: IncrementalDetokenizer,
+        arrival_time: float,
         queue: Optional[asyncio.Queue[RequestOutput]],
     ):
         self.request_id = request_id
@@ -37,7 +38,7 @@ def __init__(
         self.is_prefilling = True
         self.queue = queue
 
-        self.stats = RequestStateStats()
+        self.stats = RequestStateStats(last_token_time=arrival_time)
 
     @classmethod
     def from_new_request(
@@ -54,6 +55,7 @@ def from_new_request(
                 tokenizer=tokenizer,
                 request=request,
             ),
+            arrival_time=request.arrival_time,
             queue=queue,
         )

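Threading `arrival_time` through `RequestState` and seeding `RequestStateStats.last_token_time` with it is what turns the first latency sample into a true TTFT: the clock starts when the request arrives, not when stats are first updated. A toy, self-contained illustration of the seeding (hypothetical variable names):

```python
# Toy model of the seeding: last_token_time starts at arrival, so the first
# observed gap is arrival -> first token (TTFT); afterwards it becomes the
# token -> token gap (TPOT).
import time

arrival_time = time.time()        # request accepted by the server
last_token_time = arrival_time    # seeded exactly as in RequestStateStats

time.sleep(0.05)                  # prefill runs before the first token
now = time.time()
ttft = now - last_token_time      # ~0.05s: time to first token
last_token_time = now             # subsequent gaps are per-token latencies
```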
25 changes: 25 additions & 0 deletions vllm/v1/metrics/loggers.py
@@ -121,6 +121,26 @@ def __init__(self, model_config: ModelConfig):
             buckets=build_1_2_5_buckets(max_model_len),
             labelnames=labelnames).labels(*labelvalues)
 
+        self.histogram_time_to_first_token = \
+            prometheus_client.Histogram(
+                name="vllm:time_to_first_token_seconds",
+                documentation="Histogram of time to first token in seconds.",
+                buckets=[
+                    0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
+                    0.75, 1.0, 2.5, 5.0, 7.5, 10.0
+                ],
+                labelnames=labelnames).labels(*labelvalues)
+
+        self.histogram_time_per_output_token = \
+            prometheus_client.Histogram(
+                name="vllm:time_per_output_token_seconds",
+                documentation="Histogram of time per output token in seconds.",
+                buckets=[
+                    0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5,
+                    0.75, 1.0, 2.5
+                ],
+                labelnames=labelnames).labels(*labelvalues)
+
     def log(self, scheduler_stats: SchedulerStats,
             iteration_stats: IterationStats):
         """Log to prometheus."""
@@ -137,6 +157,11 @@ def log(self, scheduler_stats: SchedulerStats,
             self.histogram_num_generation_tokens_request.observe(
                 finished_request.num_generation_tokens)
 
+        for ttft in iteration_stats.time_to_first_tokens_iter:
+            self.histogram_time_to_first_token.observe(ttft)
+        for tpot in iteration_stats.time_per_output_tokens_iter:
+            self.histogram_time_per_output_token.observe(tpot)
+
     @staticmethod
     def _unregister_vllm_metrics():
         # Unregister any existing vLLM collectors (for CI/CD
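Both histograms follow the pattern already used in this logger: declare a `prometheus_client.Histogram` once with explicit buckets and the shared label names, then bind it to this server's label values via `.labels()` so `observe()` can be called directly. A minimal standalone sketch of that pattern (demo metric name and label; the real metrics use the `vllm:` names and buckets shown above):

```python
# Minimal sketch of the declare-once, bind-labels, observe pattern.
import prometheus_client

hist = prometheus_client.Histogram(
    name="demo:time_to_first_token_seconds",
    documentation="Histogram of time to first token in seconds.",
    buckets=[0.01, 0.1, 1.0, 10.0],
    labelnames=["model_name"],
).labels("my-model")

hist.observe(0.042)  # one TTFT sample; falls into the 0.1 bucket
print(prometheus_client.generate_latest().decode())
```

Note that the two bucket ranges differ: TTFT buckets run from 1 ms up to 10 s, while TPOT buckets stop at 2.5 s, matching the scales on which each latency typically varies.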
11 changes: 11 additions & 0 deletions vllm/v1/metrics/stats.py
@@ -1,3 +1,4 @@
+import time
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, List
 
@@ -22,6 +23,7 @@ class RequestStateStats:
     """Stats that need to be tracked across delta updates."""
 
     num_generation_tokens: int = 0
+    last_token_time: float = 0.0
 
 
 @dataclass
@@ -40,6 +42,8 @@ def __init__(self, log_stats: bool):
         self.num_generation_tokens = 0
         self.num_prompt_tokens = 0
         self.finished_requests: List[FinishedRequestStats] = []
+        self.time_to_first_tokens_iter: List[float] = []
+        self.time_per_output_tokens_iter: List[float] = []
 
     def update_from_output(self, output: "EngineCoreOutput",
                            is_prefilling: bool, prompt_len: int,
@@ -48,6 +52,8 @@ def update_from_output(self, output: "EngineCoreOutput",
             return
 
         num_new_generation_tokens = len(output.new_token_ids)
+        now = time.time()
+        last_token_latency = now - request_state_stats.last_token_time
 
         self.num_generation_tokens += num_new_generation_tokens
         if is_prefilling:
@@ -58,7 +64,12 @@ def update_from_output(self, output: "EngineCoreOutput",
             assert (num_new_generation_tokens > 0)
             self.num_prompt_tokens += prompt_len
 
+            self.time_to_first_tokens_iter.append(last_token_latency)
+        else:
+            self.time_per_output_tokens_iter.append(last_token_latency)
+
         request_state_stats.num_generation_tokens += num_new_generation_tokens
+        request_state_stats.last_token_time = now
 
     def update_from_finished_request(self, request_output: "RequestOutput",
                                      request_state_stats: RequestStateStats):
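The bookkeeping in `update_from_output` is the core of the change: each engine output measures the gap since `last_token_time`; if the request was still prefilling, that gap is recorded as a TTFT sample, otherwise as a TPOT sample, and the clock is then advanced to `now`. A condensed, self-contained re-creation of that flow (hypothetical helper names):

```python
# Condensed re-creation of the update_from_output bookkeeping: the first
# post-prefill output yields a TTFT sample, later outputs yield TPOT samples.
import time

class RequestStateStats:
    def __init__(self, last_token_time: float):
        self.last_token_time = last_token_time

def record(stats: RequestStateStats, is_prefilling: bool,
           ttfts: list, tpots: list) -> None:
    now = time.time()
    latency = now - stats.last_token_time
    (ttfts if is_prefilling else tpots).append(latency)
    stats.last_token_time = now      # advance the clock for the next gap

ttfts, tpots = [], []
stats = RequestStateStats(last_token_time=time.time())       # arrival time
time.sleep(0.02)
record(stats, is_prefilling=True, ttfts=ttfts, tpots=tpots)  # TTFT sample
time.sleep(0.01)
record(stats, is_prefilling=False, ttfts=ttfts, tpots=tpots) # TPOT sample
print(f"TTFT ~ {ttfts[0]:.3f}s, TPOT ~ {tpots[0]:.3f}s")
```

One consequence visible in the diff: a sample is recorded per engine output, so an output carrying several new tokens contributes a single TPOT observation for the whole step.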
