[V1][Metrics] Add GPU cache usage % gauge (#12561)
Signed-off-by: Mark McLoughlin <[email protected]>
markmc authored Jan 30, 2025
1 parent 1c1bb0b commit f17f1d4
Showing 5 changed files with 18 additions and 2 deletions.
1 change: 1 addition & 0 deletions tests/entrypoints/openai/test_metrics.py
@@ -200,6 +200,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
EXPECTED_METRICS_V1 = [
    "vllm:num_requests_running",
    "vllm:num_requests_waiting",
+    "vllm:gpu_cache_usage_perc",
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",
    "vllm:request_prompt_tokens_sum",
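For illustration, a minimal sketch (not the actual test) of how the new gauge can be checked by scraping a running server's /metrics endpoint; the helper name, host, and port are assumptions:

# Sketch only: scrape /metrics from a running vLLM OpenAI-compatible server
# and return the reported GPU KV-cache usage. Host/port are placeholders.
import requests

def read_gpu_cache_usage(base_url: str = "http://localhost:8000") -> float:
    text = requests.get(f"{base_url}/metrics").text
    for line in text.splitlines():
        if line.startswith("vllm:gpu_cache_usage_perc"):
            # e.g. vllm:gpu_cache_usage_perc{model_name="..."} 0.125
            return float(line.rsplit(" ", 1)[1])
    raise AssertionError("vllm:gpu_cache_usage_perc not found in /metrics")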
5 changes: 5 additions & 0 deletions vllm/v1/core/kv_cache_manager.py
@@ -69,6 +69,11 @@ def __init__(
        # is finished.
        self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {}

+    @property
+    def usage(self) -> float:
+        return 1.0 - (self.free_block_queue.num_free_blocks /
+                      self.num_gpu_blocks)
+
    def get_computed_blocks(
            self, request: Request) -> Tuple[List[KVCacheBlock], int]:
        """Get the computed (cached) blocks for the request.
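The new usage property reports the fraction of GPU KV-cache blocks currently in use. A self-contained illustration of the same arithmetic, with made-up numbers:

# Illustration only: usage = 1 - free_blocks / total_blocks.
num_gpu_blocks = 1000    # hypothetical total KV-cache blocks
num_free_blocks = 875    # hypothetical free blocks
usage = 1.0 - (num_free_blocks / num_gpu_blocks)
assert abs(usage - 0.125) < 1e-9   # i.e. 12.5% of blocks are allocated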
1 change: 1 addition & 0 deletions vllm/v1/core/scheduler.py
@@ -544,6 +544,7 @@ def make_stats(self) -> SchedulerStats:
        return SchedulerStats(
            num_running_reqs=len(self.running),
            num_waiting_reqs=len(self.waiting),
+            gpu_cache_usage=self.kv_cache_manager.usage,
        )


11 changes: 10 additions & 1 deletion vllm/v1/metrics/loggers.py
@@ -69,11 +69,13 @@ def log(self, scheduler_stats: SchedulerStats,
        logger.info(
            "Avg prompt throughput: %.1f tokens/s, "
            "Avg generation throughput: %.1f tokens/s, "
-            "Running: %d reqs, Waiting: %d reqs ",
+            "Running: %d reqs, Waiting: %d reqs "
+            "GPU KV cache usage: %.1f%%.",
            prompt_throughput,
            generation_throughput,
            scheduler_stats.num_running_reqs,
            scheduler_stats.num_waiting_reqs,
+            scheduler_stats.gpu_cache_usage * 100,
        )


@@ -97,6 +99,11 @@ def __init__(self, model_config: ModelConfig):
            documentation="Number of requests waiting to be processed.",
            labelnames=labelnames).labels(*labelvalues)

+        self.gauge_gpu_cache_usage = prometheus_client.Gauge(
+            name="vllm:gpu_cache_usage_perc",
+            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
+            labelnames=labelnames).labels(*labelvalues)
+
        self.counter_prompt_tokens = prometheus_client.Counter(
            name="vllm:prompt_tokens_total",
            documentation="Number of prefill tokens processed.",
@@ -147,6 +154,8 @@ def log(self, scheduler_stats: SchedulerStats,
        self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
        self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)

+        self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
+
        self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
        self.counter_generation_tokens.inc(
            iteration_stats.num_generation_tokens)
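For readers unfamiliar with prometheus_client, a rough sketch of the pattern used above: the gauge is created once with model-name labels, then set on each logging interval. The label value and usage number here are placeholders:

# Sketch of the prometheus_client Gauge pattern; values are placeholders.
import prometheus_client

gauge_gpu_cache_usage = prometheus_client.Gauge(
    name="vllm:gpu_cache_usage_perc",
    documentation="GPU KV-cache usage. 1 means 100 percent usage.",
    labelnames=["model_name"]).labels("example-model")

gauge_gpu_cache_usage.set(0.125)  # updated with the latest stats snapshot

# generate_latest() renders the exposition text served by /metrics.
print(prometheus_client.generate_latest().decode())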
2 changes: 1 addition & 1 deletion vllm/v1/metrics/stats.py
@@ -14,7 +14,7 @@ class SchedulerStats:
    num_running_reqs: int = 0
    num_waiting_reqs: int = 0

-    # gpu_cache_usage: float = 0.0
+    gpu_cache_usage: float = 0.0
    # gpu_prefix_cache_hit_rate: float = 0.0


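Putting the pieces together, a hypothetical end-to-end sketch of how the field travels from the scheduler to a logger; the stub make_stats helper mirrors Scheduler.make_stats() but is not the real implementation:

# Hypothetical sketch: the scheduler snapshots KV-cache usage into
# SchedulerStats, and the loggers read the field when emitting metrics.
from dataclasses import dataclass

@dataclass
class SchedulerStats:
    num_running_reqs: int = 0
    num_waiting_reqs: int = 0
    gpu_cache_usage: float = 0.0

def make_stats(running, waiting, kv_cache_usage):
    # In vLLM, kv_cache_usage would come from KVCacheManager.usage.
    return SchedulerStats(num_running_reqs=len(running),
                          num_waiting_reqs=len(waiting),
                          gpu_cache_usage=kv_cache_usage)

stats = make_stats(running=["req-1"], waiting=[], kv_cache_usage=0.125)
print(f"GPU KV cache usage: {stats.gpu_cache_usage * 100:.1f}%")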
