Skip to content

Commit 15cb989

Browse files
authored
TensorRT-LLM backend update (#731)
1 parent 6c88297 commit 15cb989

File tree

7 files changed

+84
-6
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -707,6 +707,7 @@ similar to the following (assuming your model is an inflight batcher model):
707707
```bash
708708
# HELP nv_trt_llm_request_metrics TRT LLM request metrics
709709
# TYPE nv_trt_llm_request_metrics gauge
710+
nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="waiting",version="1"} 1
710711
nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="context",version="1"} 1
711712
nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="scheduled",version="1"} 1
712713
nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="max",version="1"} 512
@@ -718,6 +719,7 @@ nv_trt_llm_runtime_memory_metrics{memory_type="gpu",model="tensorrt_llm",version
718719
nv_trt_llm_runtime_memory_metrics{memory_type="cpu",model="tensorrt_llm",version="1"} 0
719720
# HELP nv_trt_llm_kv_cache_block_metrics TRT LLM KV cache block metrics
720721
# TYPE nv_trt_llm_kv_cache_block_metrics gauge
722+
nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="fraction",model="tensorrt_llm",version="1"} 0.4875
721723
nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="tokens_per",model="tensorrt_llm",version="1"} 64
722724
nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="used",model="tensorrt_llm",version="1"} 1
723725
nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="free",model="tensorrt_llm",version="1"} 6239

all_models/inflight_batcher_llm/tensorrt_llm/1/model.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1149,6 +1149,11 @@ def create_metrics(self, model: str, version: str, is_v1_model: bool):
11491149
"request_type": "context",
11501150
**common_labels
11511151
}),
1152+
"num_waiting_requests":
1153+
self.request_metric_family.Metric(labels={
1154+
"request_type": "waiting",
1155+
**common_labels
1156+
}),
11521157
# Runtime metrics
11531158
"cpu_mem_usage":
11541159
self.runtime_memory_metric_family.Metric(labels={
@@ -1186,6 +1191,11 @@ def create_metrics(self, model: str, version: str, is_v1_model: bool):
11861191
"kv_cache_block_type": "tokens_per",
11871192
**common_labels
11881193
}),
1194+
"fraction_used_blocks":
1195+
self.kv_cache_metric_family.Metric(labels={
1196+
"kv_cache_block_type": "fraction",
1197+
**common_labels
1198+
}),
11891199
# General metrics
11901200
"timestamp":
11911201
self.general_metric_family.Metric(labels={
@@ -1493,12 +1503,61 @@ def update_metrics_per_request(self, req_id):
14931503
self.all_metrics[METRIC_TOTAL_OUTPUT_TOKENS].observe(output_tokens)
14941504
self.all_metrics[METRIC_TOTAL_INPUT_TOKENS].observe(input_tokens)
14951505

1506+
def get_composite_metric_map(self, stat):
1507+
1508+
def get_metric(metric_name, family_stats=None):
1509+
if family_stats is None:
1510+
if hasattr(stat, metric_name):
1511+
return getattr(stat, metric_name)
1512+
elif stat.kv_cache_stats is not None and hasattr(
1513+
stat.kv_cache_stats, metric_name):
1514+
return getattr(stat.kv_cache_stats, metric_name)
1515+
elif stat.static_batching_stats is not None and hasattr(
1516+
stat.static_batching_stats, metric_name):
1517+
return getattr(stat.static_batching_stats, metric_name)
1518+
elif stat.inflight_batching_stats is not None and hasattr(
1519+
stat.inflight_batching_stats, metric_name):
1520+
return getattr(stat.inflight_batching_stats, metric_name)
1521+
elif family_stats is not None and hasattr(family_stats,
1522+
metric_name):
1523+
return getattr(family_stats, metric_name)
1524+
pb_utils.Logger.log_warn(
1525+
f"Constituent metric \"{metric_name}\" not found.")
1526+
return None
1527+
1528+
composite_metrics = {}
1529+
1530+
# compute fraction_used_blocks
1531+
max_blocks = get_metric("max_num_blocks", stat.kv_cache_stats)
1532+
used_blocks = get_metric("used_num_blocks", stat.kv_cache_stats)
1533+
if max_blocks is not None and used_blocks is not None:
1534+
composite_metrics[
1535+
"fraction_used_blocks"] = 0.0 if max_blocks <= 0 else used_blocks / max_blocks
1536+
else:
1537+
pb_utils.Logger.log_warn(
1538+
f"fraction_used_blocks is missing one or more constituent metric."
1539+
)
1540+
1541+
# compute num_waiting_requests
1542+
active_requests = get_metric("num_active_requests")
1543+
scheduled_requests = get_metric("num_scheduled_requests")
1544+
if active_requests is not None and scheduled_requests is not None:
1545+
composite_metrics[
1546+
"num_waiting_requests"] = active_requests - scheduled_requests
1547+
else:
1548+
pb_utils.Logger.log_warn(
1549+
f"num_waiting_requests is missing one or more constituent metric."
1550+
)
1551+
1552+
return composite_metrics
1553+
14961554
def metrics_loop(self):
14971555
"""Updates triton metrics using stats from the executor."""
14981556
while self.running:
14991557
time.sleep(self.stats_check_period_ms / 1000.0)
15001558
for stat in self.executor.get_latest_iteration_stats():
15011559
try:
1560+
composite_metrics = self.get_composite_metric_map(stat)
15021561
for key, metric in self.all_metrics.items():
15031562
# Skip processing for both histogram metrics
15041563
if isinstance(key, str) and key in [
@@ -1518,6 +1577,8 @@ def metrics_loop(self):
15181577
elif stat.inflight_batching_stats is not None and hasattr(
15191578
stat.inflight_batching_stats, key):
15201579
value = getattr(stat.inflight_batching_stats, key)
1580+
elif key in composite_metrics:
1581+
value = composite_metrics[key]
15211582
if value is not None:
15221583
if key == "timestamp":
15231584
value = convert_timestamp_to_seconds(value)

ci/L0_backend_trtllm/custom_metrics_verification_tests.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
"request_type=scheduled": "Scheduled Requests",
3838
"request_type=max": "Max Request Count",
3939
"request_type=active": "Active Request Count",
40+
"request_type=waiting": "Waiting Requests",
4041
"memory_type=pinned": "Runtime Pinned Memory Usage",
4142
"memory_type=gpu": "Runtime GPU Memory Usage",
4243
"memory_type=cpu": "Runtime CPU Memory Usage",
@@ -45,6 +46,7 @@
4546
"kv_cache_block_type=free": "Free KV cache blocks",
4647
"kv_cache_block_type=max": "Max KV cache blocks",
4748
"kv_cache_block_type=reused": "Reused KV cache blocks",
49+
"kv_cache_block_type=fraction": "Fraction used KV cache blocks",
4850
"inflight_batcher_specific_metric=total_context_tokens":
4951
"Total Context Tokens",
5052
"inflight_batcher_specific_metric=micro_batch_id": "MicroBatch ID",

inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,18 @@ namespace triton::backend::inflight_batcher_llm::custom_metrics_reporter
3535
{
3636

3737
const std::vector<std::string> CustomMetricsReporter::request_keys_{
38-
"Active Request Count", "Max Request Count", "Scheduled Requests", "Context Requests"};
39-
const std::vector<std::string> CustomMetricsReporter::request_labels_{"active", "max", "scheduled", "context"};
38+
"Active Request Count", "Max Request Count", "Scheduled Requests", "Context Requests", "Waiting Requests"};
39+
const std::vector<std::string> CustomMetricsReporter::request_labels_{
40+
"active", "max", "scheduled", "context", "waiting"};
4041

4142
const std::vector<std::string> CustomMetricsReporter::runtime_memory_keys_{
4243
"Runtime CPU Memory Usage", "Runtime GPU Memory Usage", "Runtime Pinned Memory Usage"};
4344
const std::vector<std::string> CustomMetricsReporter::runtime_memory_labels_{"cpu", "gpu", "pinned"};
4445

4546
const std::vector<std::string> CustomMetricsReporter::kv_cache_keys_{"Max KV cache blocks", "Free KV cache blocks",
46-
"Used KV cache blocks", "Tokens per KV cache block", "Reused KV cache blocks"};
47-
const std::vector<std::string> CustomMetricsReporter::kv_cache_labels_{"max", "free", "used", "tokens_per", "reused"};
47+
"Used KV cache blocks", "Tokens per KV cache block", "Reused KV cache blocks", "Fraction used KV cache blocks"};
48+
const std::vector<std::string> CustomMetricsReporter::kv_cache_labels_{
49+
"max", "free", "used", "tokens_per", "reused", "fraction"};
4850

4951
const std::vector<std::string> CustomMetricsReporter::dis_serving_keys_{"KV cache transfer time", "Request count"};
5052
const std::vector<std::string> CustomMetricsReporter::dis_serving_labels_{"kv_cache_transfer_ms", "request_count"};

inflight_batcher_llm/src/model_instance_state.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1426,6 +1426,8 @@ void ModelInstanceState::WaitForStats()
14261426
statJson.append("\"Paused Requests\":" + std::to_string(modelStats.numPausedRequests) + ",");
14271427
statJson.append("\"Scheduled Requests\":" + std::to_string(modelStats.numScheduledRequests) + ",");
14281428
statJson.append("\"Total Context Tokens\":" + std::to_string(modelStats.numCtxTokens) + ",");
1429+
statJson.append("\"Waiting Requests\":"
1430+
+ std::to_string(stat.numActiveRequests - modelStats.numScheduledRequests) + ",");
14291431
}
14301432
else if (stat.staticBatchingStats.has_value())
14311433
{
@@ -1435,6 +1437,8 @@ void ModelInstanceState::WaitForStats()
14351437
statJson.append("\"Total Context Tokens\":" + std::to_string(modelStats.numCtxTokens) + ",");
14361438
statJson.append("\"Total Generation Tokens\":" + std::to_string(modelStats.numGenTokens) + ",");
14371439
statJson.append("\"Empty Generation Slots\":" + std::to_string(modelStats.emptyGenSlots) + ",");
1440+
statJson.append("\"Waiting Requests\":"
1441+
+ std::to_string(stat.numActiveRequests - modelStats.numScheduledRequests) + ",");
14381442
}
14391443
else
14401444
{
@@ -1450,6 +1454,13 @@ void ModelInstanceState::WaitForStats()
14501454
statJson.append("\"Tokens per KV cache block\":" + std::to_string(kvStats.tokensPerBlock) + ",");
14511455
statJson.append("\"Used KV cache blocks\":" + std::to_string(kvStats.usedNumBlocks) + ",");
14521456
statJson.append("\"Reused KV cache blocks\":" + std::to_string(kvStats.reusedBlocks) + ",");
1457+
// Calculate and append the used KV cache block fraction.
1458+
double fraction = 0.0;
1459+
if (static_cast<double>(kvStats.maxNumBlocks) > 0.0)
1460+
{
1461+
fraction = static_cast<double>(kvStats.usedNumBlocks) / static_cast<double>(kvStats.maxNumBlocks);
1462+
}
1463+
statJson.append("\"Fraction used KV cache blocks\":" + std::to_string(fraction) + ",");
14531464
}
14541465

14551466
// requestStats is a list where each item is associated with an iteration,

tensorrt_llm

tools/version.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2ec76b5a2868c70e6a9b5a1b9d73dc6ea84ca6de
1+
087f498586bab566e8bee1c87392cc85af4bc819

0 commit comments

Comments (0)