Skip to content

Commit 15cb989

Browse files
authored
TensorRT-LLM backend update (#731)
1 parent 6c88297 commit 15cb989

File tree

7 files changed

+84
-6
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -707,6 +707,7 @@ similar to the following (assuming your model is an inflight batcher model):
707707
```bash
708708
# HELP nv_trt_llm_request_metrics TRT LLM request metrics
709709
# TYPE nv_trt_llm_request_metrics gauge
710+
nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="waiting",version="1"} 1
710711
nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="context",version="1"} 1
711712
nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="scheduled",version="1"} 1
712713
nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="max",version="1"} 512
@@ -718,6 +719,7 @@ nv_trt_llm_runtime_memory_metrics{memory_type="gpu",model="tensorrt_llm",version
718719
nv_trt_llm_runtime_memory_metrics{memory_type="cpu",model="tensorrt_llm",version="1"} 0
719720
# HELP nv_trt_llm_kv_cache_block_metrics TRT LLM KV cache block metrics
720721
# TYPE nv_trt_llm_kv_cache_block_metrics gauge
722+
nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="fraction",model="tensorrt_llm",version="1"} 0.4875
721723
nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="tokens_per",model="tensorrt_llm",version="1"} 64
722724
nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="used",model="tensorrt_llm",version="1"} 1
723725
nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="free",model="tensorrt_llm",version="1"} 6239

all_models/inflight_batcher_llm/tensorrt_llm/1/model.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1149,6 +1149,11 @@ def create_metrics(self, model: str, version: str, is_v1_model: bool):
11491149
"request_type": "context",
11501150
**common_labels
11511151
}),
1152+
"num_waiting_requests":
1153+
self.request_metric_family.Metric(labels={
1154+
"request_type": "waiting",
1155+
**common_labels
1156+
}),
11521157
# Runtime metrics
11531158
"cpu_mem_usage":
11541159
self.runtime_memory_metric_family.Metric(labels={
@@ -1186,6 +1191,11 @@ def create_metrics(self, model: str, version: str, is_v1_model: bool):
11861191
"kv_cache_block_type": "tokens_per",
11871192
**common_labels
11881193
}),
1194+
"fraction_used_blocks":
1195+
self.kv_cache_metric_family.Metric(labels={
1196+
"kv_cache_block_type": "fraction",
1197+
**common_labels
1198+
}),
11891199
# General metrics
11901200
"timestamp":
11911201
self.general_metric_family.Metric(labels={
@@ -1493,12 +1503,61 @@ def update_metrics_per_request(self, req_id):
14931503
self.all_metrics[METRIC_TOTAL_OUTPUT_TOKENS].observe(output_tokens)
14941504
self.all_metrics[METRIC_TOTAL_INPUT_TOKENS].observe(input_tokens)
14951505

1506+
def get_composite_metric_map(self, stat):
1507+
1508+
def get_metric(metric_name, family_stats=None):
1509+
if family_stats is None:
1510+
if hasattr(stat, metric_name):
1511+
return getattr(stat, metric_name)
1512+
elif stat.kv_cache_stats is not None and hasattr(
1513+
stat.kv_cache_stats, metric_name):
1514+
return getattr(stat.kv_cache_stats, metric_name)
1515+
elif stat.static_batching_stats is not None and hasattr(
1516+
stat.static_batching_stats, metric_name):
1517+
return getattr(stat.static_batching_stats, metric_name)
1518+
elif stat.inflight_batching_stats is not None and hasattr(
1519+
stat.inflight_batching_stats, metric_name):
1520+
return getattr(stat.inflight_batching_stats, metric_name)
1521+
elif family_stats is not None and hasattr(family_stats,
1522+
metric_name):
1523+
return getattr(family_stats, metric_name)
1524+
pb_utils.Logger.log_warn(
1525+
f"Constituent metric \"{metric_name}\" not found.")
1526+
return None
1527+
1528+
composite_metrics = {}
1529+
1530+
# compute fraction_used_blocks
1531+
max_blocks = get_metric("max_num_blocks", stat.kv_cache_stats)
1532+
used_blocks = get_metric("used_num_blocks", stat.kv_cache_stats)
1533+
if max_blocks is not None and used_blocks is not None:
1534+
composite_metrics[
1535+
"fraction_used_blocks"] = 0.0 if max_blocks <= 0 else used_blocks / max_blocks
1536+
else:
1537+
pb_utils.Logger.log_warn(
1538+
f"fraction_used_blocks is missing one or more constituent metric."
1539+
)
1540+
1541+
# compute num_waiting_requests
1542+
active_requests = get_metric("num_active_requests")
1543+
scheduled_requests = get_metric("num_scheduled_requests")
1544+
if active_requests is not None and scheduled_requests is not None:
1545+
composite_metrics[
1546+
"num_waiting_requests"] = active_requests - scheduled_requests
1547+
else:
1548+
pb_utils.Logger.log_warn(
1549+
f"num_waiting_requests is missing one or more constituent metric."
1550+
)
1551+
1552+
return composite_metrics
1553+
14961554
def metrics_loop(self):
14971555
"""Updates triton metrics using stats from the executor."""
14981556
while self.running:
14991557
time.sleep(self.stats_check_period_ms / 1000.0)
15001558
for stat in self.executor.get_latest_iteration_stats():
15011559
try:
1560+
composite_metrics = self.get_composite_metric_map(stat)
15021561
for key, metric in self.all_metrics.items():
15031562
# Skip processing for both histogram metrics
15041563
if isinstance(key, str) and key in [
@@ -1518,6 +1577,8 @@ def metrics_loop(self):
15181577
elif stat.inflight_batching_stats is not None and hasattr(
15191578
stat.inflight_batching_stats, key):
15201579
value = getattr(stat.inflight_batching_stats, key)
1580+
elif key in composite_metrics:
1581+
value = composite_metrics[key]
15211582
if value is not None:
15221583
if key == "timestamp":
15231584
value = convert_timestamp_to_seconds(value)

ci/L0_backend_trtllm/custom_metrics_verification_tests.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
"request_type=scheduled": "Scheduled Requests",
3838
"request_type=max": "Max Request Count",
3939
"request_type=active": "Active Request Count",
40+
"request_type=waiting": "Waiting Requests",
4041
"memory_type=pinned": "Runtime Pinned Memory Usage",
4142
"memory_type=gpu": "Runtime GPU Memory Usage",
4243
"memory_type=cpu": "Runtime CPU Memory Usage",
@@ -45,6 +46,7 @@
4546
"kv_cache_block_type=free": "Free KV cache blocks",
4647
"kv_cache_block_type=max": "Max KV cache blocks",
4748
"kv_cache_block_type=reused": "Reused KV cache blocks",
49+
"kv_cache_block_type=fraction": "Fraction used KV cache blocks",
4850
"inflight_batcher_specific_metric=total_context_tokens":
4951
"Total Context Tokens",
5052
"inflight_batcher_specific_metric=micro_batch_id": "MicroBatch ID",

inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,18 @@ namespace triton::backend::inflight_batcher_llm::custom_metrics_reporter
3535
{
3636

3737
const std::vector<std::string> CustomMetricsReporter::request_keys_{
38-
"Active Request Count", "Max Request Count", "Scheduled Requests", "Context Requests"};
39-
const std::vector<std::string> CustomMetricsReporter::request_labels_{"active", "max", "scheduled", "context"};
38+
"Active Request Count", "Max Request Count", "Scheduled Requests", "Context Requests", "Waiting Requests"};
39+
const std::vector<std::string> CustomMetricsReporter::request_labels_{
40+
"active", "max", "scheduled", "context", "waiting"};
4041

4142
const std::vector<std::string> CustomMetricsReporter::runtime_memory_keys_{
4243
"Runtime CPU Memory Usage", "Runtime GPU Memory Usage", "Runtime Pinned Memory Usage"};
4344
const std::vector<std::string> CustomMetricsReporter::runtime_memory_labels_{"cpu", "gpu", "pinned"};
4445

4546
const std::vector<std::string> CustomMetricsReporter::kv_cache_keys_{"Max KV cache blocks", "Free KV cache blocks",
46-
"Used KV cache blocks", "Tokens per KV cache block", "Reused KV cache blocks"};
47-
const std::vector<std::string> CustomMetricsReporter::kv_cache_labels_{"max", "free", "used", "tokens_per", "reused"};
47+
"Used KV cache blocks", "Tokens per KV cache block", "Reused KV cache blocks", "Fraction used KV cache blocks"};
48+
const std::vector<std::string> CustomMetricsReporter::kv_cache_labels_{
49+
"max", "free", "used", "tokens_per", "reused", "fraction"};
4850

4951
const std::vector<std::string> CustomMetricsReporter::dis_serving_keys_{"KV cache transfer time", "Request count"};
5052
const std::vector<std::string> CustomMetricsReporter::dis_serving_labels_{"kv_cache_transfer_ms", "request_count"};

inflight_batcher_llm/src/model_instance_state.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1426,6 +1426,8 @@ void ModelInstanceState::WaitForStats()
14261426
statJson.append("\"Paused Requests\":" + std::to_string(modelStats.numPausedRequests) + ",");
14271427
statJson.append("\"Scheduled Requests\":" + std::to_string(modelStats.numScheduledRequests) + ",");
14281428
statJson.append("\"Total Context Tokens\":" + std::to_string(modelStats.numCtxTokens) + ",");
1429+
statJson.append("\"Waiting Requests\":"
1430+
+ std::to_string(stat.numActiveRequests - modelStats.numScheduledRequests) + ",");
14291431
}
14301432
else if (stat.staticBatchingStats.has_value())
14311433
{
@@ -1435,6 +1437,8 @@ void ModelInstanceState::WaitForStats()
14351437
statJson.append("\"Total Context Tokens\":" + std::to_string(modelStats.numCtxTokens) + ",");
14361438
statJson.append("\"Total Generation Tokens\":" + std::to_string(modelStats.numGenTokens) + ",");
14371439
statJson.append("\"Empty Generation Slots\":" + std::to_string(modelStats.emptyGenSlots) + ",");
1440+
statJson.append("\"Waiting Requests\":"
1441+
+ std::to_string(stat.numActiveRequests - modelStats.numScheduledRequests) + ",");
14381442
}
14391443
else
14401444
{
@@ -1450,6 +1454,13 @@ void ModelInstanceState::WaitForStats()
14501454
statJson.append("\"Tokens per KV cache block\":" + std::to_string(kvStats.tokensPerBlock) + ",");
14511455
statJson.append("\"Used KV cache blocks\":" + std::to_string(kvStats.usedNumBlocks) + ",");
14521456
statJson.append("\"Reused KV cache blocks\":" + std::to_string(kvStats.reusedBlocks) + ",");
1457+
// Calculate and append the used KV cache block fraction.
1458+
double fraction = 0.0;
1459+
if (static_cast<double>(kvStats.maxNumBlocks) > 0.0)
1460+
{
1461+
fraction = static_cast<double>(kvStats.usedNumBlocks) / static_cast<double>(kvStats.maxNumBlocks);
1462+
}
1463+
statJson.append("\"Fraction used KV cache blocks\":" + std::to_string(fraction) + ",");
14531464
}
14541465

14551466
// requestStats is a list where each item is associated with an iteration,

tensorrt_llm

tools/version.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2ec76b5a2868c70e6a9b5a1b9d73dc6ea84ca6de
1+
087f498586bab566e8bee1c87392cc85af4bc819

0 commit comments

Comments (0)