
Commit 62cd00f

Shixiaowei02 and kaiyux authored
Update TensorRT-LLM backend (#504)
Co-authored-by: Kaiyu Xie <[email protected]>
1 parent 566b4ff commit 62cd00f

29 files changed (+745 additions, -63 deletions)

README.md

Lines changed: 43 additions & 0 deletions

@@ -637,6 +637,49 @@ nv_inference_compute_output_duration_us{model="tensorrt_llm",version="1"} 0
 nv_inference_pending_request_count{model="tensorrt_llm",version="1"} 0
 ```
 
+## Multi-instance Support
+
+The TensorRT-LLM backend relies on MPI to coordinate the execution of a model
+across multiple GPUs and nodes. Currently, two different modes are supported
+for running a model across multiple GPUs:
+
+1. [Leader mode](#leader-mode)
+2. [Orchestrator mode](#orchestrator-mode)
+
+### Leader Mode
+
+In leader mode, the TensorRT-LLM backend spawns one Triton Server process for
+every GPU. The process with rank 0 is the leader process. The other Triton
+Server processes do not return from the `TRITONBACKEND_ModelInstanceInitialize`
+call, which avoids port collisions while still allowing those processes to
+receive requests.
+
+An overview of this mode is shown in the diagram below:
+
+![Leader Mode Overview](./images/leader-mode.png)
+
+This mode works well with [slurm](https://slurm.schedmd.com) deployments since
+it does not use
+[MPI_Comm_spawn](https://www.open-mpi.org/doc/v4.1/man3/MPI_Comm_spawn.3.php).
+
+### Orchestrator Mode
+
+In orchestrator mode, the TensorRT-LLM backend spawns a single Triton Server
+process that acts as an orchestrator, which in turn spawns one Triton Server
+process for every GPU that each model requires. This mode is mainly used when
+serving multiple models with the TensorRT-LLM backend. In this mode, the `MPI`
+world size must be one, since the TRT-LLM backend automatically creates new
+workers as needed. An overview of this mode is shown in the diagram below:
+
+![Orchestrator Mode Overview](./images/orchestrator-mode.png)
+
+Since this mode uses
+[MPI_Comm_spawn](https://www.open-mpi.org/doc/v4.1/man3/MPI_Comm_spawn.3.php),
+it might not work properly with [slurm](https://slurm.schedmd.com) deployments.
+Additionally, it currently works only for single-node deployments.
+
+### Running Multiple Instances of the LLaMa Model
+
+Please refer to [Running Multiple Instances of the LLaMa Model](docs/llama_multi_instance.md)
+for more information on running multiple instances of the LLaMa model in
+different configurations.
+
 ## Testing the TensorRT-LLM Backend
 Please follow the guide in [`ci/README.md`](ci/README.md) to see how to run
 the testing for TensorRT-LLM backend.
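The two modes differ mainly in who creates the MPI processes: leader mode starts the full world up front (one rank per GPU), while orchestrator mode starts with a world size of one and spawns workers dynamically. Below is a minimal, illustrative sketch of the leader-mode rank split using mpi4py; `serve_forever` and `participate_in_execution` are hypothetical stand-ins for Triton's server loop and the per-GPU worker loop, not real backend APIs:

```python
# Launch with: mpirun -n <num_gpus> python leader_mode_sketch.py
from mpi4py import MPI

def serve_forever():
    print("rank 0: leader process, owns the HTTP/GRPC ports")

def participate_in_execution(rank):
    print(f"rank {rank}: worker, never returns from model-instance init")

rank = MPI.COMM_WORLD.Get_rank()
if rank == 0:
    serve_forever()                 # only the leader binds server ports
else:
    participate_in_execution(rank)  # other ranks avoid port collisions
```

Orchestrator mode instead relies on dynamic process creation, which mpi4py exposes as `MPI.COMM_SELF.Spawn(...)`; that dynamic spawning is what tends to conflict with slurm's process management.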

all_models/gpt/postprocessing/1/model.py

Lines changed: 2 additions & 1 deletion

@@ -35,7 +35,8 @@ def initialize(self, args):
             legacy=False,
             padding_side="left",
             trust_remote_code=True)
-        self.tokenizer.pad_token = self.tokenizer.eos_token
+        if not self.tokenizer.pad_token:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
 
         # Parse model output configs
         output_config = pb_utils.get_output_config_by_name(
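This guard matters because some tokenizers (e.g., GPT-2's) ship without a pad token while others (e.g., T5's) already define one; unconditionally overwriting it with `eos_token` would change padding behavior for the second group. A standalone sketch of the pattern, assuming the `transformers` package:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # gpt2 defines no pad token
# Fall back to eos_token only when no pad token is already configured.
if not tokenizer.pad_token:
    tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.pad_token)  # "<|endoftext|>" for gpt2
```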

all_models/gpt/preprocessing/1/model.py

Lines changed: 3 additions & 1 deletion

@@ -38,7 +38,9 @@ def initialize(self, args):
             padding_side='left',
             legacy=False,
             trust_remote_code=True)
-        self.tokenizer.pad_token = self.tokenizer.eos_token
+        if not self.tokenizer.pad_token:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
         self.pad_id = self.tokenizer.encode(self.tokenizer.pad_token,
                                             add_special_tokens=False)[0]

all_models/inflight_batcher_llm/ensemble/config.pbtxt

Lines changed: 26 additions & 0 deletions

@@ -33,6 +33,12 @@ input [
     data_type: TYPE_STRING
     dims: [ -1 ]
   },
+  {
+    name: "decoder_text_input"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+    optional: true
+  },
   {
     name: "max_tokens"
     data_type: TYPE_INT32
@@ -207,6 +213,10 @@ ensemble_scheduling {
     key: "QUERY"
     value: "text_input"
   }
+  input_map {
+    key: "DECODER_QUERY"
+    value: "decoder_text_input"
+  }
   input_map {
     key: "REQUEST_OUTPUT_LEN"
     value: "max_tokens"
@@ -243,6 +253,14 @@ ensemble_scheduling {
     key: "INPUT_ID"
     value: "_INPUT_ID"
   }
+  output_map {
+    key: "REQUEST_DECODER_INPUT_LEN"
+    value: "_REQUEST_DECODER_INPUT_LEN"
+  }
+  output_map {
+    key: "DECODER_INPUT_ID"
+    value: "_DECODER_INPUT_ID"
+  }
   output_map {
     key: "REQUEST_OUTPUT_LEN"
     value: "_REQUEST_OUTPUT_LEN"
@@ -275,10 +293,18 @@ ensemble_scheduling {
     key: "input_ids"
     value: "_INPUT_ID"
   }
+  input_map {
+    key: "decoder_input_ids"
+    value: "_DECODER_INPUT_ID"
+  }
   input_map {
     key: "input_lengths"
     value: "_REQUEST_INPUT_LEN"
   }
+  input_map {
+    key: "decoder_input_lengths"
+    value: "_REQUEST_DECODER_INPUT_LEN"
+  }
   input_map {
     key: "request_output_len"
     value: "_REQUEST_OUTPUT_LEN"
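With `decoder_text_input` exposed as an optional ensemble input, a client can pass a decoder-side prompt for encoder-decoder engines. A hedged sketch using the `tritonclient` package; the endpoint, prompt text, and the `text_output` output name are assumptions about a typical deployment of this repository's `ensemble` model:

```python
import numpy as np
import tritonclient.http as httpclient

def string_tensor(name: str, text: str) -> httpclient.InferInput:
    data = np.array([[text]], dtype=object)  # TYPE_STRING travels as BYTES
    tensor = httpclient.InferInput(name, data.shape, "BYTES")
    tensor.set_data_from_numpy(data)
    return tensor

client = httpclient.InferenceServerClient("localhost:8000")  # assumed endpoint
inputs = [
    string_tensor("text_input", "translate English to German: Hello"),
    string_tensor("decoder_text_input", ""),  # optional decoder prompt
]
max_tokens = httpclient.InferInput("max_tokens", [1, 1], "INT32")
max_tokens.set_data_from_numpy(np.array([[64]], dtype=np.int32))
inputs.append(max_tokens)

result = client.infer("ensemble", inputs)
print(result.as_numpy("text_output"))
```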

all_models/inflight_batcher_llm/postprocessing/1/model.py

Lines changed: 2 additions & 1 deletion

@@ -82,7 +82,8 @@ def initialize(self, args):
             legacy=False,
             padding_side='left',
             trust_remote_code=True)
-        self.tokenizer.pad_token = self.tokenizer.eos_token
+        if not self.tokenizer.pad_token:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
 
         # Parse model output configs
         output_config = pb_utils.get_output_config_by_name(

all_models/inflight_batcher_llm/preprocessing/1/model.py

Lines changed: 29 additions & 4 deletions

@@ -84,7 +84,9 @@ def initialize(self, args):
             trust_remote_code=True)
         if isinstance(self.tokenizer, T5Tokenizer):
             self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id()
-        self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        if not self.tokenizer.pad_token:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
 
         self.tokenizer_end_id = self.tokenizer.encode(
             self.tokenizer.eos_token, add_special_tokens=False)[0]
@@ -93,7 +95,8 @@ def initialize(self, args):
 
         # Parse model output configs and convert Triton types to numpy types
         output_names = [
-            "INPUT_ID", "REQUEST_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS",
+            "INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN",
+            "REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS",
             "OUT_END_ID", "OUT_PAD_ID"
         ]
         input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"]
@@ -142,6 +145,11 @@ def execute(self, requests):
             # Get input tensors
             query = pb_utils.get_input_tensor_by_name(request,
                                                       'QUERY').as_numpy()
+            decoder_query = pb_utils.get_input_tensor_by_name(
+                request, 'DECODER_QUERY')
+            if decoder_query is not None:
+                decoder_query = decoder_query.as_numpy()
+
             batch_dim = query.shape[0]
             if batch_dim != 1:
 
@@ -194,6 +202,15 @@ def execute(self, requests):
 
             # Preprocessing input data.
             input_id, request_input_len = self._create_request(query)
+            print(input_id)
+            print(request_input_len)
+            if decoder_query is not None:
+                decoder_input_id, request_decoder_input_len = self._create_request(
+                    decoder_query)
+            else:
+                decoder_input_id = pad_id * np.ones((1, 1), np.int32)
+                request_decoder_input_len = 1 * np.ones((1, 1), np.int32)
+
             bad_words = self._to_word_list_format(bad_words_dict)
             stop_words = self._to_word_list_format(stop_words_dict)
 
@@ -208,6 +225,13 @@ def execute(self, requests):
             request_input_len_tensor = pb_utils.Tensor(
                 'REQUEST_INPUT_LEN',
                 request_input_len.astype(self.request_input_len_dtype))
+            decoder_input_id_tensor = pb_utils.Tensor(
+                'DECODER_INPUT_ID',
+                decoder_input_id.astype(self.decoder_input_id_dtype))
+            request_decoder_input_len_tensor = pb_utils.Tensor(
+                'REQUEST_DECODER_INPUT_LEN',
+                request_decoder_input_len.astype(
+                    self.request_decoder_input_len_dtype))
             request_output_len_tensor = pb_utils.Tensor(
                 'REQUEST_OUTPUT_LEN', request_output_len)
             bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
@@ -221,8 +245,9 @@ def execute(self, requests):
                 np.array(pad_id, dtype=np.int32))
 
             inference_response = pb_utils.InferenceResponse(output_tensors=[
-                input_id_tensor, bad_words_ids_tensor, stop_words_ids_tensor,
-                request_input_len_tensor, request_output_len_tensor,
+                input_id_tensor, decoder_input_id_tensor, bad_words_ids_tensor,
+                stop_words_ids_tensor, request_input_len_tensor,
+                request_decoder_input_len_tensor, request_output_len_tensor,
                 embedding_bias_tensor, end_id_tensor, pad_id_tensor
             ])
             responses.append(inference_response)
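The `execute` changes lean on Triton's Python backend returning `None` for optional inputs that a request omits; when `DECODER_QUERY` is absent, the preprocessor substitutes a single pad token as the decoder start sequence. A standalone numpy sketch of that fallback, with an illustrative `pad_id`:

```python
import numpy as np

def default_decoder_inputs(pad_id: int):
    # A 1x1 batch holding a single pad token, plus a matching length of 1,
    # mirroring the else-branch in the diff above.
    decoder_input_id = pad_id * np.ones((1, 1), np.int32)
    request_decoder_input_len = np.ones((1, 1), np.int32)
    return decoder_input_id, request_decoder_input_len

ids, lengths = default_decoder_inputs(pad_id=0)  # 0 is only an example id
print(ids, lengths)
```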

all_models/inflight_batcher_llm/preprocessing/config.pbtxt

Lines changed: 16 additions & 0 deletions

@@ -33,6 +33,12 @@ input [
     data_type: TYPE_STRING
     dims: [ -1 ]
   },
+  {
+    name: "DECODER_QUERY"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+    optional: true
+  },
   {
     name: "REQUEST_OUTPUT_LEN"
     data_type: TYPE_INT32
@@ -86,6 +92,16 @@ output [
     data_type: TYPE_INT32
     dims: [ 1 ]
   },
+  {
+    name: "DECODER_INPUT_ID"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+  },
+  {
+    name: "REQUEST_DECODER_INPUT_LEN"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
   {
     name: "BAD_WORDS_IDS"
     data_type: TYPE_INT32

all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt

Lines changed: 20 additions & 0 deletions

@@ -62,6 +62,20 @@ input [
     optional: true
     allow_ragged_batch: true
   },
+  {
+    name: "decoder_input_ids"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
+  {
+    name: "decoder_input_lengths"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    optional: true
+    reshape: { shape: [ ] }
+  },
   {
     name: "draft_logits"
     data_type: TYPE_FP32
@@ -368,6 +382,12 @@ parameters: {
     string_value: "${engine_dir}"
   }
 }
+parameters: {
+  key: "encoder_model_path"
+  value: {
+    string_value: "${encoder_engine_dir}"
+  }
+}
 parameters: {
   key: "max_tokens_in_paged_kv_cache"
   value: {
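The new `encoder_model_path` parameter follows the same `${...}` template convention as `engine_dir` and must be filled in before the model repository is loaded. A minimal sketch using Python's `string.Template` (the paths are examples, and the repository's own template-filling tooling may be the intended mechanism):

```python
from pathlib import Path
from string import Template

config = Path("all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt")
text = config.read_text()
# safe_substitute leaves any placeholders we do not supply untouched.
filled = Template(text).safe_substitute(
    engine_dir="/engines/t5/decoder",          # example path
    encoder_engine_dir="/engines/t5/encoder",  # example path
)
config.write_text(filled)
```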

all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py

Lines changed: 5 additions & 0 deletions

@@ -58,6 +58,7 @@ def _single_value(data: Optional[np.ndarray]):
 @dataclass
 class Request:
     text_input: np.ndarray = np.array([])
+    decoder_text_input: np.ndarray = None
     max_tokens: np.ndarray = np.array([])
     bad_words: Optional[np.ndarray] = None
     stop_words: Optional[np.ndarray] = None
@@ -112,7 +113,9 @@ class DraftRequest:
 @dataclass
 class PreprocResponse:
     input_ids: np.ndarray = np.array([])
+    decoder_input_ids: np.ndarray = None
     input_lengths: np.ndarray = np.array([])
+    decoder_input_lengths: np.ndarray = None
     bad_words_list: Optional[np.ndarray] = None
     stop_words_list: Optional[np.ndarray] = None
     embedding_bias: Optional[np.ndarray] = None
@@ -129,6 +132,8 @@ def with_new_inputs(cls,
                     if input_ids is not None else other.input_ids),
                 input_lengths=(input_lengths if input_lengths is not None else
                                other.input_lengths),
+                decoder_input_ids=other.decoder_input_ids,
+                decoder_input_lengths=other.decoder_input_lengths,
                 bad_words_list=other.bad_words_list,
                 stop_words_list=other.stop_words_list,
                 end_id=other.end_id,

all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/triton_decoder.py

Lines changed: 7 additions & 0 deletions

@@ -50,7 +50,9 @@ def __init__(self,
 
         self._preproc_outputs = [
             "INPUT_ID",
+            "DECODER_INPUT_ID",
             "REQUEST_INPUT_LEN",
+            "REQUEST_DECODER_INPUT_LEN",
             "BAD_WORDS_IDS",
             "STOP_WORDS_IDS",
             "EMBEDDING_BIAS",
@@ -73,6 +75,7 @@ def __init__(self,
 
         self.input_names = [
             "text_input",
+            "decoder_text_input",
             "max_tokens",
             "bad_words",
             "stop_words",
@@ -217,6 +220,7 @@ def preprocess(self, request: Request) -> PreprocResponse:
     def _get_preproc_tensors(self, request: Request):
         name_map = {
             "text_input": "QUERY",
+            "decoder_text_input": "DECODER_QUERY",
             "max_tokens": "REQUEST_OUTPUT_LEN",
             "bad_words": "BAD_WORDS_DICT",
             "stop_words": "STOP_WORDS_DICT",
@@ -230,7 +234,9 @@ def _get_preproc_tensors(self, request: Request):
     def _get_preproc_response(self, triton_output):
         name_map = {
             "INPUT_ID": "input_ids",
+            "DECODER_INPUT_ID": "decoder_input_ids",
             "REQUEST_INPUT_LEN": "input_lengths",
+            "REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths",
             "BAD_WORDS_IDS": "bad_words_list",
             "STOP_WORDS_IDS": "stop_words_list",
             "EMBEDDING_BIAS": "embedding_bias",
@@ -303,6 +309,7 @@ def _get_llm_tensors(self,
     def _get_tensors_from_preproc(self, preproc: PreprocResponse):
         name_map = {
             "input_ids": "input_ids",
+            "decoder_input_ids": "decoder_input_ids",
             "input_lengths": "input_lengths",
             "bad_words_list": "bad_words_list",
             "stop_words_list": "stop_words_list",

all_models/inflight_batcher_llm/tensorrt_llm_bls/config.pbtxt

Lines changed: 6 additions & 0 deletions

@@ -38,6 +38,12 @@ input [
     data_type: TYPE_STRING
     dims: [ -1 ]
   },
+  {
+    name: "decoder_text_input"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+    optional: true
+  },
   {
     name: "max_tokens"
     data_type: TYPE_INT32
