
Commit c104768

Update TensorRT-LLM backend (#635)
1 parent 84ab8f6 commit c104768

32 files changed: +1529 −253 lines

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
@@ -47,3 +47,4 @@ repos:
         exclude: tools/dataset/
         args:
           - --skip=".git,tensorrt_llm"
+          - --exclude-file=all_models/whisper/whisper_bls/1/tokenizer.py

all_models/inflight_batcher_llm/tensorrt_llm/1/model.py

Lines changed: 53 additions & 37 deletions
@@ -343,7 +343,8 @@ def convert_request(request, exclude_input_from_output, decoupled):
     return requests


-def convert_response(response, batch_index):
+def convert_response(response, batch_index, batch_size, num_return_sequences):
+
     if response.has_error():
         return pb_utils.InferenceResponse(output_tensors=[],
                                           error=pb_utils.TritonError(
@@ -356,40 +357,50 @@ def convert_response(response, batch_index):
                               -1, np.int32)
     for idx, beam in enumerate(result.output_token_ids):
         output_ids[0, idx, :len(beam)] = beam
+
     output_tensors = [
         pb_utils.Tensor("output_ids", output_ids),
         pb_utils.Tensor("sequence_length", beam_lengths),
     ]
-    output_tensors.append(
-        pb_utils.Tensor(
-            "cum_log_probs",
-            np.expand_dims(np.array(result.cum_log_probs, np.float32), 0)
-            if result.cum_log_probs is not None else np.zeros(
-                (1, 1), np.float32)))
-    output_tensors.append(
-        pb_utils.Tensor(
-            "output_log_probs",
-            np.expand_dims(np.array(result.log_probs, np.float32), 0) if
-            result.log_probs is not None else np.zeros((1, 1, 1), np.float32)))
-    output_tensors.append(
-        pb_utils.Tensor(
-            "context_logits",
-            np.expand_dims(np.array(result.context_logits, np.float32), 0)
-            if result.context_logits is not None else np.zeros(
-                (1, 1, 1), np.float32)))
-    output_tensors.append(
-        pb_utils.Tensor(
-            "generation_logits",
-            np.expand_dims(np.array(result.generation_logits, np.float32), 0)
-            if result.generation_logits is not None else np.zeros(
-                (1, 1, 1, 1), np.float32)))
-    output_tensors.append(
-        pb_utils.Tensor("batch_index",
-                        np.expand_dims(np.array([batch_index], np.int32), 0)))
-    output_tensors.append(
-        pb_utils.Tensor(
-            "sequence_index",
-            np.expand_dims(np.array([result.sequence_index], np.int32), 0)))
+
+    if result.cum_log_probs is not None:
+        output_tensors.append(
+            pb_utils.Tensor(
+                "cum_log_probs",
+                np.expand_dims(np.array(result.cum_log_probs, np.float32), 0)))
+
+    if result.log_probs is not None:
+        output_tensors.append(
+            pb_utils.Tensor(
+                "output_log_probs",
+                np.expand_dims(np.array(result.log_probs, np.float32), 0)))
+
+    if result.context_logits is not None:
+        output_tensors.append(
+            pb_utils.Tensor(
+                "context_logits",
+                np.expand_dims(np.array(result.context_logits, np.float32),
+                               0)))
+
+    if result.generation_logits is not None:
+        output_tensors.append(
+            pb_utils.Tensor(
+                "generation_logits",
+                np.expand_dims(np.array(result.generation_logits, np.float32),
+                               0)))
+
+    if batch_size > 1:
+        output_tensors.append(
+            pb_utils.Tensor(
+                "batch_index",
+                np.expand_dims(np.array([batch_index], np.int32), 0)))
+
+    if num_return_sequences > 1:
+        output_tensors.append(
+            pb_utils.Tensor(
+                "sequence_index",
+                np.expand_dims(np.array([result.sequence_index], np.int32),
+                               0)))

     return pb_utils.InferenceResponse(output_tensors), result.is_final

@@ -466,6 +477,8 @@ def get_kv_cache_config(self, model_config):
         "free_gpu_memory_fraction":
         get_parameter(model_config, "kv_cache_free_gpu_mem_fraction",
                       float),
+        "cross_kv_cache_fraction":
+        get_parameter(model_config, "cross_kv_cache_fraction", float),
         "host_cache_size":
         get_parameter(model_config, "kv_cache_host_memory_bytes", int),
         "onboard_blocks":
@@ -876,11 +889,14 @@ def execute(self, requests):

         with self.lock:
             request_ids = self.executor.enqueue_requests(executor_requests)
-            for req_id, triton_req_id, triton_user_id, triton_request, batch_index in zip(
+            for req_id, triton_req_id, triton_user_id, executor_request, triton_request, batch_index in zip(
                     request_ids, triton_req_ids, triton_user_ids,
-                    triton_requests, batch_indices):
+                    executor_requests, triton_requests, batch_indices):
+
                 self.req_id_to_request_data[
-                    req_id] = triton_req_id, triton_user_id, batch_index, triton_request.get_response_sender(
+                    req_id] = triton_req_id, triton_user_id, batch_index, len(
+                        batch_indices
+                    ), executor_request.num_return_sequences, triton_request.get_response_sender(
                     )
                 self.triton_req_id_to_req_ids[triton_req_id].add(req_id)
                 if triton_user_id is not None and triton_user_id != "":
@@ -897,11 +913,11 @@ def awaiter_loop(self):
             with self.lock:
                 if req_id not in self.req_id_to_request_data:
                     continue
-                triton_req_id, triton_user_id, batch_index, response_sender = self.req_id_to_request_data[
+                triton_req_id, triton_user_id, batch_index, batch_size, num_return_sequences, response_sender = self.req_id_to_request_data[
                     req_id]

             triton_response, is_final = convert_response(
-                response, batch_index)
+                response, batch_index, batch_size, num_return_sequences)

             triton_request_final = False
             if is_final:
@@ -935,7 +951,7 @@ def cancellation_loop(self):
             time.sleep(self.cancellation_check_period_ms / 1000.0)
             with self.lock:
                 for req_id, (triton_req_id, triton_user_id, batch_index,
-                             response_sender
+                             batch_size, num_return_sequences, response_sender
                              ) in self.req_id_to_request_data.items():
                     if response_sender.is_cancelled():
                         self.executor.cancel_request(req_id)
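
Taken together, the model.py changes thread two extra values, the batch size and the number of return sequences, from enqueue time through to response conversion, so convert_response can omit the batch_index and sequence_index tensors when they carry no information. A minimal sketch of the new bookkeeping (plain Python; the ids and the None response_sender are placeholders standing in for real Triton objects):

# Each executor request id now maps to a six-field tuple instead of four.
req_id_to_request_data = {}
req_id = 7  # hypothetical executor request id
req_id_to_request_data[req_id] = (
    1001,    # triton_req_id
    "user",  # triton_user_id
    0,       # batch_index
    1,       # batch_size, recorded as len(batch_indices)
    1,       # num_return_sequences, taken from the executor request
    None,    # placeholder for triton_request.get_response_sender()
)

# awaiter_loop unpacks all six fields and forwards the two new ones:
(triton_req_id, triton_user_id, batch_index, batch_size,
 num_return_sequences, response_sender) = req_id_to_request_data[req_id]

# convert_response(response, batch_index, batch_size, num_return_sequences)
# then appends "batch_index" only when batch_size > 1 and "sequence_index"
# only when num_return_sequences > 1, mirroring the None checks used for
# the optional log-prob and logits tensors.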

all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt

Lines changed: 21 additions & 0 deletions
@@ -44,6 +44,21 @@ input [
     data_type: TYPE_INT32
     dims: [ -1 ]
     allow_ragged_batch: true
+    optional: true
+  },
+  {
+    name: "encoder_input_features"
+    data_type: TYPE_FP16
+    dims: [ -1, -1 ]
+    allow_ragged_batch: true
+    optional: true
+  },
+  {
+    name: "encoder_output_lengths"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
   },
   {
     name: "input_lengths"
@@ -465,6 +480,12 @@ parameters: {
     string_value: "${kv_cache_free_gpu_mem_fraction}"
   }
 }
+parameters: {
+  key: "cross_kv_cache_fraction"
+  value: {
+    string_value: "${cross_kv_cache_fraction}"
+  }
+}
 parameters: {
   key: "kv_cache_host_memory_bytes"
   value: {
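
The two new optional inputs exist for encoder-decoder models such as Whisper, whose encoder consumes audio features rather than token ids. A client-side sketch of supplying them (the tensor names and dtypes come from the config above; the server URL, model name, feature layout, and lengths are illustrative assumptions):

import numpy as np
import tritonclient.grpc as grpcclient

# Hypothetical Whisper-style features: batch of 1, 128 mel bins x 3000 frames.
features = np.zeros((1, 128, 3000), dtype=np.float16)

client = grpcclient.InferenceServerClient("localhost:8001")
inputs = [
    grpcclient.InferInput("encoder_input_features", list(features.shape),
                          "FP16"),
    grpcclient.InferInput("encoder_output_lengths", [1, 1], "INT32"),
]
inputs[0].set_data_from_numpy(features)
# Assumed 2x encoder downsampling: 3000 frames -> 1500 encoder steps.
inputs[1].set_data_from_numpy(np.array([[1500]], dtype=np.int32))
# The remaining required inputs (input_ids, request_output_len, ...) are
# omitted; this only illustrates the new optional tensors.
# result = client.infer("tensorrt_llm", inputs)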

all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py

Lines changed: 10 additions & 8 deletions
@@ -358,14 +358,16 @@ def postprocess(self, gen_response: GenerationResponse,
         )

         batch_index = gen_response.batch_index
-        if batch_index.ndim != 2:
-            raise Exception("Expected batch_index tensor to have 2 dims.")
-        if batch_index.shape[0] != 1:
-            raise Exception("Expected batch size of 1")
-        if batch_index.shape[1] != 1:
-            raise Exception("Expected only one batch_index")
-
-        batch_index = batch_index[0][0]
+        if batch_index is not None:
+            if batch_index.ndim != 2:
+                raise Exception(
+                    "Expected batch_index tensor to have 2 dims.")
+            if batch_index.shape[0] != 1:
+                raise Exception("Expected batch size of 1")
+            if batch_index.shape[1] != 1:
+                raise Exception("Expected only one batch_index")
+
+        batch_index = batch_index[0][0] if batch_index is not None else 0

         self._accumulated_tokens[batch_index] = new_tokens if (
             self._accumulated_tokens[batch_index] is None
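
Since the tensorrt_llm model now sends batch_index only when the batch holds more than one request, the BLS postprocess must tolerate its absence and fall back to slot 0. A small sketch of that fallback (resolve_batch_index is a hypothetical helper, not part of the module):

import numpy as np

def resolve_batch_index(batch_index):
    # Mirrors the guarded checks above: validate only when the tensor
    # arrived, otherwise default to the first (and only) batch slot.
    if batch_index is not None:
        if batch_index.ndim != 2 or batch_index.shape != (1, 1):
            raise Exception("Expected a single [1, 1] batch_index tensor.")
    return int(batch_index[0][0]) if batch_index is not None else 0

print(resolve_batch_index(None))                       # -> 0
print(resolve_batch_index(np.array([[2]], np.int32)))  # -> 2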

all_models/tests/test_python_backend.py

Lines changed: 20 additions & 11 deletions
@@ -539,7 +539,10 @@ def test_convert_request_invalid():

 def test_convert_response(trtllm_response: trtllm.Response):
     batch_index = 2
-    response, is_final = convert_response(trtllm_response, batch_index)
+    batch_size = 3
+    num_return_sequences = 1
+    response, is_final = convert_response(trtllm_response, batch_index,
+                                          batch_size, num_return_sequences)
     assert is_final == True
     assert (response.tensors["output_ids"].as_numpy() == np.array([[1, 2, 3]
                                                                    ])).all()
@@ -559,27 +562,30 @@ def test_convert_response(trtllm_response: trtllm.Response):

 def test_convert_response_minimal(trtllm_response_minimal: trtllm.Response):
     batch_index = 2
-    response, is_final = convert_response(trtllm_response_minimal, batch_index)
+    batch_size = 3
+    num_return_sequences = 1
+    response, is_final = convert_response(trtllm_response_minimal, batch_index,
+                                          batch_size, num_return_sequences)
     assert is_final == False
     assert (response.tensors["output_ids"].as_numpy() == np.array([[1, 2, 3]
                                                                    ])).all()
     assert (response.tensors["sequence_length"].as_numpy() == np.array(
         [[3]])).all()
-    assert (response.tensors["cum_log_probs"].as_numpy() == np.zeros(
-        (1, 1), np.float32)).all()
-    assert (response.tensors["output_log_probs"].as_numpy() == np.zeros(
-        (1, 1, 1), np.float32)).all()
-    assert (response.tensors["context_logits"].as_numpy() == np.zeros(
-        (1, 1, 1), np.float32)).all()
-    assert (response.tensors["generation_logits"].as_numpy() == np.zeros(
-        (1, 1, 1, 1), np.float32)).all()
+    assert "cum_log_probs" not in response.tensors
+    assert "output_log_probs" not in response.tensors
+    assert "output_log_probs" not in response.tensors
+    assert "context_logits" not in response.tensors
+    assert "generation_logits" not in response.tensors
     assert (response.tensors["batch_index"].as_numpy() == np.array(
         [[batch_index]])).all()


 def test_convert_response_error(trtllm_response_error: trtllm.Response):
     batch_index = 2
-    response, is_final = convert_response(trtllm_response_error, batch_index)
+    batch_size = 3
+    num_return_sequences = 1
+    response, is_final = convert_response(trtllm_response_error, batch_index,
+                                          batch_size, num_return_sequences)
     assert is_final == True
     assert response.has_error() and response.error.message == "internal error"

@@ -637,6 +643,7 @@ def model_config() -> Dict:
         "max_attention_window_size": "2",
         "sink_token_length": "3",
         "kv_cache_free_gpu_mem_fraction": "0.5",
+        "cross_kv_cache_fraction": "0.5",
         "kv_cache_host_memory_bytes": "4",
         "kv_cache_onboard_blocks": "false",
         "gpu_device_ids": "0,1,2,3",
@@ -665,6 +672,7 @@ def test_get_executor_config(model_config: Dict):
     assert config.kv_cache_config.max_attention_window == [2]
     assert config.kv_cache_config.sink_token_length == 3
     assert config.kv_cache_config.free_gpu_memory_fraction == 0.5
+    assert config.kv_cache_config.cross_kv_cache_fraction == 0.5
     assert config.kv_cache_config.host_cache_size == 4
     assert config.kv_cache_config.onboard_blocks == False
     assert config.parallel_config.device_ids == [0, 1, 2, 3]
@@ -707,6 +715,7 @@ def test_get_executor_config_minimal():
     assert config.kv_cache_config.max_attention_window is None
     assert config.kv_cache_config.sink_token_length is None
     assert config.kv_cache_config.free_gpu_memory_fraction is None
+    assert config.kv_cache_config.cross_kv_cache_fraction is None
     assert config.kv_cache_config.host_cache_size is None
     assert config.kv_cache_config.onboard_blocks == True
     assert config.parallel_config is None
Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Reference: https://github.com/openai/whisper/blob/main/whisper/audio.py
+import os
+from typing import Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+
+def mel_filters(device, n_mels: int = 128) -> torch.Tensor:
+    """
+    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
+    Allows decoupling librosa dependency; saved using:
+
+        np.savez_compressed(
+            "mel_filters.npz",
+            mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
+        )
+    """
+    assert n_mels == 80 or n_mels == 128, f"Unsupported n_mels: {n_mels}"
+    with np.load(os.path.join(os.path.dirname(__file__),
+                              "mel_filters.npz")) as f:
+        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
+
+
+def log_mel_spectrogram(
+    audio: Union[torch.Tensor],
+    filters: torch.Tensor,
+    n_mels: int = 128,
+    n_fft: int = 400,
+    hop_length: int = 160,
+):
+    """
+    Compute the log-Mel spectrogram of audio
+
+    Parameters
+    ----------
+    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
+        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
+
+    n_mels: int
+        The number of Mel-frequency filters, only 80 or 128 is supported
+
+    filters: torch.Tensor
+
+    Returns
+    -------
+    torch.Tensor, shape = (128, n_frames)
+        A Tensor that contains the Mel spectrogram
+    """
+    window = torch.hann_window(n_fft).to(audio.device)
+    stft = torch.stft(audio,
+                      n_fft,
+                      hop_length,
+                      window=window,
+                      return_complex=True)
+    magnitudes = stft[..., :-1].abs()**2
+
+    mel_spec = filters @ magnitudes
+    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+    log_spec = (log_spec + 4.0) / 4.0
+    # cast to float 16
+    log_spec = log_spec.half()
+    return log_spec
+
+
+class FeatureExtractor(torch.nn.Module):
+    """Your Python model must use the same class name. Every Python model
+    that is created must have "TritonPythonModel" as the class name.
+    """
+
+    def __init__(self, n_mels: int = 128):
+        self.device = torch.device("cuda")
+        self.n_mels = n_mels
+        self.filters = mel_filters(self.device, n_mels=self.n_mels)
+
+    def compute_feature(self, wav, target: int = 3000):
+        mel = log_mel_spectrogram(wav, self.filters)
+        if mel.shape[1] < target:
+            mel = F.pad(mel, (0, target - mel.shape[1]), mode='constant')
+        if mel.shape[1] % 2:
+            # pad to even length for remove_padding case, since conv1d requires even length
+            mel = torch.nn.functional.pad(mel, (0, 1))
+        mel = mel.unsqueeze(0)
+        return mel
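
A minimal usage sketch for the feature extractor above (it assumes a CUDA device and the mel_filters.npz bundled next to the module; one second of silence stands in for real 16 kHz audio):

import torch

extractor = FeatureExtractor(n_mels=128)
wav = torch.zeros(16000, device=extractor.device)  # 1 s of 16 kHz audio
mel = extractor.compute_feature(wav)               # padded out to 3000 frames
print(mel.shape, mel.dtype)  # torch.Size([1, 128, 3000]) torch.float16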
