Skip to content

Commit

Permalink
Return token ids instead of number of token ids
Browse files Browse the repository at this point in the history
  • Loading branch information
kthui committed Nov 7, 2024
1 parent 29099df commit 457eeaa
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 35 deletions.
32 changes: 16 additions & 16 deletions ci/L0_additional_outputs_vllm/additional_outputs_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def _get_inputs(
sampling_parameters=None,
return_finish_reason=None,
return_cumulative_logprob=None,
return_num_token_ids=None,
return_token_ids=None,
):
inputs = []

Expand Down Expand Up @@ -76,9 +76,9 @@ def _get_inputs(
np.array([return_cumulative_logprob], dtype=bool)
)

if return_num_token_ids is not None:
inputs.append(grpcclient.InferInput("return_num_token_ids", [1], "BOOL"))
inputs[-1].set_data_from_numpy(np.array([return_num_token_ids], dtype=bool))
if return_token_ids is not None:
inputs.append(grpcclient.InferInput("return_token_ids", [1], "BOOL"))
inputs[-1].set_data_from_numpy(np.array([return_token_ids], dtype=bool))

return inputs

Expand Down Expand Up @@ -131,15 +131,15 @@ def _assert_cumulative_logprob(self, return_cumulative_logprob):
assert cumulative_logprob != prev_cumulative_logprob
prev_cumulative_logprob = cumulative_logprob

def _assert_num_token_ids(self, return_num_token_ids):
def _assert_token_ids(self, return_token_ids):
for response in self._responses:
result, error = response["result"], response["error"]
assert error is None
num_token_ids_np = result.as_numpy(name="num_token_ids")
if return_num_token_ids is None or return_num_token_ids == False:
assert num_token_ids_np is None
token_ids_np = result.as_numpy(name="token_ids")
if return_token_ids is None or return_token_ids == False:
assert token_ids_np is None
continue
num_token_ids = num_token_ids_np[0].astype(int)
token_ids = token_ids_np[0].astype(int)
# TODO: vLLM may return token ids identical to the previous one when
# streaming, for example:
#
Expand All @@ -155,31 +155,31 @@ def _assert_num_token_ids(self, return_num_token_ids):
# prev: text=' the term', token_ids=array('l', [5, 1385, 44, 48])
# curr: text=' the term “', token_ids=array('l', [5, 1385, 44, 48])
#
# If this is no longer the case in a future release, change the assert
# to assert num_token_ids > 0.
assert num_token_ids >= 0
# If this is no longer the case in a future release, change to
# assert len(token_ids) > 0.
assert len(token_ids) >= 0

@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize("return_finish_reason", [None, True, False])
@pytest.mark.parametrize("return_cumulative_logprob", [None, True, False])
@pytest.mark.parametrize("return_num_token_ids", [None, True, False])
@pytest.mark.parametrize("return_token_ids", [None, True, False])
def test_additional_outputs(
self,
stream,
return_finish_reason,
return_cumulative_logprob,
return_num_token_ids,
return_token_ids,
):
inputs = self._get_inputs(
self._prompt,
stream=stream,
sampling_parameters=self._sampling_parameters,
return_finish_reason=return_finish_reason,
return_cumulative_logprob=return_cumulative_logprob,
return_num_token_ids=return_num_token_ids,
return_token_ids=return_token_ids,
)
self._llm_infer(inputs)
self._assert_text_output_valid()
self._assert_finish_reason(return_finish_reason)
self._assert_cumulative_logprob(return_cumulative_logprob)
self._assert_num_token_ids(return_num_token_ids)
self._assert_token_ids(return_token_ids)
13 changes: 5 additions & 8 deletions docs/additional_outputs.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,17 +59,14 @@ point value will be sent on the `cumulative_logprob` output tensor.

Supported since r24.11.

### Number of token IDs
### Token IDs

The number of token IDs of the generated output text sent on this response. It
is the difference in length of the token IDs generated from the last response to
this response. If this is the first response, the last response length is
presumed to be zero. See
The token IDs of the generated output text sent on this response. See
[here](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/outputs.py#L21)
for more details on the token IDs of the generated output text.
for more details.

To enable, set `return_num_token_ids` input tensor to `True`. The unsigned
integer value will be sent on the `num_token_ids` output tensor.
To enable, set the `return_token_ids` input tensor to `True`. The array of
integer values will be sent on the `token_ids` output tensor.

Supported since r24.11.

Expand Down
20 changes: 9 additions & 11 deletions src/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def _auto_complete_inputs_and_outputs(auto_complete_model_config):
"optional": True,
},
{
"name": "return_num_token_ids",
"name": "return_token_ids",
"data_type": "TYPE_BOOL",
"dims": [1],
"optional": True,
Expand All @@ -111,7 +111,7 @@ def _auto_complete_inputs_and_outputs(auto_complete_model_config):
{"name": "text_output", "data_type": "TYPE_STRING", "dims": [-1]},
{"name": "finish_reason", "data_type": "TYPE_STRING", "dims": [-1]},
{"name": "cumulative_logprob", "data_type": "TYPE_FP32", "dims": [-1]},
{"name": "num_token_ids", "data_type": "TYPE_UINT32", "dims": [-1]},
{"name": "token_ids", "data_type": "TYPE_INT64", "dims": [-1, -1]},
]

# Collect input and output names from the provided model config.
Expand Down Expand Up @@ -348,11 +348,11 @@ def _get_input_tensors(self, request):
else:
parameters = request.parameters()

# return_finish_reason, return_cumulative_logprob, return_num_token_ids
# return_finish_reason, return_cumulative_logprob, return_token_ids
additional_outputs = {
"return_finish_reason": None,
"return_cumulative_logprob": None,
"return_num_token_ids": None,
"return_token_ids": None,
}
for tensor_name in additional_outputs.keys():
tensor = pb_utils.get_input_tensor_by_name(request, tensor_name)
Expand Down Expand Up @@ -467,8 +467,8 @@ def _create_response(
)
)

# num_token_ids
if additional_outputs["return_num_token_ids"]:
# token_ids
if additional_outputs["return_token_ids"]:
if prev_request_output is None:
# this is the first response
prev_lens = [0] * len(request_output.outputs)
Expand All @@ -478,14 +478,12 @@ def _create_response(
len(prev_output.token_ids)
for prev_output in prev_request_output.outputs
]
num_token_ids = [
(len(output.token_ids) - prev_len)
token_ids = [
output.token_ids[prev_len:]
for output, prev_len in zip(request_output.outputs, prev_lens)
]
output_tensors.append(
pb_utils.Tensor(
"num_token_ids", np.asarray(num_token_ids, dtype=np.uint32)
)
pb_utils.Tensor("token_ids", np.asarray(token_ids, dtype=np.int64))
)

return pb_utils.InferenceResponse(output_tensors=output_tensors)
Expand Down

0 comments on commit 457eeaa

Please sign in to comment.