diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index d687ed49b71e0..97c1ef5e9e52d 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -92,10 +92,12 @@ def _validate_allowed_token_ids(
             return
         if params.allowed_token_ids is None:
             return
-        if not all(0 <= tid < self.model_config.vocab_size
-                   for tid in params.allowed_token_ids):
+        if not params.allowed_token_ids:
+            raise ValueError("allowed_token_ids is not None and empty!")
+        vocab_size = self.model_config.get_vocab_size()
+        if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
             raise ValueError(
-                "allowed_token_ids contains out-of-vocab token id")
+                "allowed_token_ids contains out-of-vocab token id!")
 
     def process_inputs(
         self,
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index b0b218d92b927..2fe177ea4e126 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -199,6 +199,8 @@ def __init__(
         self.logit_bias: list[Optional[dict[int,
                                             float]]] = [None] * max_num_reqs
 
         self.has_allowed_token_ids: set[str] = set()
+        # NOTE(lufang): In the mask tensor, if the corresponding token is
+        # allowed, the value is False, since we use masked_fill_ to set -inf.
         self.allowed_token_ids_mask: Optional[torch.Tensor] = None
         self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None
 
@@ -300,6 +302,7 @@ def add_request(
             self.has_allowed_token_ids.add(req_id)
             if self.allowed_token_ids_mask_cpu_tensor is None:
                 # Lazy allocation for this tensor, which can be large.
+                # False means we don't fill with -inf.
                 self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs,
                                                           self.vocab_size,
                                                           dtype=torch.bool,
@@ -309,8 +312,10 @@ def add_request(
                     self.vocab_size,
                     dtype=torch.bool,
                     device="cpu")
+            self.allowed_token_ids_mask_cpu_tensor[req_index] = True
+            # False means we don't fill with -inf.
             self.allowed_token_ids_mask_cpu_tensor[req_index][
-                sampling_params.allowed_token_ids] = True
+                sampling_params.allowed_token_ids] = False
 
         # Add request lora ID
         if request.lora_request:
@@ -359,6 +364,7 @@ def remove_request(self, req_id: str) -> Optional[int]:
         self.logit_bias[req_index] = None
         self.has_allowed_token_ids.discard(req_id)
         if self.allowed_token_ids_mask_cpu_tensor is not None:
+            # False means we don't fill with -inf.
             self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False)
         return req_index
 
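
Not part of the patch above: a minimal standalone sketch of the inverted-mask semantics the NOTE describes (the vocab size and token ids here are made up for illustration, and this is not the vLLM sampler itself). With False meaning "allowed", the mask is consumed by filling the True positions with -inf, so only the allowed token ids keep finite logits.

```python
import torch

# Hypothetical example values; not taken from vLLM.
vocab_size = 8
allowed_token_ids = [1, 3, 5]

# Mirror the diff: start with everything True (= will be filled with -inf),
# then clear the allowed positions to False.
mask = torch.ones(vocab_size, dtype=torch.bool)
mask[allowed_token_ids] = False

# Apply the mask: True positions become -inf, allowed ids keep their logits.
logits = torch.randn(vocab_size)
logits.masked_fill_(mask, float("-inf"))

assert torch.isfinite(logits[allowed_token_ids]).all()
assert torch.isinf(logits[[0, 2, 4, 6, 7]]).all()
```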