From a5457a6424811ad50cddac376a2974545f38842e Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Mon, 3 Mar 2025 17:32:54 -0800
Subject: [PATCH 1/4] Fix allowed_token_ids for v1 Sampler

Signed-off-by: Lu Fang
---
 vllm/v1/engine/processor.py       |  7 +++++--
 vllm/v1/worker/gpu_input_batch.py | 14 +++++++-------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 3a3fc69e53e44..03f5706401cd2 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -92,10 +92,13 @@ def _validate_allowed_token_ids(
             return
         if params.allowed_token_ids is None:
             return
-        if not all(0 <= tid < self.model_config.vocab_size
+        if params.allowed_token_ids is not None and len(
+                params.allowed_token_ids) == 0:
+            raise ValueError("allowed_token_ids is not None and empty!")
+        if not all(0 <= tid < self.model_config.get_vocab_size()
                    for tid in params.allowed_token_ids):
             raise ValueError(
-                "allowed_token_ids contains out-of-vocab token id")
+                "allowed_token_ids contains out-of-vocab token id!")
 
     def process_inputs(
         self,
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index b0b218d92b927..13f3d461ee4a9 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -300,17 +300,17 @@ def add_request(
             self.has_allowed_token_ids.add(req_id)
             if self.allowed_token_ids_mask_cpu_tensor is None:
                 # Lazy allocation for this tensor, which can be large.
-                self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs,
-                                                          self.vocab_size,
-                                                          dtype=torch.bool,
-                                                          device=self.device)
-                self.allowed_token_ids_mask_cpu_tensor = torch.zeros(
+                self.allowed_token_ids_mask = torch.ones(self.max_num_reqs,
+                                                         self.vocab_size,
+                                                         dtype=torch.bool,
+                                                         device=self.device)
+                self.allowed_token_ids_mask_cpu_tensor = torch.ones(
                     self.max_num_reqs,
                     self.vocab_size,
                     dtype=torch.bool,
                     device="cpu")
             self.allowed_token_ids_mask_cpu_tensor[req_index][
-                sampling_params.allowed_token_ids] = True
+                sampling_params.allowed_token_ids] = False
 
         # Add request lora ID
         if request.lora_request:
@@ -359,7 +359,7 @@ def remove_request(self, req_id: str) -> Optional[int]:
         self.logit_bias[req_index] = None
         self.has_allowed_token_ids.discard(req_id)
         if self.allowed_token_ids_mask_cpu_tensor is not None:
-            self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False)
+            self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(True)
         return req_index
 
     def swap_states(self, i1: int, i2: int) -> None:

From 56cf0cc805c94ece34184f693ca4cf94d2f42cb2 Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Mon, 3 Mar 2025 22:11:31 -0800
Subject: [PATCH 2/4] address comments

Signed-off-by: Lu Fang
---
 vllm/v1/engine/processor.py       | 7 +++----
 vllm/v1/worker/gpu_input_batch.py | 2 ++
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 03f5706401cd2..9a400f2f12524 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -92,11 +92,10 @@ def _validate_allowed_token_ids(
             return
         if params.allowed_token_ids is None:
             return
-        if params.allowed_token_ids is not None and len(
-                params.allowed_token_ids) == 0:
+        if not params.allowed_token_ids:
             raise ValueError("allowed_token_ids is not None and empty!")
-        if not all(0 <= tid < self.model_config.get_vocab_size()
-                   for tid in params.allowed_token_ids):
+        vocab_size = self.model_config.get_vocab_size()
+        if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
             raise ValueError(
                 "allowed_token_ids contains out-of-vocab token id!")
 
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 13f3d461ee4a9..90082ced5b4da 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -199,6 +199,8 @@ def __init__(
         self.logit_bias: list[Optional[dict[int,
                                             float]]] = [None] * max_num_reqs
         self.has_allowed_token_ids: set[str] = set()
+        # NOTE(lufang): In the mask tensor, if the corresponding token is
+        # allowed, the value is False, since we use masked_fill_ to set -inf.
         self.allowed_token_ids_mask: Optional[torch.Tensor] = None
         self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None
 

From 830f784b3a119a08eb4bdc6c31241de09107e782 Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Tue, 4 Mar 2025 13:51:07 -0800
Subject: [PATCH 3/4] address comments to make sure we set the default value
 for mask correctly

Signed-off-by: Lu Fang
---
 vllm/v1/worker/gpu_input_batch.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 90082ced5b4da..f990cb854123b 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -302,15 +302,16 @@ def add_request(
             self.has_allowed_token_ids.add(req_id)
             if self.allowed_token_ids_mask_cpu_tensor is None:
                 # Lazy allocation for this tensor, which can be large.
-                self.allowed_token_ids_mask = torch.ones(self.max_num_reqs,
-                                                         self.vocab_size,
-                                                         dtype=torch.bool,
-                                                         device=self.device)
-                self.allowed_token_ids_mask_cpu_tensor = torch.ones(
+                self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs,
+                                                          self.vocab_size,
+                                                          dtype=torch.bool,
+                                                          device=self.device)
+                self.allowed_token_ids_mask_cpu_tensor = torch.zeros(
                     self.max_num_reqs,
                     self.vocab_size,
                     dtype=torch.bool,
                     device="cpu")
+            self.allowed_token_ids_mask_cpu_tensor[req_index] = True
             self.allowed_token_ids_mask_cpu_tensor[req_index][
                 sampling_params.allowed_token_ids] = False
 

From 661ecf32456a1bd00ea8579b1546155cf5a154fb Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Tue, 4 Mar 2025 14:33:30 -0800
Subject: [PATCH 4/4] address comments to make sure we set the default value
 for mask correctly

Signed-off-by: Lu Fang
---
 vllm/v1/worker/gpu_input_batch.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index f990cb854123b..2fe177ea4e126 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -302,6 +302,7 @@ def add_request(
             self.has_allowed_token_ids.add(req_id)
             if self.allowed_token_ids_mask_cpu_tensor is None:
                 # Lazy allocation for this tensor, which can be large.
+                # False means we don't fill with -inf.
                 self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs,
                                                           self.vocab_size,
                                                           dtype=torch.bool,
@@ -312,6 +313,7 @@ def add_request(
                     dtype=torch.bool,
                     device="cpu")
             self.allowed_token_ids_mask_cpu_tensor[req_index] = True
+            # False means we don't fill with -inf.
             self.allowed_token_ids_mask_cpu_tensor[req_index][
                 sampling_params.allowed_token_ids] = False
 
@@ -362,7 +364,8 @@ def remove_request(self, req_id: str) -> Optional[int]:
         self.logit_bias[req_index] = None
         self.has_allowed_token_ids.discard(req_id)
         if self.allowed_token_ids_mask_cpu_tensor is not None:
-            self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(True)
+            # False means we don't fill with -inf.
+            self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False)
         return req_index
 
     def swap_states(self, i1: int, i2: int) -> None:
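
Note: the mask convention these patches settle on is that True marks a token as
disallowed (its logit gets filled with -inf) and False leaves the logit
untouched. The snippet below is a minimal standalone sketch of that convention
only; the function name apply_allowed_token_ids_mask and the tensor shapes are
illustrative and are not part of these patches or of vLLM's actual sampler code.

import torch


def apply_allowed_token_ids_mask(logits: torch.Tensor,
                                 mask: torch.Tensor) -> torch.Tensor:
    # True in the mask  -> token is disallowed, its logit becomes -inf.
    # False in the mask -> the logit is left as-is.
    return logits.masked_fill(mask, float("-inf"))


# One request, vocab size 5, only tokens {1, 3} allowed.
mask = torch.zeros(1, 5, dtype=torch.bool)  # default: nothing is masked
mask[0] = True                              # this request restricts its tokens
mask[0, [1, 3]] = False                     # keep the allowed token ids open
logits = torch.zeros(1, 5)
print(apply_allowed_token_ids_mask(logits, mask))
# tensor([[-inf, 0., -inf, 0., -inf]])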