From a5457a6424811ad50cddac376a2974545f38842e Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Mon, 3 Mar 2025 17:32:54 -0800
Subject: [PATCH 1/4] Fix allowed_token_ids for v1 Sampler

Signed-off-by: Lu Fang
---
 vllm/v1/engine/processor.py       |  7 +++++--
 vllm/v1/worker/gpu_input_batch.py | 14 +++++++-------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 3a3fc69e53e44..03f5706401cd2 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -92,10 +92,13 @@ def _validate_allowed_token_ids(
             return
         if params.allowed_token_ids is None:
             return
-        if not all(0 <= tid < self.model_config.vocab_size
+        if params.allowed_token_ids is not None and len(
+                params.allowed_token_ids) == 0:
+            raise ValueError("allowed_token_ids is not None and empty!")
+        if not all(0 <= tid < self.model_config.get_vocab_size()
                    for tid in params.allowed_token_ids):
             raise ValueError(
-                "allowed_token_ids contains out-of-vocab token id")
+                "allowed_token_ids contains out-of-vocab token id!")
 
     def process_inputs(
         self,
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index b0b218d92b927..13f3d461ee4a9 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -300,17 +300,17 @@ def add_request(
             self.has_allowed_token_ids.add(req_id)
             if self.allowed_token_ids_mask_cpu_tensor is None:
                 # Lazy allocation for this tensor, which can be large.
-                self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs,
-                                                          self.vocab_size,
-                                                          dtype=torch.bool,
-                                                          device=self.device)
-                self.allowed_token_ids_mask_cpu_tensor = torch.zeros(
+                self.allowed_token_ids_mask = torch.ones(self.max_num_reqs,
+                                                         self.vocab_size,
+                                                         dtype=torch.bool,
+                                                         device=self.device)
+                self.allowed_token_ids_mask_cpu_tensor = torch.ones(
                     self.max_num_reqs,
                     self.vocab_size,
                     dtype=torch.bool,
                     device="cpu")
             self.allowed_token_ids_mask_cpu_tensor[req_index][
-                sampling_params.allowed_token_ids] = True
+                sampling_params.allowed_token_ids] = False
 
         # Add request lora ID
         if request.lora_request:
@@ -359,7 +359,7 @@ def remove_request(self, req_id: str) -> Optional[int]:
         self.logit_bias[req_index] = None
         self.has_allowed_token_ids.discard(req_id)
         if self.allowed_token_ids_mask_cpu_tensor is not None:
-            self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False)
+            self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(True)
         return req_index
 
     def swap_states(self, i1: int, i2: int) -> None:

From 56cf0cc805c94ece34184f693ca4cf94d2f42cb2 Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Mon, 3 Mar 2025 22:11:31 -0800
Subject: [PATCH 2/4] address comments

Signed-off-by: Lu Fang
---
 vllm/v1/engine/processor.py       | 7 +++----
 vllm/v1/worker/gpu_input_batch.py | 2 ++
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 03f5706401cd2..9a400f2f12524 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -92,11 +92,10 @@ def _validate_allowed_token_ids(
             return
         if params.allowed_token_ids is None:
             return
-        if params.allowed_token_ids is not None and len(
-                params.allowed_token_ids) == 0:
+        if not params.allowed_token_ids:
             raise ValueError("allowed_token_ids is not None and empty!")
-        if not all(0 <= tid < self.model_config.get_vocab_size()
-                   for tid in params.allowed_token_ids):
+        vocab_size = self.model_config.get_vocab_size()
+        if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
             raise ValueError(
                 "allowed_token_ids contains out-of-vocab token id!")
 
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 13f3d461ee4a9..90082ced5b4da 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -199,6 +199,8 @@ def __init__(
         self.logit_bias: list[Optional[dict[int,
                                             float]]] = [None] * max_num_reqs
         self.has_allowed_token_ids: set[str] = set()
+        # NOTE(lufang): In the mask tensor, if the corresponding token is
+        # allowed, the value is False, since we use masked_fill_ to set -inf.
         self.allowed_token_ids_mask: Optional[torch.Tensor] = None
         self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None
 

From 830f784b3a119a08eb4bdc6c31241de09107e782 Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Tue, 4 Mar 2025 13:51:07 -0800
Subject: [PATCH 3/4] address comments to make sure we set the default value
 for mask correctly

Signed-off-by: Lu Fang
---
 vllm/v1/worker/gpu_input_batch.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 90082ced5b4da..f990cb854123b 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -302,15 +302,16 @@ def add_request(
             self.has_allowed_token_ids.add(req_id)
             if self.allowed_token_ids_mask_cpu_tensor is None:
                 # Lazy allocation for this tensor, which can be large.
-                self.allowed_token_ids_mask = torch.ones(self.max_num_reqs,
-                                                         self.vocab_size,
-                                                         dtype=torch.bool,
-                                                         device=self.device)
-                self.allowed_token_ids_mask_cpu_tensor = torch.ones(
+                self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs,
+                                                          self.vocab_size,
+                                                          dtype=torch.bool,
+                                                          device=self.device)
+                self.allowed_token_ids_mask_cpu_tensor = torch.zeros(
                     self.max_num_reqs,
                     self.vocab_size,
                     dtype=torch.bool,
                     device="cpu")
+            self.allowed_token_ids_mask_cpu_tensor[req_index] = True
             self.allowed_token_ids_mask_cpu_tensor[req_index][
                 sampling_params.allowed_token_ids] = False
 

From 661ecf32456a1bd00ea8579b1546155cf5a154fb Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Tue, 4 Mar 2025 14:33:30 -0800
Subject: [PATCH 4/4] address comments to make sure we set the default value
 for mask correctly

Signed-off-by: Lu Fang
---
 vllm/v1/worker/gpu_input_batch.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index f990cb854123b..2fe177ea4e126 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -302,6 +302,7 @@ def add_request(
             self.has_allowed_token_ids.add(req_id)
             if self.allowed_token_ids_mask_cpu_tensor is None:
                 # Lazy allocation for this tensor, which can be large.
+                # False means we don't fill with -inf.
                 self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs,
                                                           self.vocab_size,
                                                           dtype=torch.bool,
@@ -312,6 +313,7 @@ def add_request(
                     dtype=torch.bool,
                     device="cpu")
             self.allowed_token_ids_mask_cpu_tensor[req_index] = True
+            # False means we don't fill with -inf.
             self.allowed_token_ids_mask_cpu_tensor[req_index][
                 sampling_params.allowed_token_ids] = False
 
@@ -362,7 +364,8 @@ def remove_request(self, req_id: str) -> Optional[int]:
         self.logit_bias[req_index] = None
         self.has_allowed_token_ids.discard(req_id)
         if self.allowed_token_ids_mask_cpu_tensor is not None:
-            self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(True)
+            # False means we don't fill with -inf.
+            self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False)
         return req_index
 
     def swap_states(self, i1: int, i2: int) -> None:
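
Note: the mask convention these patches settle on is that True marks a token as
disallowed (its logit gets filled with -inf) and False leaves the logit
untouched. The snippet below is a minimal standalone sketch of that convention
only; the function name apply_allowed_token_ids_mask and the tensor shapes are
illustrative and are not part of these patches or of vLLM's actual sampler code.

import torch


def apply_allowed_token_ids_mask(logits: torch.Tensor,
                                 mask: torch.Tensor) -> torch.Tensor:
    # True in the mask  -> token is disallowed, its logit becomes -inf.
    # False in the mask -> the logit is left as-is.
    return logits.masked_fill(mask, float("-inf"))


# One request, vocab size 5, only tokens {1, 3} allowed.
mask = torch.zeros(1, 5, dtype=torch.bool)  # default: nothing is masked
mask[0] = True                              # this request restricts its tokens
mask[0, [1, 3]] = False                     # keep the allowed token ids open
logits = torch.zeros(1, 5)
print(apply_allowed_token_ids_mask(logits, mask))
# tensor([[-inf, 0., -inf, 0., -inf]])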