
Commit bd371f8

refactor: Resolve comments for Python code
Simplify no cache attention metadata preparation and streamline related attributes in TrtllmAttentionMetadata. Removed the private method for converting to no cache attention metadata and integrated its logic into the prepare method. Updated the test for BERT sequence classification to reflect these changes and to ensure proper handling of attention metadata.

Signed-off-by: Qixiang Lin <[email protected]>
1 parent f23e17a commit bd371f8
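For orientation, here is a condensed sketch of where the no cache handling now lives, assembled from the trtllm.py diff below; it abbreviates TrtllmAttentionMetadata.prepare() rather than quoting it verbatim, and the trailing comment stands in for the unchanged remainder of the method.

    def prepare(self) -> None:
        if not self.is_dummy_attention and self.kv_cache_manager is None:
            # No KV cache manager: run as no cache attention.
            assert self._max_seq_len_storage is not None, "max_seq_len should be set for no cache attention"
            self.kv_cache_params = KVCacheParams(use_cache=False)
            self.prompt_lens = self.context_lens      # the rest of prepare() requires this
            self.kv_cache_block_offsets = None        # params consumed by wrapper.plan()
            self.host_kv_cache_block_offsets = None
            self.block_ids_per_seq = None
        # ... the existing prepare() logic (prompt_lens tensor, kv_lens checks, etc.) follows here ...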

2 files changed: +28, -41 lines

Diff for: tensorrt_llm/_torch/attention_backend/trtllm.py (+18, -23)

@@ -205,9 +205,9 @@ def plan(
            host_context_lengths (torch.Tensor): Same as context_lengths, but on CPU.
            host_request_types (torch.Tensor): The tensor that indicates whether a request is in context or generation phase, with shape (batch_size) on CPU.
            kv_cache_block_offsets (torch.Tensor): The offsets to the blocks inside KV cache pools on GPU, its shape is (num_pools, max_batch_size * max_beam_width, 2, max_blocks_per_sequence), one for each block. If kv_cache_block_offsets, host_kv_cache_block_offsets, host_kv_cache_pool_pointers, host_kv_cache_pool_mapping are all None, the attention will be no cache attention.
-           host_kv_cache_block_offsets (torch.Tensor): Same as kv_cache_block_offsets, but on CPU. If kv_cache_block_offsets, host_kv_cache_block_offsets, host_kv_cache_pool_pointers, host_kv_cache_pool_mapping are all None, the attention will be no cache attention.
-           host_kv_cache_pool_pointers (torch.Tensor): The pointers to the KV cache pools on CPU, its shape is (num_pools, 2), one for primary pool in GPU memory, one for secondary pool in CPU memory. If kv_cache_block_offsets, host_kv_cache_block_offsets, host_kv_cache_pool_pointers, host_kv_cache_pool_mapping are all None, the attention will be no cache attention.
-           host_kv_cache_pool_mapping (torch.Tensor): The index of the pool used by each attention layer on CPU, its shape is (num_local_attention_layers). The local attention layers mean all attention layers in the current PP stage in the pipeline parallelism case. If kv_cache_block_offsets, host_kv_cache_block_offsets, host_kv_cache_pool_pointers, host_kv_cache_pool_mapping are all None, the attention will be no cache attention.
+           host_kv_cache_block_offsets (torch.Tensor): Same as kv_cache_block_offsets, but on CPU.
+           host_kv_cache_pool_pointers (torch.Tensor): The pointers to the KV cache pools on CPU, its shape is (num_pools, 2), one for primary pool in GPU memory, one for secondary pool in CPU memory.
+           host_kv_cache_pool_mapping (torch.Tensor): The index of the pool used by each attention layer on CPU, its shape is (num_local_attention_layers). The local attention layers mean all attention layers in the current PP stage in the pipeline parallelism case.
            workspace (torch.Tensor): An optional workspace tensor on GPU.
            cache_indirection (torch.Tensor): A tensor for beam search on GPU, its shape is (batch_size, beam_width, max_seqlen), for a sequence si, a beam bi and a token ti, the element cache_indirection[si][bi][ti] is an integer between 0 and beam_width-1 that indicates which path in the beam to read the K and V elements from in the KV cache.
            kv_scale_orig_quant (torch.Tensor): The tensor to store the scaling factor for quantization to INT8/FP8 in the KV cache, with shape (1) on GPU.

@@ -523,7 +523,21 @@ def __post_init__(self) -> None:
    def prepare(self) -> None:

        if not self.is_dummy_attention and self.kv_cache_manager is None:
-           self._as_no_cache_attention_metadata()
+           # Convert the attention metadata to a TRT-LLM no cache attention metadata.
+           assert self.kv_cache_manager is None, "no cache attention should not have KV cache manager"
+           assert self._max_seq_len_storage is not None, "max_seq_len should be set for no cache attention"
+
+           # setting kv cache params
+           self.kv_cache_params = KVCacheParams(use_cache=False, )
+
+           # trtllm attn metadata prepare() requires this
+           self.prompt_lens = self.context_lens
+
+           # set params that are used in wrapper.plan()
+           self.kv_cache_block_offsets = None
+           self.host_kv_cache_block_offsets = None
+           self.block_ids_per_seq = None
+
        prompt_lens = torch.tensor(
            self.prompt_lens,
            dtype=torch.int,

@@ -579,25 +593,6 @@ def prepare(self) -> None:
        assert self.kv_lens[:self.num_seqs].max(
        ) <= self.kv_cache_manager.max_seq_len, f"Please set max_seq_len to at least {self.kv_lens[:self.num_seqs].max()} for kv cache manager."

-   def _as_no_cache_attention_metadata(self) -> None:
-       """
-       Convert the attention metadata to a TRT-LLM no cache attention metadata.
-       This is a private method and should not be directly called.
-       """
-       assert self.kv_cache_manager is None, "no cache attention should not have KV cache manager"
-       assert self._max_seq_len_storage is not None, "max_seq_len should be set for no cache attention"
-
-       # setting kv cache params
-       self.kv_cache_params = KVCacheParams(use_cache=False, )
-
-       # trtllm attn metadata prepare() requires this
-       self.prompt_lens = self.context_lens
-
-       # set params that are used in wrapper.plan()
-       self.kv_cache_block_offsets = None
-       self.host_kv_cache_block_offsets = None
-       self.block_ids_per_seq = None
-

class TrtllmAttention(AttentionBackend[TrtllmAttentionMetadata]):

Diff for: tests/unittest/_torch/modeling/test_modeling_bert.py (+10, -18)

@@ -105,21 +105,16 @@ def test_bert_allclose_to_hf(self, scenario: Scenario):
        # Fill the metadata for tllm attn
        request_ids = [1]
        prompt_lens = [input_ids.size(-1)]
-       kwargs = {
-           "max_num_requests": 1,
-           "max_num_tokens": 8192,
-           "kv_cache_manager": None,
-           "request_ids": request_ids,
-           "prompt_lens": prompt_lens,
-       }
-       # if metadata_cls is TrtllmAttentionMetadata:
-       # kwargs["is_no_cache"] = True
-       # kwargs["max_seq_len"] = input_ids.size(-1)
-
-       attn_metadata = metadata_cls(**kwargs)
-       attn_metadata.seq_lens = torch.tensor([input_ids.size(-1)],
-                                             dtype=torch.int)
-       attn_metadata.num_contexts = 1
+
+       attn_metadata = metadata_cls(
+           max_num_requests=1,
+           max_num_tokens=8192,
+           kv_cache_manager=None,
+           request_ids=request_ids,
+           prompt_lens=prompt_lens,
+           seq_lens=torch.tensor([input_ids.size(-1)], dtype=torch.int),
+           num_contexts=1,
+       )
        attn_metadata.max_seq_len = input_ids.size(-1)
        attn_metadata.prepare()

@@ -129,9 +124,6 @@ def test_bert_allclose_to_hf(self, scenario: Scenario):

        # Run inference
        with torch.inference_mode():
-           if backend == 'TRTLLM':
-               attn_metadata.prepare()
-           #NOTE:attn_metadata.prepare is not needed for no cache case
            # TRT-LLM model forward
            tllm_outputs = tllm_model(
                input_ids=input_ids,

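With these two changes, setting up no cache attention is a single construct-and-prepare step. Below is a minimal sketch of that flow, mirroring the updated test above; the import path is assumed from the file changed in this commit, and the token tensor is a stand-in for real input_ids.

    import torch

    # Assumed import path, based on tensorrt_llm/_torch/attention_backend/trtllm.py above.
    from tensorrt_llm._torch.attention_backend.trtllm import TrtllmAttentionMetadata

    input_ids = torch.randint(0, 1000, (1, 32))  # stand-in (1, seq_len) token tensor
    seq_len = input_ids.size(-1)

    attn_metadata = TrtllmAttentionMetadata(
        max_num_requests=1,
        max_num_tokens=8192,
        kv_cache_manager=None,  # no KV cache manager -> the no cache attention path in prepare()
        request_ids=[1],
        prompt_lens=[seq_len],
        seq_lens=torch.tensor([seq_len], dtype=torch.int),
        num_contexts=1,
    )
    attn_metadata.max_seq_len = seq_len  # no cache attention asserts that max_seq_len is set
    attn_metadata.prepare()              # a single prepare() now covers the no cache case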