@@ -190,20 +190,15 @@ class Runner : public RunnerBase
190
190
int const cyclic_attention_window_size = attention_window_size;
191
191
bool const can_use_one_more_block = beam_width > 1 ;
192
192
193
- int max_blocks_per_sequence = kv_cache_block_offsets.has_value () ? kv_cache_block_offsets.value ().size (-1 ) : 0 ;
194
- int32_t const pool_index = kv_cache_block_offsets.has_value ()
195
- ? host_kv_cache_pool_mapping.value ().index ({op.mLayerIdx , 0 }).item <int32_t >()
196
- : 0 ;
197
- int32_t const layer_idx_in_cache_pool = kv_cache_block_offsets.has_value ()
198
- ? host_kv_cache_pool_mapping.value ().index ({op.mLayerIdx , 1 }).item <int32_t >()
199
- : 0 ;
200
- KVBlockArray::DataType* block_offsets = static_cast <KVBlockArray::DataType*>(kv_cache_block_offsets.has_value ()
201
- ? kv_cache_block_offsets.value ().index ({pool_index, seq_offset}).data_ptr ()
202
- : nullptr );
203
- KVBlockArray::DataType* host_block_offsets
204
- = static_cast <KVBlockArray::DataType*>(host_kv_cache_block_offsets.has_value ()
205
- ? host_kv_cache_block_offsets.value ().index ({pool_index, seq_offset}).data_ptr ()
206
- : nullptr );
193
+ int max_blocks_per_sequence = op.useKVCache () ? kv_cache_block_offsets.value ().size (-1 ) : 0 ;
194
+ int32_t const pool_index
195
+ = op.useKVCache () ? host_kv_cache_pool_mapping.value ().index ({op.mLayerIdx , 0 }).item <int32_t >() : 0 ;
196
+ int32_t const layer_idx_in_cache_pool
197
+ = op.useKVCache () ? host_kv_cache_pool_mapping.value ().index ({op.mLayerIdx , 1 }).item <int32_t >() : 0 ;
198
+ KVBlockArray::DataType* block_offsets = static_cast <KVBlockArray::DataType*>(
199
+ op.useKVCache () ? kv_cache_block_offsets.value ().index ({pool_index, seq_offset}).data_ptr () : nullptr );
200
+ KVBlockArray::DataType* host_block_offsets = static_cast <KVBlockArray::DataType*>(
201
+ op.useKVCache () ? host_kv_cache_block_offsets.value ().index ({pool_index, seq_offset}).data_ptr () : nullptr );
207
202
208
203
auto const cache_elem_size = (op.mKVCacheQuantMode .hasKvCacheQuant () ? 1 : sizeof (T));
209
204
auto const block_size = op.mTokensPerBlock * op.mNumKVHeads * op.mHeadSize ;
@@ -434,10 +429,7 @@ torch::Tensor attention(torch::Tensor q, torch::optional<torch::Tensor> k, torch
434
429
op->mKVCacheQuantMode = tensorrt_llm::common::QuantMode (uint32_t (quant_mode));
435
430
op->mUseKVCache = use_kv_cache;
436
431
op->mPagedKVCache = op->mPagedKVCache && use_kv_cache; // update mPagedKVCache based on use_kv_cache
437
- if (tokens_per_block.has_value ())
438
- {
439
- op->mTokensPerBlock = tokens_per_block.value ();
440
- }
432
+ op->mTokensPerBlock = tokens_per_block.value_or (0 );
441
433
op->mMaxContextLength = max_context_length;
442
434
op->mQScaling = q_scaling;
443
435
op->mPositionEmbeddingType
0 commit comments