PygmalionAI · AlpinDale · Feb 7, 2024 · Mar 23, 2024 · Mar 23, 2024 · Mar 23, 2024
diff --git a/aphrodite/common/sequence.py b/aphrodite/common/sequence.py
@@ -142,6 +142,7 @@ def __init__(
         prompt: str,
         prompt_token_ids: List[int],
         block_size: int,
+        is_encoder_decoder: bool,
         lora_request: Optional[LoRARequest] = None,
     ) -> None:
         self.seq_id = seq_id
@@ -154,8 +155,20 @@ def __init__(
         self.output_text = ""
 
         self.logical_token_blocks: List[LogicalTokenBlock] = []
+        initial_token_ids = prompt_token_ids
+        if is_encoder_decoder:
+            # Pad the prompt to a multiple of block_size to separate prompt
+            # and generated tokens for encoder-decoder models.
+            num_prompt_blocks = (len(prompt_token_ids) + block_size -
+                                 1) // block_size
+            padded_prompt_len = num_prompt_blocks * block_size
+            initial_token_ids = prompt_token_ids + [0] * (
+                padded_prompt_len - len(prompt_token_ids))
+            # Also append decoder_start_token_id (hard-coded to 0 here).
+            initial_token_ids.append(0)
+
         # Initialize the logical token blocks with the prompt token ids.
-        self._append_tokens_to_blocks(prompt_token_ids)
+        self._append_tokens_to_blocks(initial_token_ids)
         self.status = SequenceStatus.WAITING
 
         # Used for incremental detokenization

diff --git a/aphrodite/endpoints/llm.py b/aphrodite/endpoints/llm.py
@@ -156,6 +156,10 @@ def generate(
         if sampling_params is None:
             # Use default sampling params.
             sampling_params = SamplingParams()
+
+        if self.llm_engine.is_encoder_decoder:
+            assert (self.llm_engine.cache_config.context_shift is None
+                    ), "Encoder-decoder models do not support context shift."
 
         # Add requests to the engine.
         num_requests = len(prompts) if prompts is not None else len(

diff --git a/aphrodite/engine/aphrodite_engine.py b/aphrodite/engine/aphrodite_engine.py
@@ -125,6 +125,9 @@ def __init__(
             self.stat_logger = StatLogger(
                 local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
                 labels=dict(model_name=model_config.model))
+
+        self.is_encoder_decoder = getattr(self.model_config.hf_config,
+                                          "is_encoder_decoder", False)
 
     def get_tokenizer_for_seq(self, sequence: Sequence):
         return self.tokenizer.get_lora_tokenizer(sequence.lora_request)
@@ -462,7 +465,7 @@ def add_request(
         block_size = self.cache_config.block_size
         seq_id = next(self.seq_counter)
         seq = Sequence(seq_id, prompt, prompt_token_ids, block_size,
-                       lora_request)
+                       self.is_encoder_decoder, lora_request)
 
         # Create the sequence group.
         seq_group = SequenceGroup(request_id, [seq], sampling_params,

diff --git a/aphrodite/modeling/layers/attention.py b/aphrodite/modeling/layers/attention.py
@@ -226,17 +226,12 @@ def forward(
                 )
 
         else:
-            # Decoding run.
-            output = _paged_attention(
-                query,
-                key_cache,
-                value_cache,
-                input_metadata,
-                self.num_kv_heads,
-                self.scale,
-                self.alibi_slopes,
-                kv_quant_param,
-            )
+            # Decoding run; args follow paged_attention's positional order.
+            output = paged_attention(
+                query, key_cache, value_cache, input_metadata.block_tables,
+                input_metadata.context_lens, input_metadata.max_context_len,
+                self.num_kv_heads, self.scale, self.alibi_slopes,
+                None, input_metadata.kv_cache_dtype, kv_quant_param)
 
         # Reshape the output tensor.
         return output.view(batch_size, seq_len, hidden_size)
@@ -276,32 +271,35 @@ def _make_alibi_bias(
     return attn_bias
 
 
-def _paged_attention(
+def paged_attention(
     query: torch.Tensor,
     key_cache: torch.Tensor,
     value_cache: torch.Tensor,
-    input_metadata: InputMetadata,
+    block_tables: torch.Tensor,
+    context_lens: torch.Tensor,
+    max_context_len: int,
     num_kv_heads: int,
     scale: float,
     alibi_slopes: Optional[torch.Tensor],
+    custom_bias: Optional[torch.Tensor],
+    kv_cache_dtype: torch.dtype,
     kv_quant_param: List[float],
 ) -> torch.Tensor:
     output = torch.empty_like(query)
 
     block_size = value_cache.shape[3]
     num_seqs, num_heads, head_size = query.shape
-    max_num_partitions = (
-        (input_metadata.max_context_len + _PARTITION_SIZE - 1) //
-        _PARTITION_SIZE)
+    max_num_partitions = ((max_context_len + _PARTITION_SIZE - 1) //
+                          _PARTITION_SIZE)
     # NOTE: We use a simple heuristic to decide whether to use
     # PagedAttention V1 or V2. If the number of partitions is 1, we use
     # V1 to avoid the overhead of reduction. Also, if the number of
     # sequences or heads is large, we use V1 since there is enough work
     # to parallelize.
     # TODO: Tune this heuristic.
     # For context len > 8192, use V2 kernel to avoid shared memory shortage.
-    use_v1 = input_metadata.max_context_len <= 8192 and (
-        max_num_partitions == 1 or num_seqs * num_heads > 512)
+    use_v1 = max_context_len <= 8192 and (max_num_partitions == 1
+                                          or num_seqs * num_heads > 512)
     if use_v1:
         # Run PagedAttention V1.
         ops.paged_attention_v1(
@@ -311,12 +309,13 @@ def _paged_attention(
             value_cache,
             num_kv_heads,
             scale,
-            input_metadata.block_tables,
-            input_metadata.context_lens,
+            block_tables,
+            context_lens,
             block_size,
-            input_metadata.max_context_len,
+            max_context_len,
             alibi_slopes,
-            input_metadata.kv_cache_dtype,
+            custom_bias,
+            kv_cache_dtype,
             *kv_quant_param,
         )
     else:
@@ -343,12 +342,13 @@ def _paged_attention(
             value_cache,
             num_kv_heads,
             scale,
-            input_metadata.block_tables,
-            input_metadata.context_lens,
+            block_tables,
+            context_lens,
             block_size,
-            input_metadata.max_context_len,
+            max_context_len,
             alibi_slopes,
-            input_metadata.kv_cache_dtype,
+            custom_bias,
+            kv_cache_dtype,
             *kv_quant_param,
         )
     return output