Adding new ROCm Triton flash attention kernel #4

Closed. Wants to merge 8 commits.
14 changes: 14 additions & 0 deletions Dockerfile.rocm
@@ -26,6 +26,9 @@ ARG BUILD_FA="1"
# whether to build cupy on rocm
ARG BUILD_CUPY="1"

# whether to build triton on rocm
ARG BUILD_TRITON="1"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y

@@ -95,6 +98,17 @@ RUN if [ "$BUILD_CUPY" = "1" ]; then \
&& cd ..; \
fi

# build triton
RUN if [ "$BUILD_TRITON" = "1"]; then \
jpvillam-amd marked this conversation as resolved.
Show resolved Hide resolved
mkdir -p libs \
&& cd libs \
&& pip uninstall -y triton \
&& git clone https://github.com/ROCmSoftwarePlatform/triton.git \
&& cd triton/python \
&& pip3 install -e . \
&& cd ../..; \
fi

COPY ./ /app/vllm

RUN python3 -m pip install --upgrade pip
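
The Triton build is controlled by the new BUILD_TRITON build argument; a minimal usage sketch, assuming the repository root as build context (the image tags are illustrative, not part of this PR):

# Build the ROCm image with the Triton fork and kernel included.
docker build -f Dockerfile.rocm --build-arg BUILD_TRITON=1 -t vllm-rocm .

# Build without Triton (any value other than "1" skips the block above).
docker build -f Dockerfile.rocm --build-arg BUILD_TRITON=0 -t vllm-rocm-no-triton .
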
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/attention/attention.py
@@ -30,7 +30,7 @@ def __init__(
sliding_window: Optional[int] = None,
) -> None:
super().__init__()
-if (not is_hip() and torch.cuda.get_device_capability()[0] >= 8 and
+if (torch.cuda.get_device_capability()[0] >= 8 and
torch.get_default_dtype() in (torch.float16, torch.bfloat16)):
# Ampere or later NVIDIA GPUs.
# NOTE(woosuk): FlashAttention does not support FP32.
Expand Down
40 changes: 30 additions & 10 deletions vllm/model_executor/layers/attention/backends/flash_attn.py
@@ -2,12 +2,21 @@
from typing import List, Optional

# NOTE(woosuk): This imports flash_attn under vllm/thirdparty_files/.
-from flash_attn import flash_attn_func
+from vllm.utils import is_hip
+try:
+    from flash_attn import flash_attn_func
+except ImportError:
+    if is_hip():
+        pass
+    else:
+        raise

import torch

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.attention.ops.paged_attn import (
PagedAttentionImpl)
+from vllm.model_executor.layers.attention.ops.flash_attention_triton import attention

Review comment:

I have some questions: can flash_attn_func and the Triton FA kernel co-exist, or should we allow users to install both at the same time in the Dockerfile? Also, what are the steps to validate this PR?

Reply:

Some background:

flash_attn_func is Tri Dao's "reference" implementation of Flash Attention. It calls the cutlass version in the backend.

If is_hip(), flash_attn_func calls the CK version in the backend.

The cutlass version, I believe, is a pip package, so one can just pip install flash-attention. For CK, I think we need to install from our fork, but I'm a little less familiar with that.

Triton is completely separate and does not intersect with either the flash_attn_func front end or the cutlass/CK backends. It requires a Triton install, plus the kernel, which is available in this PR.

So yes, both can co-exist, but only one is needed.
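
A rough sketch of how the result could be checked, assuming an image built from Dockerfile.rocm with BUILD_TRITON=1 (the image name matches the illustrative build command earlier; the device flags are the usual ROCm docker options, not something this PR prescribes):

# Start a shell in the ROCm image.
docker run -it --rm --device=/dev/kfd --device=/dev/dri vllm-rocm bash

# Inside the container: the Triton fork and the new kernel module should import cleanly.
python3 -c "import triton; print(triton.__version__)"
python3 -c "from vllm.model_executor.layers.attention.ops.flash_attention_triton import attention"

# flash_attn remains optional on ROCm; the guarded import in this file tolerates its absence.
python3 -c "from flash_attn import flash_attn_func" || echo "flash_attn not installed (tolerated on ROCm)"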



class FlashAttentionBackend:
@@ -86,15 +95,26 @@ def forward(
query = query.unflatten(0, (batch_size, seq_len))
key = key.unflatten(0, (batch_size, seq_len))
value = value.unflatten(0, (batch_size, seq_len))
-output = flash_attn_func(
-    query,
-    key,
-    value,
-    softmax_scale=self.scale,
-    causal=True,
-    window_size=self.sliding_window,
-    alibi_slopes=self.alibi_slopes,
-)
jpvillam-amd (Author), Mar 19, 2024: Add a flag to skip.
+if is_hip():
+    output, _ = attention(
+        query,
+        key,
+        value,
+        None,
+        input_metadata,
+        True,
+        self.scale,
+    )
+else:
+    output = flash_attn_func(
+        query,
+        key,
+        value,
+        softmax_scale=self.scale,
+        causal=True,
+        window_size=self.sliding_window,
+        alibi_slopes=self.alibi_slopes,
+    )
else:
# prefix-enabled attention
output = PagedAttentionImpl.forward_prefix(