From cd180ddd6d1063f016ffd1087bbeabd2f2a6820d Mon Sep 17 00:00:00 2001
From: Alex Chi
Date: Fri, 31 Jan 2025 19:04:06 -0500
Subject: [PATCH] initial support for metal

Signed-off-by: Alex Chi
---
 CMakeLists.txt                           |  2 +-
 build_and_run.sh                         |  8 ++
 collect_env.py                           |  6 ++
 requirements-metal.txt                   | 14 ++++
 setup.py                                 | 15 ++--
 vllm/attention/backends/torch_sdpa.py    | 19 ++---
 vllm/config.py                           |  2 +
 vllm/engine/arg_utils.py                 |  1 +
 vllm/model_executor/layers/activation.py | 14 ++--
 vllm/platforms/__init__.py               | 16 +++-
 vllm/platforms/interface.py              |  5 ++
 vllm/platforms/metal.py                  | 98 ++++++++++++++++++++++++
 vllm/worker/cpu_model_runner.py          |  6 +-
 vllm/worker/cpu_worker.py                |  2 +-
 14 files changed, 176 insertions(+), 32 deletions(-)
 create mode 100755 build_and_run.sh
 create mode 100644 requirements-metal.txt
 create mode 100644 vllm/platforms/metal.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c823c9ff895c3..5c0dabed7169b 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -85,7 +85,7 @@ find_package(Torch REQUIRED)
 #
 if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
     NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
-    if (VLLM_TARGET_DEVICE STREQUAL "cpu")
+    if (VLLM_TARGET_DEVICE STREQUAL "cpu" OR VLLM_TARGET_DEVICE STREQUAL "metal")
         include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
     else()
         return()
diff --git a/build_and_run.sh b/build_and_run.sh
new file mode 100755
index 0000000000000..e28c2a4197d9b
--- /dev/null
+++ b/build_and_run.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e
+export PYTORCH_ENABLE_MPS_FALLBACK=1
+export VLLM_TARGET_DEVICE=cpu
+pip uninstall vllm
+python setup.py install
+vllm serve Qwen/Qwen2.5-0.5B-Instruct --dtype float16
diff --git a/collect_env.py b/collect_env.py
index 254c19b19a5ac..02f531e11827a 100644
--- a/collect_env.py
+++ b/collect_env.py
@@ -515,6 +515,12 @@ def is_xnnpack_available():
     else:
         return "N/A"
 
+def is_mps_available():
+    if TORCH_AVAILABLE:
+        return str(torch.backends.mps.is_available())
+    else:
+        return "N/A"
+
 def get_env_vars():
     env_vars = ''
     secret_terms=('secret', 'token', 'api', 'access', 'password')
diff --git a/requirements-metal.txt b/requirements-metal.txt
new file mode 100644
index 0000000000000..5c1ae616a7270
--- /dev/null
+++ b/requirements-metal.txt
@@ -0,0 +1,14 @@
+# Common dependencies
+-r requirements-common.txt
+
+# Dependencies for CPUs
+torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
+
+# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
+torchaudio; platform_machine != "ppc64le"
+torchaudio==2.5.1; platform_machine == "ppc64le"
+
+# required for the image processor of phi3v, this must be updated alongside torch
+torchvision; platform_machine != "ppc64le"
+torchvision==0.20.1; platform_machine == "ppc64le"
+datasets # for benchmark scripts
diff --git a/setup.py b/setup.py
index 50a2392a4d83b..9892827069306 100755
--- a/setup.py
+++ b/setup.py
@@ -34,11 +34,7 @@ def load_module_from_path(module_name, path):
 
 VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
 
-if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
-    logger.warning(
-        "VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
-    VLLM_TARGET_DEVICE = "cpu"
-elif not (sys.platform.startswith("linux")
+if not (sys.platform.startswith("linux")
         or sys.platform.startswith("darwin")):
     logger.warning(
         "vLLM only supports Linux platform (including WSL) and MacOS."
@@ -390,10 +386,11 @@ def _is_openvino() -> bool:
 def _is_xpu() -> bool:
     return VLLM_TARGET_DEVICE == "xpu"
 
+def _is_metal() -> bool:
+    return VLLM_TARGET_DEVICE == "metal"
 
 def _build_custom_ops() -> bool:
-    return _is_cuda() or _is_hip() or _is_cpu()
-
+    return _is_cuda() or _is_hip() or _is_cpu() or _is_metal()
 
 def get_rocm_version():
     # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
@@ -521,6 +518,8 @@ def get_vllm_version() -> str:
             version += f"{sep}cpu"
         elif _is_xpu():
             version += f"{sep}xpu"
+        elif _is_metal():
+            version += f"{sep}metal"
     else:
         raise RuntimeError("Unknown runtime environment")
 
@@ -581,6 +580,8 @@ def _read_requirements(filename: str) -> List[str]:
         requirements = _read_requirements("requirements-cpu.txt")
     elif _is_xpu():
         requirements = _read_requirements("requirements-xpu.txt")
+    elif _is_metal():
+        requirements = _read_requirements("requirements-metal.txt")
     else:
         raise ValueError(
             "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index c3b2398b4e632..14722f5c34049 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -283,6 +283,7 @@ class TorchSDPAMetadataBuilder(AttentionMetadataBuilder[TorchSDPAMetadata]):
     def __init__(self, input_builder: ModelInputForCPUBuilder) -> None:
         self.chunked_prefill = input_builder.chunked_prefill
         self.input_builder = input_builder
+        self._device = input_builder.device
 
     def prepare(self):
         self.input_data = self.input_builder.input_data
@@ -294,7 +295,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
         prefill_query_lens = query_lens[0:input_data.num_prefills]
         slot_mapping = torch.tensor(input_data.slot_mapping,
                                     dtype=torch.long,
-                                    device="cpu")
+                                    device=self._device)
 
         # For chunked-prefill
         if self.chunked_prefill and input_data.num_prefill_tokens != 0:
@@ -302,20 +303,20 @@ def build(self, seq_lens: List[int], query_lens: List[int],
                 self.input_data.prefill_block_tables,
                 pad=0,
                 dtype=torch.int32,
-                device="cpu",
+                device=self._device,
             )
             query_lens_tensor = torch.tensor(prefill_query_lens,
                                              dtype=torch.int32,
-                                             device="cpu")
+                                             device=self._device)
             kv_lens_tensor = torch.tensor(prefill_seq_lens,
                                           dtype=torch.int32,
-                                          device="cpu")
+                                          device=self._device)
             query_start_loc = torch.zeros(input_data.num_prefills + 1,
                                           dtype=torch.int32,
-                                          device="cpu")
+                                          device=self._device)
             kv_start_loc = torch.zeros(input_data.num_prefills + 1,
                                        dtype=torch.int32,
-                                       device="cpu")
+                                       device=self._device)
             torch.cumsum(query_lens_tensor,
                          dim=0,
                          dtype=torch.int32,
@@ -338,20 +339,20 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             seq_lens_tensor = torch.tensor(
                 input_data.seq_lens[input_data.num_prefills:],
                 dtype=torch.int32,
-                device="cpu",
+                device=self._device,
             )
             block_tables = make_tensor_with_pad(
                 self.input_data.decode_block_tables,
                 pad=0,
                 dtype=torch.int32,
-                device="cpu",
+                device=self._device,
             )
         else:
             block_tables = torch.tensor([])
             seq_lens_tensor = torch.tensor(
                 input_data.seq_lens[:input_data.num_prefills],
                 dtype=torch.int32,
-                device="cpu",
+                device=self._device,
             )
 
         # For multi-modal models
diff --git a/vllm/config.py b/vllm/config.py
index 58464eae80b82..412b96cb82aea 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1569,6 +1569,8 @@ def __init__(self, device: str = "auto") -> None:
         # Some device types require processing inputs on CPU
         if self.device_type in ["neuron", "openvino"]:
             self.device = torch.device("cpu")
+        elif self.device_type in ["metal"]:
+            self.device = torch.device("mps")
         elif self.device_type in ["tpu"]:
             self.device = None
         else:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 1f203b6eaeb33..c615f95c790f1 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -38,6 +38,7 @@
     "tpu",
     "xpu",
     "hpu",
+    "metal",
 ]
 
 
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index fb9684ac1c184..1693365bcba2d 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -32,7 +32,7 @@ def __init__(self, threshold: float = 0.):
         self.threshold = threshold
         if current_platform.is_cuda_alike():
             self.op = torch.ops._C.fatrelu_and_mul
-        elif current_platform.is_cpu():
+        elif current_platform.is_cpu() or current_platform.is_metal():
             self._forward_method = self.forward_native
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
@@ -63,7 +63,7 @@ class SiluAndMul(CustomOp):
 
     def __init__(self):
         super().__init__()
-        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+        if current_platform.is_cuda_alike() or current_platform.is_cpu() or current_platform.is_metal():
             self.op = torch.ops._C.silu_and_mul
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
@@ -107,7 +107,7 @@ def __init__(self):
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
             self.op = ipex_ops.silu_and_mul
-        elif current_platform.is_cpu():
+        elif current_platform.is_cpu() or current_platform.is_metal():
             self._forward_method = self.forward_native
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
@@ -142,7 +142,7 @@ def __init__(self, approximate: str = "none"):
         self.approximate = approximate
         if approximate not in ("none", "tanh"):
             raise ValueError(f"Unknown approximate mode: {approximate}")
-        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+        if current_platform.is_cuda_alike() or current_platform.is_cpu() or current_platform.is_metal():
             if approximate == "none":
                 self.op = torch.ops._C.gelu_and_mul
             elif approximate == "tanh":
@@ -182,7 +182,7 @@ class NewGELU(CustomOp):
 
     def __init__(self):
         super().__init__()
-        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+        if current_platform.is_cuda_alike() or current_platform.is_cpu() or current_platform.is_metal():
             self.op = torch.ops._C.gelu_new
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
@@ -208,7 +208,7 @@ class FastGELU(CustomOp):
 
     def __init__(self):
         super().__init__()
-        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+        if current_platform.is_cuda_alike() or current_platform.is_cpu() or current_platform.is_metal():
             self.op = torch.ops._C.gelu_fast
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
@@ -233,7 +233,7 @@ class QuickGELU(CustomOp):
     # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90
     def __init__(self):
         super().__init__()
-        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+        if current_platform.is_cuda_alike() or current_platform.is_cpu() or current_platform.is_metal():
             self.op = torch.ops._C.gelu_quick
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index ddbdc43ca5710..8b1a508c7e6e0 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -96,15 +96,22 @@ def xpu_platform_plugin() -> Optional[str]:
     return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None
 
 
+def metal_platform_plugin() -> Optional[str]:
+    is_metal = False
+    try:
+        from importlib.metadata import version
+        import torch
+        is_metal = torch.backends.mps.is_available() and "metal" in version("vllm")
+    except Exception:
+        pass
+    return "vllm.platforms.metal.MetalPlatform" if is_metal else None
+
+
 def cpu_platform_plugin() -> Optional[str]:
     is_cpu = False
     try:
         from importlib.metadata import version
         is_cpu = "cpu" in version("vllm")
-        if not is_cpu:
-            import platform
-            is_cpu = platform.machine().lower().startswith("arm")
-
     except Exception:
         pass
 
@@ -142,6 +149,7 @@ def openvino_platform_plugin() -> Optional[str]:
     'cpu': cpu_platform_plugin,
     'neuron': neuron_platform_plugin,
     'openvino': openvino_platform_plugin,
+    'metal': metal_platform_plugin,
 }
 
 
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index f2ecec3203fb7..3fae0a6ab3ef5 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -28,6 +28,7 @@ class _Backend(enum.Enum):
     XFORMERS = enum.auto()
     ROCM_FLASH = enum.auto()
     TORCH_SDPA = enum.auto()
+    TORCH_SDPA_2 = enum.auto()
     OPENVINO = enum.auto()
     FLASHINFER = enum.auto()
     HPU_ATTN = enum.auto()
@@ -44,6 +45,7 @@ class PlatformEnum(enum.Enum):
     HPU = enum.auto()
     XPU = enum.auto()
     CPU = enum.auto()
+    METAL = enum.auto()
     NEURON = enum.auto()
     OPENVINO = enum.auto()
     OOT = enum.auto()
@@ -123,6 +125,9 @@ def is_xpu(self) -> bool:
     def is_cpu(self) -> bool:
         return self._enum == PlatformEnum.CPU
 
+    def is_metal(self) -> bool:
+        return self._enum == PlatformEnum.METAL
+
     def is_neuron(self) -> bool:
         return self._enum == PlatformEnum.NEURON
 
diff --git a/vllm/platforms/metal.py b/vllm/platforms/metal.py
new file mode 100644
index 0000000000000..47f5ea1c60d8a
--- /dev/null
+++ b/vllm/platforms/metal.py
@@ -0,0 +1,98 @@
+import os
+from typing import TYPE_CHECKING, Optional
+
+import psutil
+import torch
+
+from vllm.logger import init_logger
+
+from .interface import Platform, PlatformEnum, _Backend
+
+logger = init_logger(__name__)
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+else:
+    VllmConfig = None
+
+logger = init_logger(__name__)
+
+
+class MetalPlatform(Platform):
+    _enum = PlatformEnum.METAL
+    device_name: str = "metal"
+    device_type: str = "metal"
+    dispatch_key: str = "CPU"
+
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        return "metal"
+
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool) -> str:
+        if selected_backend != _Backend.TORCH_SDPA:
+            logger.info("Cannot use %s backend on Metal.", selected_backend)
+        logger.info("Using Torch SDPA backend.")
+        return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
+
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        return psutil.virtual_memory().total
+
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        return False
+
+    @classmethod
+    def inference_mode(cls):
+        return torch.no_grad()
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        import vllm.envs as envs
+        from vllm.utils import GiB_bytes
+        model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
+        kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
+
+        if kv_cache_space >= 0:
+            if kv_cache_space == 0:
+                cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
+                logger.warning(
+                    "Environment variable VLLM_CPU_KVCACHE_SPACE (GB) "
+                    "for CPU backend is not set, using 4 by default.")
+            else:
+                cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes  # type: ignore # noqa
+        else:
+            raise RuntimeError(
+                "Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
+                f" {kv_cache_space}, expect a positive integer value.")
+
+        scheduler_config = vllm_config.scheduler_config
+        if ((scheduler_config.chunked_prefill_enabled
+             or cache_config.enable_prefix_caching)
+                and model_config.dtype == torch.half):
+            logger.warning("Chunked-prefill on the CPU backend only does not"
+                           " support fp16 for now, cast to bf16.")
+            model_config.dtype = torch.bfloat16  # TODO supported?
+
+        parallel_config = vllm_config.parallel_config
+        if parallel_config.worker_cls == "auto":
+            parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"
+
+        assert vllm_config.device_config.device_type == "metal"
+
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        logger.warning("Pin memory is not supported on Metal.")
+        return False
+
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 4b429b67b36f8..f975d01732cd1 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -179,16 +179,16 @@ def build(self) -> ModelInputForCPU:
         input_data = self.input_data
         input_tokens = torch.tensor(input_data.input_tokens,
                                     dtype=torch.long,
-                                    device="cpu")
+                                    device=self.device)
         input_positions = torch.tensor(
             input_data.input_positions
             if not any(input_data.input_mrope_positions) else
             input_data.input_mrope_positions,
             dtype=torch.long,
-            device="cpu")
+            device=self.device)
         token_type_ids = torch.tensor(input_data.token_type_ids,
                                       dtype=torch.long,
-                                      device="cpu") \
+                                      device=self.device) \
                                         if input_data.token_type_ids else None
 
         # For multi-modal models
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 3e5fcf11b9e16..868b8462a85b7 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -35,7 +35,7 @@ class CPUCacheEngine:
     def __init__(self, cache_config: CacheConfig, model_config: ModelConfig,
                  parallel_config: ParallelConfig,
                  device_config: DeviceConfig) -> None:
-        assert device_config.device_type == "cpu"
+        assert device_config.device_type == "cpu" or device_config.device_type == "metal"
         self.cache_config = cache_config
         self.model_config = model_config
         self.parallel_config = parallel_config