From cd180ddd6d1063f016ffd1087bbeabd2f2a6820d Mon Sep 17 00:00:00 2001
From: Alex Chi
Date: Fri, 31 Jan 2025 19:04:06 -0500
Subject: [PATCH] initial support for metal

Signed-off-by: Alex Chi
---
 CMakeLists.txt                           |  2 +-
 build_and_run.sh                         |  8 ++
 collect_env.py                           |  6 ++
 requirements-metal.txt                   | 14 ++++
 setup.py                                 | 15 ++--
 vllm/attention/backends/torch_sdpa.py    | 19 ++---
 vllm/config.py                           |  2 +
 vllm/engine/arg_utils.py                 |  1 +
 vllm/model_executor/layers/activation.py | 14 ++--
 vllm/platforms/__init__.py               | 16 +++-
 vllm/platforms/interface.py              |  5 ++
 vllm/platforms/metal.py                  | 98 ++++++++++++++++++++++++
 vllm/worker/cpu_model_runner.py          |  6 +-
 vllm/worker/cpu_worker.py                |  2 +-
 14 files changed, 176 insertions(+), 32 deletions(-)
 create mode 100755 build_and_run.sh
 create mode 100644 requirements-metal.txt
 create mode 100644 vllm/platforms/metal.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c823c9ff895c3..5c0dabed7169b 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -85,7 +85,7 @@ find_package(Torch REQUIRED)
 #
 if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
     NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
-    if (VLLM_TARGET_DEVICE STREQUAL "cpu")
+    if (VLLM_TARGET_DEVICE STREQUAL "cpu" OR VLLM_TARGET_DEVICE STREQUAL "metal")
         include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
     else()
         return()
diff --git a/build_and_run.sh b/build_and_run.sh
new file mode 100755
index 0000000000000..e28c2a4197d9b
--- /dev/null
+++ b/build_and_run.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+set -e
+export PYTORCH_ENABLE_MPS_FALLBACK=1
+export VLLM_TARGET_DEVICE=cpu
+pip uninstall vllm
+python setup.py install
+vllm serve Qwen/Qwen2.5-0.5B-Instruct --dtype float16
diff --git a/collect_env.py b/collect_env.py
index 254c19b19a5ac..02f531e11827a 100644
--- a/collect_env.py
+++ b/collect_env.py
@@ -515,6 +515,12 @@ def is_xnnpack_available():
     else:
         return "N/A"
 
+def is_mps_available():
+    if TORCH_AVAILABLE:
+        return str(torch.backends.mps.is_available())
+    else:
+        return "N/A"
+
 def get_env_vars():
     env_vars = ''
     secret_terms=('secret', 'token', 'api', 'access', 'password')
diff --git a/requirements-metal.txt b/requirements-metal.txt
new file mode 100644
index 0000000000000..5c1ae616a7270
--- /dev/null
+++ b/requirements-metal.txt
@@ -0,0 +1,14 @@
+# Common dependencies
+-r requirements-common.txt
+
+# Dependencies for CPUs
+torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
+
+# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
+torchaudio; platform_machine != "ppc64le"
+torchaudio==2.5.1; platform_machine == "ppc64le"
+
+# required for the image processor of phi3v, this must be updated alongside torch
+torchvision; platform_machine != "ppc64le"
+torchvision==0.20.1; platform_machine == "ppc64le"
+datasets # for benchmark scripts
diff --git a/setup.py b/setup.py
index 50a2392a4d83b..9892827069306 100755
--- a/setup.py
+++ b/setup.py
@@ -34,11 +34,7 @@ def load_module_from_path(module_name, path):
 
 VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
 
-if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
-    logger.warning(
-        "VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
-    VLLM_TARGET_DEVICE = "cpu"
-elif not (sys.platform.startswith("linux")
+if not (sys.platform.startswith("linux")
         or sys.platform.startswith("darwin")):
     logger.warning(
         "vLLM only supports Linux platform (including WSL) and MacOS."
@@ -390,10 +386,11 @@ def _is_openvino() -> bool:
 def _is_xpu() -> bool:
     return VLLM_TARGET_DEVICE == "xpu"
 
+def _is_metal() -> bool:
+    return VLLM_TARGET_DEVICE == "metal"
 
 def _build_custom_ops() -> bool:
-    return _is_cuda() or _is_hip() or _is_cpu()
-
+    return _is_cuda() or _is_hip() or _is_cpu() or _is_metal()
 
 def get_rocm_version():
     # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
@@ -521,6 +518,8 @@ def get_vllm_version() -> str:
             version += f"{sep}cpu"
         elif _is_xpu():
             version += f"{sep}xpu"
+        elif _is_metal():
+            version += f"{sep}metal"
     else:
         raise RuntimeError("Unknown runtime environment")
 
@@ -581,6 +580,8 @@ def _read_requirements(filename: str) -> List[str]:
         requirements = _read_requirements("requirements-cpu.txt")
     elif _is_xpu():
         requirements = _read_requirements("requirements-xpu.txt")
+    elif _is_metal():
+        requirements = _read_requirements("requirements-metal.txt")
     else:
         raise ValueError(
             "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index c3b2398b4e632..14722f5c34049 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -283,6 +283,7 @@ class TorchSDPAMetadataBuilder(AttentionMetadataBuilder[TorchSDPAMetadata]):
     def __init__(self, input_builder: ModelInputForCPUBuilder) -> None:
         self.chunked_prefill = input_builder.chunked_prefill
         self.input_builder = input_builder
+        self._device = input_builder.device
 
     def prepare(self):
         self.input_data = self.input_builder.input_data
@@ -294,7 +295,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
         prefill_query_lens = query_lens[0:input_data.num_prefills]
         slot_mapping = torch.tensor(input_data.slot_mapping,
                                     dtype=torch.long,
-                                    device="cpu")
+                                    device=self._device)
 
         # For chunked-prefill
         if self.chunked_prefill and input_data.num_prefill_tokens != 0:
@@ -302,20 +303,20 @@ def build(self, seq_lens: List[int], query_lens: List[int],
                 self.input_data.prefill_block_tables,
                 pad=0,
                 dtype=torch.int32,
-                device="cpu",
+                device=self._device,
             )
             query_lens_tensor = torch.tensor(prefill_query_lens,
                                              dtype=torch.int32,
-                                             device="cpu")
+                                             device=self._device)
             kv_lens_tensor = torch.tensor(prefill_seq_lens,
                                           dtype=torch.int32,
-                                          device="cpu")
+                                          device=self._device)
             query_start_loc = torch.zeros(input_data.num_prefills + 1,
                                           dtype=torch.int32,
-                                          device="cpu")
+                                          device=self._device)
             kv_start_loc = torch.zeros(input_data.num_prefills + 1,
                                        dtype=torch.int32,
-                                       device="cpu")
+                                       device=self._device)
             torch.cumsum(query_lens_tensor,
                          dim=0,
                          dtype=torch.int32,
@@ -338,20 +339,20 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             seq_lens_tensor = torch.tensor(
                 input_data.seq_lens[input_data.num_prefills:],
                 dtype=torch.int32,
-                device="cpu",
+                device=self._device,
             )
             block_tables = make_tensor_with_pad(
                 self.input_data.decode_block_tables,
                 pad=0,
                 dtype=torch.int32,
-                device="cpu",
+                device=self._device,
             )
         else:
             block_tables = torch.tensor([])
             seq_lens_tensor = torch.tensor(
                 input_data.seq_lens[:input_data.num_prefills],
                 dtype=torch.int32,
-                device="cpu",
+                device=self._device,
             )
 
         # For multi-modal models
diff --git a/vllm/config.py b/vllm/config.py
index 58464eae80b82..412b96cb82aea 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1569,6 +1569,8 @@ def __init__(self, device: str = "auto") -> None:
         # Some device types require processing inputs on CPU
         if self.device_type in ["neuron", "openvino"]:
             self.device = torch.device("cpu")
+        elif self.device_type in ["metal"]:
+            self.device = torch.device("mps")
         elif self.device_type in ["tpu"]:
             self.device = None
         else:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 1f203b6eaeb33..c615f95c790f1 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -38,6 +38,7 @@
     "tpu",
     "xpu",
     "hpu",
+    "metal",
 ]
 
 
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index fb9684ac1c184..1693365bcba2d 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -32,7 +32,7 @@ def __init__(self, threshold: float = 0.):
         self.threshold = threshold
         if current_platform.is_cuda_alike():
             self.op = torch.ops._C.fatrelu_and_mul
-        elif current_platform.is_cpu():
+        elif current_platform.is_cpu() or current_platform.is_metal():
             self._forward_method = self.forward_native
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
@@ -63,7 +63,7 @@ class SiluAndMul(CustomOp):
 
     def __init__(self):
         super().__init__()
-        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+        if current_platform.is_cuda_alike() or current_platform.is_cpu() or current_platform.is_metal():
             self.op = torch.ops._C.silu_and_mul
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
@@ -107,7 +107,7 @@ def __init__(self):
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
             self.op = ipex_ops.silu_and_mul
-        elif current_platform.is_cpu():
+        elif current_platform.is_cpu() or current_platform.is_metal():
             self._forward_method = self.forward_native
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
@@ -142,7 +142,7 @@ def __init__(self, approximate: str = "none"):
         self.approximate = approximate
         if approximate not in ("none", "tanh"):
             raise ValueError(f"Unknown approximate mode: {approximate}")
-        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+        if current_platform.is_cuda_alike() or current_platform.is_cpu() or current_platform.is_metal():
             if approximate == "none":
                 self.op = torch.ops._C.gelu_and_mul
             elif approximate == "tanh":
@@ -182,7 +182,7 @@ class NewGELU(CustomOp):
 
     def __init__(self):
         super().__init__()
-        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+        if current_platform.is_cuda_alike() or current_platform.is_cpu() or current_platform.is_metal():
             self.op = torch.ops._C.gelu_new
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
@@ -208,7 +208,7 @@ class FastGELU(CustomOp):
 
     def __init__(self):
         super().__init__()
-        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+        if current_platform.is_cuda_alike() or current_platform.is_cpu() or current_platform.is_metal():
             self.op = torch.ops._C.gelu_fast
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
@@ -233,7 +233,7 @@ class QuickGELU(CustomOp):
     # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90
     def __init__(self):
         super().__init__()
-        if current_platform.is_cuda_alike() or current_platform.is_cpu():
+        if current_platform.is_cuda_alike() or current_platform.is_cpu() or current_platform.is_metal():
             self.op = torch.ops._C.gelu_quick
         elif current_platform.is_xpu():
             from vllm._ipex_ops import ipex_ops
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index ddbdc43ca5710..8b1a508c7e6e0 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -96,15 +96,22 @@ def xpu_platform_plugin() -> Optional[str]:
     return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None
 
 
+def metal_platform_plugin() -> Optional[str]:
+    is_metal = False
+    try:
+        from importlib.metadata import version
+        import torch
+        is_metal = torch.backends.mps.is_available() and "metal" in version("vllm")
+    except Exception:
+        pass
+    return "vllm.platforms.metal.MetalPlatform" if is_metal else None
+
+
 def cpu_platform_plugin() -> Optional[str]:
     is_cpu = False
     try:
         from importlib.metadata import version
         is_cpu = "cpu" in version("vllm")
-        if not is_cpu:
-            import platform
-            is_cpu = platform.machine().lower().startswith("arm")
-
     except Exception:
         pass
 
@@ -142,6 +149,7 @@ def openvino_platform_plugin() -> Optional[str]:
     'cpu': cpu_platform_plugin,
     'neuron': neuron_platform_plugin,
     'openvino': openvino_platform_plugin,
+    'metal': metal_platform_plugin,
 }
 
 
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index f2ecec3203fb7..3fae0a6ab3ef5 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -28,6 +28,7 @@ class _Backend(enum.Enum):
     XFORMERS = enum.auto()
     ROCM_FLASH = enum.auto()
     TORCH_SDPA = enum.auto()
+    TORCH_SDPA_2 = enum.auto()
     OPENVINO = enum.auto()
     FLASHINFER = enum.auto()
     HPU_ATTN = enum.auto()
@@ -44,6 +45,7 @@ class PlatformEnum(enum.Enum):
     HPU = enum.auto()
     XPU = enum.auto()
     CPU = enum.auto()
+    METAL = enum.auto()
     NEURON = enum.auto()
     OPENVINO = enum.auto()
     OOT = enum.auto()
@@ -123,6 +125,9 @@ def is_xpu(self) -> bool:
     def is_cpu(self) -> bool:
         return self._enum == PlatformEnum.CPU
 
+    def is_metal(self) -> bool:
+        return self._enum == PlatformEnum.METAL
+
     def is_neuron(self) -> bool:
         return self._enum == PlatformEnum.NEURON
 
diff --git a/vllm/platforms/metal.py b/vllm/platforms/metal.py
new file mode 100644
index 0000000000000..47f5ea1c60d8a
--- /dev/null
+++ b/vllm/platforms/metal.py
@@ -0,0 +1,98 @@
+import os
+from typing import TYPE_CHECKING, Optional
+
+import psutil
+import torch
+
+from vllm.logger import init_logger
+
+from .interface import Platform, PlatformEnum, _Backend
+
+logger = init_logger(__name__)
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+else:
+    VllmConfig = None
+
+logger = init_logger(__name__)
+
+
+class MetalPlatform(Platform):
+    _enum = PlatformEnum.METAL
+    device_name: str = "metal"
+    device_type: str = "metal"
+    dispatch_key: str = "CPU"
+
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
+        return "metal"
+
+    @classmethod
+    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool) -> str:
+        if selected_backend != _Backend.TORCH_SDPA:
+            logger.info("Cannot use %s backend on Metal.", selected_backend)
+        logger.info("Using Torch SDPA backend.")
+        return "vllm.attention.backends.torch_sdpa.TorchSDPABackend"
+
+    @classmethod
+    def get_device_total_memory(cls, device_id: int = 0) -> int:
+        return psutil.virtual_memory().total
+
+    @classmethod
+    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
+        return False
+
+    @classmethod
+    def inference_mode(cls):
+        return torch.no_grad()
+
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        import vllm.envs as envs
+        from vllm.utils import GiB_bytes
+        model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
+        kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
+
+        if kv_cache_space >= 0:
+            if kv_cache_space == 0:
+                cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
+                logger.warning(
+                    "Environment variable VLLM_CPU_KVCACHE_SPACE (GB) "
+                    "for CPU backend is not set, using 4 by default.")
+            else:
+                cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes  # type: ignore # noqa
+        else:
+            raise RuntimeError(
+                "Invalid environment variable VLLM_CPU_KVCACHE_SPACE"
+                f" {kv_cache_space}, expect a positive integer value.")
+
+        scheduler_config = vllm_config.scheduler_config
+        if ((scheduler_config.chunked_prefill_enabled
+             or cache_config.enable_prefix_caching)
+                and model_config.dtype == torch.half):
+            logger.warning("Chunked-prefill on the CPU backend only does not"
+                           " support fp16 for now, cast to bf16.")
+            model_config.dtype = torch.bfloat16  # TODO supported?
+
+        parallel_config = vllm_config.parallel_config
+        if parallel_config.worker_cls == "auto":
+            parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"
+
+        assert vllm_config.device_config.device_type == "metal"
+
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        logger.warning("Pin memory is not supported on Metal.")
+        return False
+
+    @classmethod
+    def get_punica_wrapper(cls) -> str:
+        return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 4b429b67b36f8..f975d01732cd1 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -179,16 +179,16 @@ def build(self) -> ModelInputForCPU:
         input_data = self.input_data
         input_tokens = torch.tensor(input_data.input_tokens,
                                     dtype=torch.long,
-                                    device="cpu")
+                                    device=self.device)
         input_positions = torch.tensor(
             input_data.input_positions
             if not any(input_data.input_mrope_positions) else
             input_data.input_mrope_positions,
             dtype=torch.long,
-            device="cpu")
+            device=self.device)
         token_type_ids = torch.tensor(input_data.token_type_ids,
                                       dtype=torch.long,
-                                      device="cpu") \
+                                      device=self.device) \
                                         if input_data.token_type_ids else None
 
         # For multi-modal models
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 3e5fcf11b9e16..868b8462a85b7 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -35,7 +35,7 @@ class CPUCacheEngine:
     def __init__(self, cache_config: CacheConfig, model_config: ModelConfig,
                  parallel_config: ParallelConfig,
                  device_config: DeviceConfig) -> None:
-        assert device_config.device_type == "cpu"
+        assert device_config.device_type == "cpu" or device_config.device_type == "metal"
         self.cache_config = cache_config
         self.model_config = model_config
         self.parallel_config = parallel_config