Refactor app_tests/integration_tests/llm for easy addition of more …

…models for testing (#728) Bundled up loose parameter passing in the testing infra to support support easily adding models from HF, Azure, and local like: ```python TEST_MODELS = { "open_llama_3b": ModelConfig( source=ModelSource.HUGGINGFACE, repo_id="SlyEcho/open_llama_3b_v2_gguf", model_file="open-llama-3b-v2-f16.gguf", tokenizer_id="openlm-research/open_llama_3b_v2", batch_sizes=(1, 4), device_settings=device_settings.CPU, ), "llama3.1_8b": ModelConfig( source=ModelSource.LOCAL, local_path=Path("/data/llama3.1/8b/llama8b_f16.irpa"), model_file="llama8b_f16.irpa", tokenizer_id="NousResearch/Meta-Llama-3.1-8B", batch_sizes=(1, 4), device_settings=device_settings.CPU, ), "azure_llama": ModelConfig( source=ModelSource.AZURE, azure_config=AzureConfig( account_name="sharkblobs", container_name="halo-models", blob_path="llm-dev/llama3_8b/8b_f16.irpa", ), model_file="azure-llama.irpa", tokenizer_id="openlm-research/open_llama_3b_v2", batch_sizes=(1, 4), device_settings=device_settings.CPU, ), } ``` Sharktank has [a similar list of models](https://github.com/nod-ai/shark-ai/blob/main/sharktank/sharktank/utils/hf_datasets.py) in `hf_datasets.py` that only supports huggingface. Might be of interest to draw from that at some point.
nod-ai · Jan 14, 2025 · 946fb7c · 946fb7c
1 parent 9c95aa8
commit 946fb7c
Show file tree

Hide file tree

Showing 5 changed files with 594 additions and 333 deletions.
diff --git a/app_tests/integration_tests/llm/device_settings.py b/app_tests/integration_tests/llm/device_settings.py
@@ -0,0 +1,17 @@
+from typing import Tuple
+from dataclasses import dataclass
+
+
+@dataclass
+class DeviceSettings:
+    compile_flags: Tuple[str]
+    server_flags: Tuple[str]
+
+
+CPU = DeviceSettings(
+    compile_flags=(
+        "-iree-hal-target-backends=llvm-cpu",
+        "--iree-llvmcpu-target-cpu=host",
+    ),
+    server_flags=("--device=local-task",),
+)
diff --git a/app_tests/integration_tests/llm/model_management.py b/app_tests/integration_tests/llm/model_management.py
@@ -0,0 +1,227 @@
+"""Module for managing model artifacts through various processing stages."""
+import logging
+from pathlib import Path
+import subprocess
+from dataclasses import dataclass
+from typing import Optional, Tuple
+from enum import Enum, auto
+
+logger = logging.getLogger(__name__)
+
+
+class ModelSource(Enum):
+    HUGGINGFACE = auto()
+    LOCAL = auto()
+    AZURE = auto()
+
+
+@dataclass
+class AzureConfig:
+    """Configuration for Azure blob storage downloads."""
+
+    account_name: str
+    container_name: str
+    blob_path: str
+    auth_mode: str = "key"
+
+
+@dataclass
+class ModelConfig:
+    """Configuration for model source and settings."""
+
+    model_file: str
+    tokenizer_id: str
+    batch_sizes: Tuple[int, ...]
+    device_settings: "DeviceSettings"
+    source: ModelSource
+    repo_id: Optional[str] = None
+    local_path: Optional[Path] = None
+    azure_config: Optional[AzureConfig] = None
+
+    def __post_init__(self):
+        if self.source == ModelSource.HUGGINGFACE and not self.repo_id:
+            raise ValueError("repo_id required for HuggingFace models")
+        elif self.source == ModelSource.LOCAL and not self.local_path:
+            raise ValueError("local_path required for local models")
+        elif self.source == ModelSource.AZURE and not self.azure_config:
+            raise ValueError("azure_config required for Azure models")
+
+
+@dataclass
+class ModelArtifacts:
+    """Container for all paths related to model artifacts."""
+
+    weights_path: Path
+    tokenizer_path: Path
+    mlir_path: Path
+    vmfb_path: Path
+    config_path: Path
+
+
+class ModelStageManager:
+    """Manages different stages of model processing with caching behavior."""
+
+    def __init__(self, base_dir: Path, config: ModelConfig):
+        self.base_dir = base_dir
+        self.config = config
+        self.model_dir = self._get_model_dir()
+        self.model_dir.mkdir(parents=True, exist_ok=True)
+
+    def _get_model_dir(self) -> Path:
+        """Creates and returns appropriate model directory based on source."""
+        if self.config.source == ModelSource.HUGGINGFACE:
+            return self.base_dir / self.config.repo_id.replace("/", "_")
+        elif self.config.source == ModelSource.LOCAL:
+            return self.base_dir / "local" / self.config.local_path.stem
+        elif self.config.source == ModelSource.AZURE:
+            return (
+                self.base_dir
+                / "azure"
+                / self.config.azure_config.blob_path.replace("/", "_")
+            )
+        raise ValueError(f"Unsupported model source: {self.config.source}")
+
+    def _download_from_huggingface(self) -> Path:
+        """Downloads model from HuggingFace."""
+        model_path = self.model_dir / self.config.model_file
+        if not model_path.exists():
+            logger.info(f"Downloading model {self.config.repo_id} from HuggingFace")
+            subprocess.run(
+                f"huggingface-cli download --local-dir {self.model_dir} {self.config.repo_id} {self.config.model_file}",
+                shell=True,
+                check=True,
+            )
+        return model_path
+
+    def _copy_from_local(self) -> Path:
+        """Copies model from local filesystem."""
+        import shutil
+
+        model_path = self.model_dir / self.config.model_file
+        if not model_path.exists():
+            logger.info(f"Copying local model from {self.config.local_path}")
+            shutil.copy2(self.config.local_path, model_path)
+        return model_path
+
+    def _download_from_azure(self) -> Path:
+        """Downloads model from Azure blob storage."""
+        model_path = self.model_dir / self.config.model_file
+        if not model_path.exists():
+            logger.info(
+                f"Downloading model from Azure blob storage: {self.config.azure_config.blob_path}"
+            )
+            subprocess.run(
+                [
+                    "az",
+                    "storage",
+                    "blob",
+                    "download",
+                    "--account-name",
+                    self.config.azure_config.account_name,
+                    "--container-name",
+                    self.config.azure_config.container_name,
+                    "--name",
+                    self.config.azure_config.blob_path,
+                    "--file",
+                    str(model_path),
+                    "--auth-mode",
+                    self.config.azure_config.auth_mode,
+                ],
+                check=True,
+            )
+        return model_path
+
+    def prepare_tokenizer(self) -> Path:
+        """Downloads and prepares tokenizer."""
+        tokenizer_path = self.model_dir / "tokenizer.json"
+        if not tokenizer_path.exists():
+            logger.info(f"Downloading tokenizer {self.config.tokenizer_id}")
+            from transformers import AutoTokenizer
+
+            tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_id)
+            tokenizer.save_pretrained(self.model_dir)
+        return tokenizer_path
+
+    def export_model(self, weights_path: Path) -> Tuple[Path, Path]:
+        """Exports model to MLIR format."""
+        bs_string = ",".join(map(str, self.config.batch_sizes))
+        mlir_path = self.model_dir / "model.mlir"
+        config_path = self.model_dir / "config.json"
+
+        logger.info(
+            "Exporting model with following settings:\n"
+            f"  MLIR Path: {mlir_path}\n"
+            f"  Config Path: {config_path}\n"
+            f"  Batch Sizes: {bs_string}"
+        )
+
+        subprocess.run(
+            [
+                "python",
+                "-m",
+                "sharktank.examples.export_paged_llm_v1",
+                "--block-seq-stride=16",
+                f"--{weights_path.suffix.strip('.')}-file={weights_path}",
+                f"--output-mlir={mlir_path}",
+                f"--output-config={config_path}",
+                f"--bs={bs_string}",
+            ],
+            check=True,
+        )
+
+        logger.info(f"Model successfully exported to {mlir_path}")
+        return mlir_path, config_path
+
+    def compile_model(self, mlir_path: Path) -> Path:
+        """Compiles model to VMFB format."""
+        vmfb_path = self.model_dir / "model.vmfb"
+        logger.info(f"Compiling model to {vmfb_path}")
+
+        compile_command = [
+            "iree-compile",
+            str(mlir_path),
+            "-o",
+            str(vmfb_path),
+        ]
+        compile_command.extend(self.config.device_settings.compile_flags)
+
+        subprocess.run(compile_command, check=True)
+        logger.info(f"Model successfully compiled to {vmfb_path}")
+        return vmfb_path
+
+
+class ModelProcessor:
+    """Main interface for processing models through all stages."""
+
+    def __init__(self, base_dir: Path):
+        self.base_dir = Path(base_dir)
+
+    def process_model(self, config: ModelConfig) -> ModelArtifacts:
+        """Process model through all stages and return paths to all artifacts."""
+        manager = ModelStageManager(self.base_dir, config)
+
+        # Stage 1: Download weights and tokenizer (cached)
+        if config.source == ModelSource.HUGGINGFACE:
+            weights_path = manager._download_from_huggingface()
+        elif config.source == ModelSource.LOCAL:
+            weights_path = manager._copy_from_local()
+        elif config.source == ModelSource.AZURE:
+            weights_path = manager._download_from_azure()
+        else:
+            raise ValueError(f"Unsupported model source: {config.source}")
+
+        tokenizer_path = manager.prepare_tokenizer()
+
+        # Stage 2: Export model (fresh every time)
+        mlir_path, config_path = manager.export_model(weights_path)
+
+        # Stage 3: Compile model (fresh every time)
+        vmfb_path = manager.compile_model(mlir_path)
+
+        return ModelArtifacts(
+            weights_path=weights_path,
+            tokenizer_path=tokenizer_path,
+            mlir_path=mlir_path,
+            vmfb_path=vmfb_path,
+            config_path=config_path,
+        )
diff --git a/app_tests/integration_tests/llm/server_management.py b/app_tests/integration_tests/llm/server_management.py
@@ -0,0 +1,126 @@
+"""Handles server lifecycle and configuration."""
+import json
+import socket
+from contextlib import closing
+from dataclasses import dataclass
+import subprocess
+import time
+import requests
+from pathlib import Path
+import sys
+from typing import Optional
+
+from .device_settings import DeviceSettings
+from .model_management import ModelArtifacts
+
+
+@dataclass
+class ServerConfig:
+    """Configuration for server instance."""
+
+    artifacts: ModelArtifacts
+    device_settings: DeviceSettings
+    prefix_sharing_algorithm: str = "none"
+
+
+class ServerInstance:
+    """An instance of the shortfin llm inference server.
+
+    Example usage:
+
+    ```
+        from shortfin_apps.llm.server_management import ServerInstance, ServerConfig
+        # Create and start server
+        server = Server(config=ServerConfig(
+            artifacts=model_artifacts,
+            device_settings=device_settings,
+            prefix_sharing_algorithm="none"
+        ))
+
+        server.start()  # This starts the server and waits for it to be ready
+
+        # Use the server
+        print(f"Server running on port {server.port}")
+
+        # Cleanup when done
+        server.stop()
+    ```
+    """
+
+    def __init__(self, config: ServerConfig):
+        self.config = config
+        self.process: Optional[subprocess.Popen] = None
+        self.port: Optional[int] = None
+        self.config_path: Optional[Path] = None
+
+    @staticmethod
+    def find_available_port() -> int:
+        """Finds an available port for the server."""
+        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+            s.bind(("", 0))
+            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            return s.getsockname()[1]
+
+    def _write_config(self) -> Path:
+        """Creates server config by extending the exported model config."""
+        # TODO: eliminate this by moving prefix sharing algorithm to be a cmdline arg of server.py
+        source_config_path = self.config.artifacts.config_path
+        server_config_path = (
+            source_config_path.parent
+            / f"server_config_{self.config.prefix_sharing_algorithm}.json"
+        )
+
+        # Read the exported config as base
+        with open(source_config_path) as f:
+            config = json.load(f)
+        config["paged_kv_cache"][
+            "prefix_sharing_algorithm"
+        ] = self.config.prefix_sharing_algorithm
+        with open(server_config_path, "w") as f:
+            json.dump(config, f)
+        return server_config_path
+
+    def start(self) -> None:
+        """Starts the server process."""
+        if self.process is not None:
+            raise RuntimeError("Server is already running")
+
+        self.config_path = self._write_config()
+        self.port = self.find_available_port()
+
+        cmd = [
+            sys.executable,
+            "-m",
+            "shortfin_apps.llm.server",
+            f"--tokenizer_json={self.config.artifacts.tokenizer_path}",
+            f"--model_config={self.config_path}",
+            f"--vmfb={self.config.artifacts.vmfb_path}",
+            f"--parameters={self.config.artifacts.weights_path}",
+            f"--port={self.port}",
+        ]
+        cmd.extend(self.config.device_settings.server_flags)
+
+        self.process = subprocess.Popen(cmd)
+        self.wait_for_ready()
+
+    def wait_for_ready(self, timeout: int = 10) -> None:
+        """Waits for server to be ready and responding to health checks."""
+        if self.port is None:
+            raise RuntimeError("Server hasn't been started")
+
+        start = time.time()
+        while time.time() - start < timeout:
+            try:
+                requests.get(f"http://localhost:{self.port}/health")
+                return
+            except requests.exceptions.ConnectionError:
+                time.sleep(1)
+        raise TimeoutError(f"Server failed to start within {timeout} seconds")
+
+    def stop(self) -> None:
+        """Stops the server process."""
+        if self.process is not None and self.process.poll() is None:
+            self.process.terminate()
+            self.process.wait()
+            self.process = None
+            self.port = None