diff --git a/README.md b/README.md
index 05757c7..248d251 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,7 @@ Table of Contents:
 | **[Python S3 upload](containers/python-s3-upload/README.md)** <br/> A Python + Flask HTTP server that receives file uploads and writes them to S3. | Python | [Terraform] |
 | **[Terraform NGINX hello world](containers/terraform-nginx-hello-world/README.md)** <br/> A minimal example running the base NGINX image in a serverless container deployed with Terraform. | N/A | [Terraform] |
 | **[Triggers with Terraform](containers/terraform-triggers/README.md)** <br/> Configuring two SQS triggers, used to trigger two containers, one public, one private. | N/A | [Terraform] |
+| **[Inference with Hugging Face Models](containers/hugging-face-inference/README.md)** <br/> Deploying and benchmarking lightweight Hugging Face models (GGUF format) in Serverless Containers. | N/A | [Terraform] |
 
 ### ⚙️ Jobs
diff --git a/containers/hugging-face-inference/Dockerfile b/containers/hugging-face-inference/Dockerfile
new file mode 100644
index 0000000..f3ce032
--- /dev/null
+++ b/containers/hugging-face-inference/Dockerfile
@@ -0,0 +1,23 @@
+FROM python:3.12-slim-bookworm
+
+ARG MODEL_DOWNLOAD_SOURCE
+
+RUN apt-get update && apt-get install -y wget
+
+WORKDIR /app
+
+RUN pip install --upgrade pip
+
+COPY requirements.txt .
+
+RUN pip install -r requirements.txt
+
+RUN pip install llama-cpp-python==0.2.62 \
+    --no-cache-dir \
+    --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+
+RUN wget $MODEL_DOWNLOAD_SOURCE
+
+COPY main.py .
+
+CMD ["uvicorn", "main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "80"]
diff --git a/containers/hugging-face-inference/README.md b/containers/hugging-face-inference/README.md
new file mode 100644
index 0000000..2f0a040
--- /dev/null
+++ b/containers/hugging-face-inference/README.md
@@ -0,0 +1,33 @@
+# Inference with Hugging Face Models
+
+This example deploys lightweight Hugging Face models (in GGUF format) in Serverless Containers, each behind a small FastAPI + llama-cpp-python inference server, and benchmarks their response times.
+
+### Deploy models in Serverless Containers
+
+- Export the following variables (replace the placeholder values with your own):
+
+```bash
+export SCW_ACCESS_KEY="access-key" SCW_SECRET_KEY="secret-key" SCW_PROJECT_ID="project-id" REGION="fr-par"
+```
+
+- Add or remove Hugging Face models (`.gguf` files) in the `terraform/hf-models.json` file.
+
+- Run the script to deploy the models, one Terraform workspace and one container per model:
+
+```bash
+cd terraform && bash terraform.sh -a
+```
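+
+### Query a deployed model
+
+Once a container is up, you can send it a prompt directly to check that inference works. The URL below is only a placeholder: use the endpoint of your container shown in the Scaleway console. The request body uses the `content` field expected by `main.py`, and the generated text is returned under `choices[0].text`:
+
+```bash
+curl -X POST "https://<your-container-endpoint>/" \
+  -H "Content-Type: application/json" \
+  -d '{"content": "What is the difference between an elephant and an ant?"}'
+```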
+
+### Benchmark models
+
+Check in the Scaleway console that your models were deployed, copy each container endpoint into the `ctn_endpoint` field of `terraform/hf-models.json` (see the example entry below), then run the following command from the `terraform` directory:
+
+```bash
+python benchmark-models.py
+```
+
+This generates a box plot of the response times per model family and a `csv` file containing the text responses of each model. The script requires the `requests`, `pandas`, and `matplotlib` packages to be installed locally.
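+
+A filled-in entry of `terraform/hf-models.json` looks like the following (the endpoint below is a placeholder, use the endpoint of your own container):
+
+```json
+{
+  "file": "phi-2.Q2_K.gguf",
+  "source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q2_K.gguf",
+  "size_gb": "1.17",
+  "ctn_endpoint": "https://<your-container-endpoint>"
+}
+```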
+
+### Destroy the Terraform resources for all models
+
+Run the following from the `terraform` directory:
+
+```bash
+bash terraform.sh -d
+```
\ No newline at end of file
diff --git a/containers/hugging-face-inference/main.py b/containers/hugging-face-inference/main.py
new file mode 100644
index 0000000..a473554
--- /dev/null
+++ b/containers/hugging-face-inference/main.py
@@ -0,0 +1,46 @@
+import os
+
+from fastapi import FastAPI
+from llama_cpp import Llama
+from pydantic import BaseModel
+
+
+class Message(BaseModel):
+    content: str
+
+
+MODEL_FILE_NAME = os.environ["MODEL_FILE_NAME"]
+
+app = FastAPI()
+
+print("loading the model...", flush=True)
+
+llm = Llama(model_path=MODEL_FILE_NAME)
+
+print("model loaded successfully", flush=True)
+
+
+@app.get("/")
+def hello():
+    """Get info of inference server"""
+
+    return {
+        "message": "Hello, this is the inference server! Serving model {model_name}".format(
+            model_name=MODEL_FILE_NAME
+        )
+    }
+
+
+@app.post("/")
+def infer(message: Message):
+    """Post a message and receive a response from inference server"""
+
+    print("inference endpoint is called", flush=True)
+
+    output = llm(prompt=message.content, max_tokens=200)
+
+    print("inference completed successfully", flush=True)
+
+    print(output, flush=True)
+
+    return output
diff --git a/containers/hugging-face-inference/requirements.txt b/containers/hugging-face-inference/requirements.txt
new file mode 100644
index 0000000..3b33077
--- /dev/null
+++ b/containers/hugging-face-inference/requirements.txt
@@ -0,0 +1,2 @@
+fastapi==0.104.1
+uvicorn==0.24.0.post1
\ No newline at end of file
diff --git a/containers/hugging-face-inference/terraform/benchmark-models.py b/containers/hugging-face-inference/terraform/benchmark-models.py
new file mode 100644
index 0000000..17fddaa
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/benchmark-models.py
@@ -0,0 +1,102 @@
+import csv
+import json
+
+import matplotlib.pyplot as plt
+import pandas
+import requests
+
+
+class Benchmark:
+    _model_families = ["llama", "mistral", "phi"]
+    _endpoints = {}
+
+    def __init__(
+        self, models_file: str, benchmark_file: str, results_figure: str, message: str
+    ) -> None:
+        self.models_file = models_file
+        self.benchmark_file = benchmark_file
+        self.message = message
+        self.results_figure = results_figure
+
+    def get_container_endpoints_from_json_file(self) -> None:
+        if self.models_file == "":
+            raise Exception("file name is empty")
+
+        with open(self.models_file, "r") as models_file:
+            json_data = json.load(models_file)
+
+        for family in self._model_families:
+            self._endpoints[family] = []
+            for model in json_data[family]:
+                self._endpoints[family].append(
+                    {"model": model["file"], "endpoint": model["ctn_endpoint"]}
+                )
+
+    def analyze_results(self) -> None:
+        benchmark_results = pandas.read_csv(self.benchmark_file)
+        benchmark_results.boxplot(column="Total Response Time", by="Family")
+        plt.ylabel("Total Response Time in seconds")
+        plt.savefig(self.results_figure)
+
+    def benchmark_models(self, num_samples: int) -> None:
+        self.get_container_endpoints_from_json_file()
+
+        fields = ["Model", "Family", "Total Response Time", "Response Message"]
+        benchmark_data = []
+
+        for family in self._model_families:
+            for endpoint in self._endpoints[family]:
+                if endpoint["endpoint"] == "":
+                    raise Exception("model endpoint is empty")
+
+                for _ in range(num_samples):
+                    try:
+                        print(
+                            "Calling model {model} on endpoint {endpoint} with message {message}".format(
+                                model=endpoint["model"],
+                                endpoint=endpoint["endpoint"],
+                                message=self.message,
+                            )
+                        )
+
+                        rsp = requests.post(
+                            endpoint["endpoint"], json={"content": self.message}
+                        )
+
+                        response_text = rsp.json()["choices"][0]["text"]
+
+                        print(
+                            "The model {model} responded with: {response_text}".format(
+                                model=endpoint["model"], response_text=response_text
+                            )
+                        )
+
+                        benchmark_data.append(
+                            [
+                                endpoint["model"],
+                                family,
+                                rsp.elapsed.total_seconds(),
+                                response_text,
+                            ]
+                        )
+                    except Exception as exc:
+                        print("request to {model} failed: {exc}".format(model=endpoint["model"], exc=exc))
+
+        with open(self.benchmark_file, "w") as results_file:
+            wrt = csv.writer(results_file)
+            wrt.writerow(fields)
+            wrt.writerows(benchmark_data)
+
+        self.analyze_results()
+
+
+if __name__ == "__main__":
+
+    benchmark = Benchmark(
+        models_file="hf-models.json",
+        benchmark_file="benchmark-results.csv",
+        results_figure="results-plot.png",
+        message="What is the difference between an elephant and an ant?",
+    )
+
+    benchmark.benchmark_models(num_samples=50)
diff --git a/containers/hugging-face-inference/terraform/container.tf b/containers/hugging-face-inference/terraform/container.tf
new file mode 100644
index 0000000..3502223
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/container.tf
@@ -0,0 +1,20 @@
+resource "scaleway_container_namespace" "main" {
+  name        = "ifr-${lower(replace(var.hf_model_file_name, "/[.]|[_]/", "-"))}-${random_string.random_suffix.result}"
+  description = "Inference using Hugging Face models"
+}
+
+resource "scaleway_container" "inference-hugging-face" {
+  name           = "inference"
+  description    = "Inference serving API using a Hugging Face model"
+  namespace_id   = scaleway_container_namespace.main.id
+  registry_image = docker_image.inference.name
+  environment_variables = {
+    "MODEL_FILE_NAME" = var.hf_model_file_name
+  }
+  port         = 80
+  cpu_limit    = 2240
+  memory_limit = 4096
+  min_scale    = 1
+  max_scale    = 1
+  deploy       = true
+}
diff --git a/containers/hugging-face-inference/terraform/hf-models.json b/containers/hugging-face-inference/terraform/hf-models.json
new file mode 100644
index 0000000..a4f44f5
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/hf-models.json
@@ -0,0 +1,46 @@
+{
+  "llama" : [
+    {
+      "file": "llama-2-7b.Q2_K.gguf",
+      "source" : "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q2_K.gguf",
+      "size_gb": "2.83",
+      "ctn_endpoint": "paste container endpoint here"
+    },
+    {
+      "file": "llama-2-7b.Q3_K_L.gguf",
+      "source" : "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q3_K_L.gguf",
+      "size_gb": "3.6",
+      "ctn_endpoint": "paste container endpoint here"
+    }
+  ],
+
+  "mistral" : [
+    {
+      "file": "mistral-7b-instruct-v0.2.Q2_K.gguf",
+      "source" : "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q2_K.gguf",
+      "size_gb": "3.08",
+      "ctn_endpoint": "paste container endpoint here"
+    },
+    {
+      "file": "mistral-7b-instruct-v0.2.Q3_K_L.gguf",
+      "source" : "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q3_K_L.gguf",
+      "size_gb": "3.82",
+      "ctn_endpoint": "paste container endpoint here"
+    }
+  ],
+
+  "phi" : [
+    {
+      "file": "phi-2.Q2_K.gguf",
+      "source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q2_K.gguf",
+      "size_gb": "1.17",
+      "ctn_endpoint": "paste container endpoint here"
+    },
+    {
+      "file": "phi-2.Q5_K_M.gguf",
+      "source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q5_K_M.gguf",
+      "size_gb": "2.07",
+      "ctn_endpoint": "paste container endpoint here"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/containers/hugging-face-inference/terraform/images.tf b/containers/hugging-face-inference/terraform/images.tf
new file mode 100644
index 0000000..6c857e6
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/images.tf
@@ -0,0 +1,20 @@
+resource "scaleway_registry_namespace" "main" {
+  name       = "ifr-${lower(replace(var.hf_model_file_name, "/[.]|[_]/", "-"))}-${random_string.random_suffix.result}"
+  region     = var.region
+  project_id = var.project_id
+}
+
+resource "docker_image" "inference" {
+  name = "${scaleway_registry_namespace.main.endpoint}/inference-with-huggingface:${var.image_version}"
+  build {
+    context  = "${path.cwd}/../"
+    no_cache = true
+    build_args = {
+      MODEL_DOWNLOAD_SOURCE : var.hf_model_download_source
+    }
+  }
+
+  provisioner "local-exec" {
+    command = "docker push ${docker_image.inference.name}"
+  }
+}
diff --git a/containers/hugging-face-inference/terraform/providers.tf b/containers/hugging-face-inference/terraform/providers.tf
new file mode 100644
index 0000000..439df4d
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/providers.tf
@@ -0,0 +1,16 @@
+provider "scaleway" {
+  region     = var.region
+  access_key = var.access_key
+  secret_key = var.secret_key
+  project_id = var.project_id
+}
+
+provider "docker" {
+  host = "unix:///var/run/docker.sock"
+
+  registry_auth {
+    address  = scaleway_registry_namespace.main.endpoint
+    username = "nologin"
+    password = var.secret_key
+  }
+}
diff --git a/containers/hugging-face-inference/terraform/terraform.sh b/containers/hugging-face-inference/terraform/terraform.sh
new file mode 100755
index 0000000..5bbd07a
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/terraform.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+set -e
+
+# Common environment variables
+export TF_VAR_access_key=${SCW_ACCESS_KEY} \
+  TF_VAR_secret_key=${SCW_SECRET_KEY} \
+  TF_VAR_project_id=${SCW_PROJECT_ID}
+
+# Associative list of models to deploy using json data
+declare -A hf_models
+eval "$(jq -r '.[]|.[]|"hf_models[\(.file)]=\(.source)"' hf-models.json)"
+
+# Login to docker Scaleway's registry
+docker login "rg.$REGION.scw.cloud" -u nologin --password-stdin <<< "$SCW_SECRET_KEY"
+
+# Initialize, plan, and deploy each model in a Terraform workspace
+apply() {
+  terraform init
+  for model_file_name in "${!hf_models[@]}";
+  do
+    terraform workspace select -or-create "$model_file_name"
+    export TF_VAR_hf_model_file_name=$model_file_name \
+      TF_VAR_hf_model_download_source=${hf_models[$model_file_name]}
+    terraform plan
+    terraform apply -auto-approve
+  done
+}
+
+# Destroy resources of each Terraform workspace
+destroy() {
+  for model_file_name in "${!hf_models[@]}";
+  do
+    terraform workspace select "$model_file_name"
+    export TF_VAR_hf_model_file_name=$model_file_name \
+      TF_VAR_hf_model_download_source=${hf_models[$model_file_name]}
+    terraform destroy -auto-approve
+  done
+}
+
+# Script actions
+while getopts "ad" option; do
+  case $option in
+    a)
+      echo "deploying models"
+      apply
+      ;;
+    d)
+      echo "destroying models"
+      destroy
+      ;;
+    *)
+      echo "invalid flag: use -a to deploy or -d to destroy"
+      exit 1
+  esac
+done
\ No newline at end of file
diff --git a/containers/hugging-face-inference/terraform/utils.tf b/containers/hugging-face-inference/terraform/utils.tf
new file mode 100644
index 0000000..15d52ab
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/utils.tf
@@ -0,0 +1,5 @@
+resource "random_string" "random_suffix" {
+  length  = 3
+  upper   = false
+  special = false
+}
diff --git a/containers/hugging-face-inference/terraform/variables.tf b/containers/hugging-face-inference/terraform/variables.tf
new file mode 100644
index 0000000..afc799c
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/variables.tf
@@ -0,0 +1,29 @@
+variable "access_key" {
+  type = string
+}
+
+variable "secret_key" {
+  type = string
+}
+
+variable "project_id" {
+  type = string
+}
+
+variable "image_version" {
+  type    = string
+  default = "0.0.3"
+}
+
+variable "region" {
+  type    = string
+  default = "fr-par"
+}
+
+variable "hf_model_file_name" {
+  type = string
+}
+
+variable "hf_model_download_source" {
+  type = string
+}
diff --git a/containers/hugging-face-inference/terraform/versions.tf b/containers/hugging-face-inference/terraform/versions.tf
new file mode 100644
index 0000000..b186193
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/versions.tf
@@ -0,0 +1,13 @@
+terraform {
+  required_providers {
+    scaleway = {
+      source  = "scaleway/scaleway"
+      version = ">= 2.39"
+    }
+    docker = {
+      source  = "kreuzwerker/docker"
+      version = "3.0.2"
+    }
+  }
+  required_version = ">= 0.13"
+}