diff --git a/README.md b/README.md
index 05757c7..248d251 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,7 @@ Table of Contents:
| **[Python S3 upload](containers/python-s3-upload/README.md)** <br/> A Python + Flask HTTP server that receives file uploads and writes them to S3. | Python | [Terraform] |
| **[Terraform NGINX hello world](containers/terraform-nginx-hello-world/README.md)** <br/> A minimal example running the base NGINX image in a serverless container deployed with Terraform. | N/A | [Terraform] |
| **[Triggers with Terraform](containers/terraform-triggers/README.md)** <br/> Configuring two SQS triggers, used to trigger two containers, one public, one private. | N/A | [Terraform] |
+| **[Inference with Hugging Face Models](containers/hugging-face-inference/README.md)** <br/> Deploying and benchmarking lightweight Hugging Face models in Serverless Containers. | Python | [Terraform] |
### ⚙️ Jobs
diff --git a/containers/hugging-face-inference/Dockerfile b/containers/hugging-face-inference/Dockerfile
new file mode 100644
index 0000000..f3ce032
--- /dev/null
+++ b/containers/hugging-face-inference/Dockerfile
@@ -0,0 +1,23 @@
+FROM python:3.12-slim-bookworm
+
+ARG MODEL_DOWNLOAD_SOURCE
+
+RUN apt-get update && apt-get install -y --no-install-recommends wget && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+RUN pip install --upgrade pip
+
+COPY requirements.txt .
+
+RUN pip install -r requirements.txt
+
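+# Install llama-cpp-python from its prebuilt CPU wheel index to avoid compiling it from source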
+RUN pip install llama-cpp-python==0.2.62 \
+ --no-cache-dir \
+ --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+
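+# Download the GGUF model weights at build time so they are baked into the image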
+RUN wget "$MODEL_DOWNLOAD_SOURCE"
+
+COPY main.py .
+
+CMD ["uvicorn", "main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "80"]
diff --git a/containers/hugging-face-inference/README.md b/containers/hugging-face-inference/README.md
new file mode 100644
index 0000000..2f0a040
--- /dev/null
+++ b/containers/hugging-face-inference/README.md
@@ -0,0 +1,33 @@
+# Inference with Hugging Face models
+
+This example deploys lightweight Hugging Face models (in `.gguf` format) to Serverless Containers with Terraform and benchmarks their response times.
+
+## Deploy models in Serverless Containers
+
+- Export the following environment variables:
+
+```bash
+export SCW_ACCESS_KEY="access-key" SCW_SECRET_KEY="secret-key" SCW_PROJECT_ID="project-id" REGION="fr-par"
+```
+
+- Add or remove Hugging Face models (with the `.gguf` extension) in the `terraform/hf-models.json` file.
+
+- Run the script to deploy the Hugging Face models using Terraform workspaces:
+
+```bash
+cd terraform && bash terraform.sh -a
+```
+
+## Benchmark models
+
+Check in the console that your models were deployed, copy each container endpoint into the `ctn_endpoint` fields of `terraform/hf-models.json`, then run the following command (it requires the `requests`, `pandas`, and `matplotlib` packages):
+
+```bash
+python benchmark-models.py
+```
+
+This generates a box plot of response times per model family and a CSV file containing the text responses for each model.
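+
+To sanity-check a single deployment before running the full benchmark, you can also query a container directly. A minimal sketch using `requests` (replace the placeholder with the endpoint shown in the console for one of your containers) could look like this:
+
+```python
+import requests
+
+# Placeholder: paste the endpoint of one of your deployed containers
+endpoint = "https://<your-container-endpoint>"
+
+# The FastAPI server in main.py expects a JSON body matching its `Message` model
+response = requests.post(
+    endpoint,
+    json={"content": "What is the difference between an elephant and an ant?"},
+)
+
+# llama-cpp-python returns an OpenAI-style completion dict
+print(response.json()["choices"][0]["text"])
+```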
+
+## Destroy the Terraform resources for all models
+
+```bash
+bash terraform.sh -d
+```
\ No newline at end of file
diff --git a/containers/hugging-face-inference/main.py b/containers/hugging-face-inference/main.py
new file mode 100644
index 0000000..a473554
--- /dev/null
+++ b/containers/hugging-face-inference/main.py
@@ -0,0 +1,46 @@
+import os
+
+from fastapi import FastAPI
+from llama_cpp import Llama
+from pydantic import BaseModel
+
+
+class Message(BaseModel):
+ content: str
+
+
+MODEL_FILE_NAME = os.environ["MODEL_FILE_NAME"]
+
+app = FastAPI()
+
+print("Loading model...", flush=True)
+
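+# Load the GGUF model downloaded in the Dockerfile; this happens once at container startup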
+llm = Llama(model_path=MODEL_FILE_NAME)
+
+print("Model loaded successfully", flush=True)
+
+
+@app.get("/")
+def hello():
+ """Get info of inference server"""
+
+ return {
+ "message": "Hello, this is the inference server! Serving model {model_name}".format(
+ model_name=MODEL_FILE_NAME
+ )
+ }
+
+
+@app.post("/")
+def infer(message: Message):
+ """Post a message and receive a response from inference server"""
+
+    print("Inference endpoint called", flush=True)
+
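+    # Run CPU inference; llama-cpp-python returns an OpenAI-style completion dict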
+ output = llm(prompt=message.content, max_tokens=200)
+
+    print("Inference completed successfully", flush=True)
+
+ print(output, flush=True)
+
+ return output
diff --git a/containers/hugging-face-inference/requirements.txt b/containers/hugging-face-inference/requirements.txt
new file mode 100644
index 0000000..3b33077
--- /dev/null
+++ b/containers/hugging-face-inference/requirements.txt
@@ -0,0 +1,2 @@
+fastapi==0.104.1
+uvicorn==0.24.0.post1
\ No newline at end of file
diff --git a/containers/hugging-face-inference/terraform/benchmark-models.py b/containers/hugging-face-inference/terraform/benchmark-models.py
new file mode 100644
index 0000000..17fddaa
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/benchmark-models.py
@@ -0,0 +1,102 @@
+import csv
+import json
+
+import matplotlib.pyplot as plt
+import pandas
+import requests
+
+
+class Benchmark:
+ _model_families = ["llama", "mistral", "phi"]
+ _endpoints = {}
+
+ def __init__(
+ self, models_file: str, benchmark_file: str, results_figure: str, message: str
+ ) -> None:
+ self.models_file = models_file
+ self.benchmark_file = benchmark_file
+ self.message = message
+ self.results_figure = results_figure
+
+ def get_container_endpoints_from_json_file(self) -> None:
+ if self.models_file == "":
+ raise Exception("file name is empty")
+
+ with open(self.models_file, "r") as models_file:
+ json_data = json.load(models_file)
+
+ for family in self._model_families:
+ self._endpoints[family] = []
+ for model in json_data[family]:
+ self._endpoints[family].append(
+ {"model": model["file"], "endpoint": model["ctn_endpoint"]}
+ )
+
+ def analyze_results(self) -> None:
+ benchmark_results = pandas.read_csv(self.benchmark_file)
+        benchmark_results.boxplot(column="Total Response Time", by="Family")
+ plt.ylabel("Total Response Time in seconds")
+ plt.savefig(self.results_figure)
+
+ def benchmark_models(self, num_samples: int) -> None:
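+        # Post the same prompt num_samples times to each endpoint and record the HTTP round-trip time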
+ self.get_container_endpoints_from_json_file()
+
+ fields = ["Model", "Family", "Total Response Time", "Response Message"]
+ benchmark_data = []
+
+ for family in self._model_families:
+ for endpoint in self._endpoints[family]:
+ if endpoint["endpoint"] == "":
+ raise Exception("model endpoint is empty")
+
+ for _ in range(num_samples):
+ try:
+ print(
+ "Calling model {model} on endpoint {endpoint} with message {message}".format(
+ model=endpoint["model"],
+ endpoint=endpoint["endpoint"],
+ message=self.message,
+ )
+ )
+
+ rsp = requests.post(
+                            endpoint["endpoint"], json={"content": self.message}
+ )
+
+ response_text = rsp.json()["choices"][0]["text"]
+
+ print(
+ "The model {model} responded with: {response_text}".format(
+ model=endpoint["model"], response_text=response_text
+ )
+ )
+
+ benchmark_data.append(
+ [
+ endpoint["model"],
+ family,
+ rsp.elapsed.total_seconds(),
+ response_text,
+ ]
+ )
+                    except Exception as exc:
+                        print("request failed: {exc}".format(exc=exc))
+
+ with open(self.benchmark_file, "w") as results_file:
+ wrt = csv.writer(results_file)
+ wrt.writerow(fields)
+ wrt.writerows(benchmark_data)
+
+ self.analyze_results()
+
+
+if __name__ == "__main__":
+
+ benchmark = Benchmark(
+ models_file="hf-models.json",
+ benchmark_file="benchmark-results.csv",
+ results_figure="results-plot.png",
+        message="What is the difference between an elephant and an ant?",
+ )
+
+ benchmark.benchmark_models(num_samples=50)
diff --git a/containers/hugging-face-inference/terraform/container.tf b/containers/hugging-face-inference/terraform/container.tf
new file mode 100644
index 0000000..3502223
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/container.tf
@@ -0,0 +1,20 @@
+resource "scaleway_container_namespace" "main" {
+ name = "ifr-${lower(replace(var.hf_model_file_name, "/[.]|[_]/", "-"))}-${random_string.random_suffix.result}"
+ description = "Inference using Hugging Face models"
+}
+
+resource "scaleway_container" "inference-hugging-face" {
+ name = "inference"
+ description = "Inference serving API using a Hugging Face model"
+ namespace_id = scaleway_container_namespace.main.id
+ registry_image = docker_image.inference.name
+ environment_variables = {
+ "MODEL_FILE_NAME" = var.hf_model_file_name
+ }
+ port = 80
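+  # cpu_limit is expressed in mvCPU and memory_limit in MB, sized to hold the 1-4 GB GGUF models in memory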
+ cpu_limit = 2240
+ memory_limit = 4096
+ min_scale = 1
+ max_scale = 1
+ deploy = true
+}
diff --git a/containers/hugging-face-inference/terraform/hf-models.json b/containers/hugging-face-inference/terraform/hf-models.json
new file mode 100644
index 0000000..a4f44f5
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/hf-models.json
@@ -0,0 +1,46 @@
+{
+ "llama" : [
+ {
+ "file": "llama-2-7b.Q2_K.gguf",
+ "source" : "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q2_K.gguf",
+ "size_gb": "2.83",
+ "ctn_endpoint": "paste container endpoint here"
+ },
+ {
+ "file": "llama-2-7b.Q3_K_L.gguf",
+ "source" : "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q3_K_L.gguf",
+ "size_gb": "3.6",
+ "ctn_endpoint": "paste container endpoint here"
+ }
+ ],
+
+ "mistral" : [
+ {
+ "file": "mistral-7b-instruct-v0.2.Q2_K.gguf",
+ "source" : "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q2_K.gguf",
+ "size_gb": "3.08",
+ "ctn_endpoint": "paste container endpoint here"
+ },
+ {
+ "file": "mistral-7b-instruct-v0.2.Q3_K_L.gguf",
+ "source" : "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q3_K_L.gguf",
+ "size_gb": "3.82",
+ "ctn_endpoint": "paste container endpoint here"
+ }
+ ],
+
+ "phi" : [
+ {
+ "file": "phi-2.Q2_K.gguf",
+ "source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q2_K.gguf",
+ "size_gb": "1.17",
+ "ctn_endpoint": "paste container endpoint here"
+ },
+ {
+ "file": "phi-2.Q5_K_M.gguf",
+ "source" : "https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q5_K_M.gguf",
+ "size_gb": "2.07",
+ "ctn_endpoint": "paste container endpoint here"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/containers/hugging-face-inference/terraform/images.tf b/containers/hugging-face-inference/terraform/images.tf
new file mode 100644
index 0000000..6c857e6
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/images.tf
@@ -0,0 +1,20 @@
+resource "scaleway_registry_namespace" "main" {
+ name = "ifr-${lower(replace(var.hf_model_file_name, "/[.]|[_]/", "-"))}-${random_string.random_suffix.result}"
+ region = var.region
+ project_id = var.project_id
+}
+
+resource "docker_image" "inference" {
+ name = "${scaleway_registry_namespace.main.endpoint}/inference-with-huggingface:${var.image_version}"
+ build {
+ context = "${path.cwd}/../"
+ no_cache = true
+ build_args = {
+ MODEL_DOWNLOAD_SOURCE : var.hf_model_download_source
+ }
+ }
+
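+  # Push the locally built image to the Scaleway registry so the container can pull it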
+ provisioner "local-exec" {
+ command = "docker push ${docker_image.inference.name}"
+ }
+}
diff --git a/containers/hugging-face-inference/terraform/providers.tf b/containers/hugging-face-inference/terraform/providers.tf
new file mode 100644
index 0000000..439df4d
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/providers.tf
@@ -0,0 +1,16 @@
+provider "scaleway" {
+ region = var.region
+ access_key = var.access_key
+ secret_key = var.secret_key
+ project_id = var.project_id
+}
+
+provider "docker" {
+ host = "unix:///var/run/docker.sock"
+
+ registry_auth {
+ address = scaleway_registry_namespace.main.endpoint
+ username = "nologin"
+ password = var.secret_key
+ }
+}
diff --git a/containers/hugging-face-inference/terraform/terraform.sh b/containers/hugging-face-inference/terraform/terraform.sh
new file mode 100755
index 0000000..5bbd07a
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/terraform.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+set -e
+
+# Common environment variables
+export TF_VAR_access_key=${SCW_ACCESS_KEY} \
+ TF_VAR_secret_key=${SCW_SECRET_KEY} \
+ TF_VAR_project_id=${SCW_PROJECT_ID}
+
+# Associative array of models to deploy, built from the JSON data
+declare -A hf_models
+eval "$(jq -r '.[]|.[]|"hf_models[\(.file)]=\(.source)"' hf-models.json)"
+
+# Log in to Scaleway's container registry (requires REGION to be set, e.g. fr-par)
+docker login "rg.$REGION.scw.cloud" -u nologin --password-stdin <<< "$SCW_SECRET_KEY"
+
+# Initialize, plan, and deploy each model in a Terraform workspace
+apply() {
+ terraform init
+ for model_file_name in "${!hf_models[@]}";
+ do
+        terraform workspace select -or-create "$model_file_name"
+ export TF_VAR_hf_model_file_name=$model_file_name \
+ TF_VAR_hf_model_download_source=${hf_models[$model_file_name]}
+ terraform plan
+ terraform apply -auto-approve
+ done
+}
+
+# Destroy resources of each Terraform workspace
+destroy() {
+ for model_file_name in "${!hf_models[@]}";
+ do
+        terraform workspace select "$model_file_name"
+ export TF_VAR_hf_model_file_name=$model_file_name \
+ TF_VAR_hf_model_download_source=${hf_models[$model_file_name]}
+ terraform destroy -auto-approve
+ done
+}
+
+# Script actions
+while getopts "ad" option; do
+ case $option in
+ a)
+ echo "deploying models"
+ apply
+ ;;
+ d)
+ echo "destroying models"
+ destroy
+ ;;
+ *)
+        echo "usage: bash terraform.sh -a (deploy) | -d (destroy)"
+ exit 1
+ esac
+done
\ No newline at end of file
diff --git a/containers/hugging-face-inference/terraform/utils.tf b/containers/hugging-face-inference/terraform/utils.tf
new file mode 100644
index 0000000..15d52ab
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/utils.tf
@@ -0,0 +1,5 @@
+resource "random_string" "random_suffix" {
+ length = 3
+ upper = false
+ special = false
+}
diff --git a/containers/hugging-face-inference/terraform/variables.tf b/containers/hugging-face-inference/terraform/variables.tf
new file mode 100644
index 0000000..afc799c
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/variables.tf
@@ -0,0 +1,29 @@
+variable "access_key" {
+ type = string
+}
+
+variable "secret_key" {
+ type = string
+}
+
+variable "project_id" {
+ type = string
+}
+
+variable "image_version" {
+ type = string
+ default = "0.0.3"
+}
+
+variable "region" {
+ type = string
+ default = "fr-par"
+}
+
+variable "hf_model_file_name" {
+ type = string
+}
+
+variable "hf_model_download_source" {
+ type = string
+}
diff --git a/containers/hugging-face-inference/terraform/versions.tf b/containers/hugging-face-inference/terraform/versions.tf
new file mode 100644
index 0000000..b186193
--- /dev/null
+++ b/containers/hugging-face-inference/terraform/versions.tf
@@ -0,0 +1,13 @@
+terraform {
+ required_providers {
+ scaleway = {
+ source = "scaleway/scaleway"
+ version = ">= 2.39"
+ }
+ docker = {
+ source = "kreuzwerker/docker"
+ version = "3.0.2"
+ }
+ }
+ required_version = ">= 0.13"
+}