From cf6674838e1e63ae45e95e61c3400a37446e80ab Mon Sep 17 00:00:00 2001
From: Daniel J Walsh
Date: Thu, 17 Oct 2024 15:08:35 -0400
Subject: [PATCH] Add kubernetes YAML support to ramalama serve

Fixes: https://github.com/containers/ramalama/issues/183

Signed-off-by: Daniel J Walsh
---
 docs/ramalama-serve.1.md   | 51 ++++++++++++++++++---
 ramalama/cli.py            | 17 +++----
 ramalama/common.py         |  6 +++
 ramalama/model.py          | 91 +++++++++++++++++++++++++++++++++++---
 test/system/040-serve.bats | 12 +++++
 5 files changed, 154 insertions(+), 23 deletions(-)

diff --git a/docs/ramalama-serve.1.md b/docs/ramalama-serve.1.md
index 0421722..90c36d8 100644
--- a/docs/ramalama-serve.1.md
+++ b/docs/ramalama-serve.1.md
@@ -23,9 +23,14 @@ The default is TRUE.
 The --nocontainer option forces this option to False.
 
 Use the `ramalama stop` command to stop the container running the served ramalama Model.
 
-#### **--generate**=quadlet
+#### **--generate**=type
 Generate specified configuration format for running the AI Model as a service
 
+| Key     | Description                                                                  |
+| ------- | ---------------------------------------------------------------------------- |
+| quadlet | Podman-supported container definition for running the AI Model under systemd |
+| kube    | Kubernetes YAML definition for running the AI Model as a service             |
+
 #### **--help**, **-h**
 show this help message and exit
 
@@ -36,10 +41,9 @@ Name of the container to run the Model in.
 port for AI Model server to listen on
 
 ## EXAMPLES
-
-Run two AI Models at the same time, notice that they are running within Podman Containers.
-
+### Run two AI Models at the same time. Notice both are running within Podman Containers.
 ```
+
 $ ramalama serve -p 8080 --name mymodel ollama://tiny-llm:latest
 09b0e0d26ed28a8418fb5cd0da641376a08c435063317e89cf8f5336baf35cfa
 
@@ -52,8 +56,7 @@ CONTAINER ID IMAGE COMMAND CREATED
 3f64927f11a5 quay.io/ramalama/ramalama:latest /usr/bin/ramalama... 17 seconds ago Up 17 seconds 0.0.0.0:8082->8082/tcp ramalama_YMPQvJxN97
 ```
 
-Generate a quadlet for running the AI Model service
-
+### Generate a quadlet for running the AI Model service
 ```
 $ ramalama serve --name MyGraniteServer --generate=quadlet granite > $HOME/.config/containers/systemd/MyGraniteServer.container
 $ cat $HOME/.config/containers/systemd/MyGraniteServer.container
@@ -91,6 +94,42 @@ CONTAINER ID IMAGE COMMAND CREATED
 7bb35b97a0fe quay.io/ramalama/ramalama:latest llama-server --po... 3 minutes ago Up 3 minutes 0.0.0.0:43869->8080/tcp MyGraniteServer
 ```
 
+### Generate a Kubernetes YAML file named tini
+```
+$ ramalama --nocontainer serve --name tini --generate kube tiny
+# Save the output of this file and use kubectl create -f to import
+# it into Kubernetes.
+#
+# Created with ramalama-0.0.17
+apiVersion: v1
+kind: Pod
+metadata:
+  labels:
+    app: tini-pod
+  name: tini-pod
+spec:
+  containers:
+  - name: tini
+    image: quay.io/ramalama/ramalama:latest
+    command: ["llama-server"]
+    args: ['--port', '8080', '-m', '/run/model']
+    ports:
+    - containerPort: 8080
+      hostPort: 8080
+    volumeMounts:
+    - mountPath: /run/model
+      name: model
+    - mountPath: /dev/dri
+      name: dri
+  volumes:
+  - name: model
+    hostPath:
+      path: /home/dwalsh/.local/share/ramalama/models/ollama/tinyllama:latest
+  - name: dri
+    hostPath:
+      path: /dev/dri
+```
+
 ## SEE ALSO
 **[ramalama(1)](ramalama.1.md)**, **[ramalama-stop(1)](ramalama-stop.1.md)**, **quadlet(1)**, **systemctl(1)**, **podman-ps(1)**
 
diff --git a/ramalama/cli.py b/ramalama/cli.py
index 33d13f2..65b3301 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -3,21 +3,20 @@
 import glob
 import json
 import os
-import random
-import string
 import subprocess
 import sys
 import time
 
 from ramalama.huggingface import Huggingface
 from ramalama.common import (
-    in_container,
     container_manager,
-    exec_cmd,
-    run_cmd,
     default_image,
+    exec_cmd,
     find_working_directory,
+    genname,
+    in_container,
     perror,
+    run_cmd,
 )
 from ramalama.oci import OCI
 from ramalama.ollama import Ollama
@@ -409,10 +408,6 @@ def push_cli(args):
     model.push(source, args)
 
 
-def _name():
-    return "ramalama_" + "".join(random.choices(string.ascii_letters + string.digits, k=10))
-
-
 def run_parser(subparsers):
     parser = subparsers.add_parser("run", help="run specified AI Model as a chatbot")
     parser.add_argument("-n", "--name", dest="name", help="name of container in which the Model will be run")
@@ -435,7 +430,7 @@ def serve_parser(subparsers):
     parser.add_argument("-p", "--port", default="8080", help="port for AI Model server to listen on")
     parser.add_argument(
         "--generate",
-        choices=["quadlet"],
+        choices=["quadlet", "kube"],
         help="generate specified configuration format for running the AI Model as a service",
     )
     parser.add_argument("MODEL")  # positional argument
@@ -578,7 +573,7 @@ def run_container(args):
     if hasattr(args, "name") and args.name:
         name = args.name
     else:
-        name = _name()
+        name = genname()
 
     wd = find_working_directory()
     conman_args = [
diff --git a/ramalama/common.py b/ramalama/common.py
index 1d28af1..9d80976 100644
--- a/ramalama/common.py
+++ b/ramalama/common.py
@@ -2,7 +2,9 @@
 import hashlib
 import os
+import random
 import shutil
+import string
 import subprocess
 import sys
 
@@ -148,3 +150,7 @@ def default_image():
     if image:
         return image
     return "quay.io/ramalama/ramalama:latest"
+
+
+def genname():
+    return "ramalama_" + "".join(random.choices(string.ascii_letters + string.digits, k=10))
diff --git a/ramalama/model.py b/ramalama/model.py
index a12cc1e..4249678 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -1,6 +1,7 @@
 import os
 import sys
 
-from ramalama.common import exec_cmd, default_image, in_container
+from ramalama.common import exec_cmd, default_image, in_container, genname
+from ramalama.version import version
 
 file_not_found = """\
@@ -12,8 +13,8 @@
 file_not_found_in_container = """\
 RamaLama requires the "%s" command to be installed inside of the container.
 RamaLama requires the server application be installed in the container images.
-Either install a package containing the "%s" command in the container or run with the default
-RamaLama image.
+Either install a package containing the "%s" command in the container or run
+with the default RamaLama image.
 """
 
@@ -128,13 +129,16 @@ def run(self, args):
     def serve(self, args):
         symlink_path = self.pull(args)
 
-        exec_args = ["llama-server", "--port", args.port, "-m", symlink_path]
+        exec_args = ["llama-server", "--port", args.port, "-m", "/run/model"]
         if args.runtime == "vllm":
-            exec_args = ["vllm", "serve", "--port", args.port, symlink_path]
+            exec_args = ["vllm", "serve", "--port", args.port, "/run/model"]
 
         if args.generate == "quadlet":
            return self.quadlet(symlink_path, args, exec_args)
 
+        if args.generate == "kube":
+            return self.kube(symlink_path, args, exec_args)
+
         try:
             exec_cmd(exec_args)
         except FileNotFoundError as e:
@@ -162,7 +166,7 @@ def quadlet(self, model, args, exec_args):
 AddDevice=-/dev/kfd
 Exec={" ".join(exec_args)}
 Image={default_image()}
-Volume={model}:{model}:ro,z
+Volume={model}:/run/model:ro,z
 {name_string}
 {port_string}
 
@@ -171,3 +175,78 @@ [Install]
 WantedBy=multi-user.target default.target
 """
         )
+
+    def _gen_ports(self, args):
+        if not hasattr(args, "port"):
+            return ""
+
+        # args.port may be "PORT" or "CONTAINERPORT:HOSTPORT"
+        ports = args.port.split(":", 2)
+        container_port = ports[0]
+        host_port = ports[0]
+        if len(ports) > 1:
+            host_port = ports[1]
+        return f"""\
+    ports:
+    - containerPort: {container_port}
+      hostPort: {host_port}"""
+
+    def _gen_volumes(self, model, args):
+        mounts = """\
+    volumeMounts:
+    - mountPath: /run/model
+      name: model"""
+
+        volumes = f"""
+  volumes:
+  - name: model
+    hostPath:
+      path: {model}"""
+
+        # Pass through GPU device nodes that exist on the host
+        for dev in ["dri", "kfd"]:
+            if os.path.exists("/dev/" + dev):
+                mounts = (
+                    mounts
+                    + f"""
+    - mountPath: /dev/{dev}
+      name: {dev}"""
+                )
+                volumes = (
+                    volumes
+                    + f"""
+  - name: {dev}
+    hostPath:
+      path: /dev/{dev}"""
+                )
+
+        return mounts + volumes
+
+    def kube(self, model, args, exec_args):
+        port_string = self._gen_ports(args)
+        volume_string = self._gen_volumes(model, args)
+        _version = version()
+        if hasattr(args, "name") and args.name:
+            name = args.name
+        else:
+            name = genname()
+
+        print(
+            f"""\
+# Save the output of this file and use kubectl create -f to import
+# it into Kubernetes.
+#
+# Created with ramalama-{_version}
+apiVersion: v1
+kind: Pod
+metadata:
+  labels:
+    app: {name}-pod
+  name: {name}-pod
+spec:
+  containers:
+  - name: {name}
+    image: {args.image}
+    command: ["{exec_args[0]}"]
+    args: {exec_args[1:]}
+{port_string}
+{volume_string}"""
+        )
diff --git a/test/system/040-serve.bats b/test/system/040-serve.bats
index 37bd24b..00bb9dc 100644
--- a/test/system/040-serve.bats
+++ b/test/system/040-serve.bats
@@ -121,6 +121,18 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable -e RAM
     is "$output" ".*PublishPort=1234" "PublishPort should match"
     is "$output" ".*Name=${name}" "Quadlet should have name field"
     is "$output" ".*Exec=llama-server --port 1234 -m .*" "Exec line should be correct"
+
+    run_ramalama 2 serve --name=${name} --port 1234 --generate=bogus ${model}
+    is "$output" ".*error: argument --generate: invalid choice: 'bogus' (choose from 'quadlet', 'kube')" "Should fail"
 }
 
+@test "ramalama serve --generate=kube" {
+    model=tiny
+    name=c_$(safename)
+    run_ramalama pull ${model}
+    run_ramalama serve --name=${name} --port 1234 --generate=kube ${model}
+    is "$output" ".*image: quay.io/ramalama/ramalama:latest" "Should contain the container image"
+    is "$output" ".*command: \[\"llama-server\"\]" "Should contain the command"
+    is "$output" ".*containerPort: 1234" "Should contain the container port"
+}
 
 # vim: filetype=sh