Add kubernetes YAML support to ramalama serve
Fixes: #183

Signed-off-by: Daniel J Walsh <[email protected]>
rhatdan committed Oct 17, 2024
1 parent dc419a4 commit cf66748
Showing 5 changed files with 154 additions and 23 deletions.
51 changes: 45 additions & 6 deletions docs/ramalama-serve.1.md
@@ -23,9 +23,14 @@ The default is TRUE. The --nocontainer option forces this option to False.

Use the `ramalama stop` command to stop the container running the served ramalama Model.

#### **--generate**=quadlet
#### **--generate**=type
Generate specified configuration format for running the AI Model as a service

| Key | Description |
| --------- | ---------------------------------------------------------------- |
| quadlet   | Podman-supported container definition for running the AI Model under systemd |
| kube      | Kubernetes YAML definition for running the AI Model as a service |

#### **--help**, **-h**
show this help message and exit

@@ -36,10 +41,9 @@ Name of the container to run the Model in.
port for AI Model server to listen on

## EXAMPLES

Run two AI Models at the same time, notice that they are running within Podman Containers.

### Run two AI Models at the same time. Notice both are running within Podman Containers.
```
$ ramalama serve -p 8080 --name mymodel ollama://tiny-llm:latest
09b0e0d26ed28a8418fb5cd0da641376a08c435063317e89cf8f5336baf35cfa
@@ -52,8 +56,7 @@ CONTAINER ID IMAGE COMMAND CREATED
3f64927f11a5 quay.io/ramalama/ramalama:latest /usr/bin/ramalama... 17 seconds ago Up 17 seconds 0.0.0.0:8082->8082/tcp ramalama_YMPQvJxN97
```

Generate a quadlet for running the AI Model service

### Generate a quadlet for running the AI Model service
```
$ ramalama serve --name MyGraniteServer --generate=quadlet granite > $HOME/.config/containers/systemd/MyGraniteServer.container
$ cat $HOME/.config/containers/systemd/MyGraniteServer.container
@@ -91,6 +94,42 @@ CONTAINER ID IMAGE COMMAND CREATED
7bb35b97a0fe quay.io/ramalama/ramalama:latest llama-server --po... 3 minutes ago Up 3 minutes 0.0.0.0:43869->8080/tcp MyGraniteServer
```

### Generate a Kubernetes YAML definition for a pod named tini
```
$ ramalama --nocontainer serve --name tini --generate kube tiny
# Save the output of this file and use kubectl create -f to import
# it into Kubernetes.
#
# Created with ramalama-0.0.17
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: tini-pod
  name: tini-pod
spec:
  containers:
  - name: tini
    image: quay.io/ramalama/ramalama:latest
    command: ["llama-server"]
    args: ['--port', '8080', '-m', '/run/model']
    ports:
    - containerPort: 8080
      hostPort: 8080
    volumeMounts:
    - mountPath: /run/model
      name: model
    - mountPath: /dev/dri
      name: dri
  volumes:
  - name: model
    hostPath:
      path: /home/dwalsh/.local/share/ramalama/models/ollama/tinyllama:latest
  - name: dri
    hostPath:
      path: /dev/dri
```
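
Since the generated manifest is plain YAML, it can be sanity-checked with a YAML parser before being handed to `kubectl create -f`. A minimal sketch, assuming the output above was saved as `tini.yaml` and PyYAML is installed (neither is part of this commit):

```python
# Parse the generated manifest and spot-check a few fields before applying it.
# Assumes the example output above was redirected to tini.yaml; PyYAML is used
# here purely for illustration and is not a ramalama dependency.
import yaml

with open("tini.yaml") as f:
    pod = yaml.safe_load(f)

assert pod["kind"] == "Pod"
container = pod["spec"]["containers"][0]
assert container["ports"][0]["containerPort"] == 8080
assert any(v["name"] == "model" for v in pod["spec"]["volumes"])
print("manifest looks sane")
```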

## SEE ALSO
**[ramalama(1)](ramalama.1.md)**, **[ramalama-stop(1)](ramalama-stop.1.md)**, **quadlet(1)**, **systemctl(1)**, **podman-ps(1)**

17 changes: 6 additions & 11 deletions ramalama/cli.py
@@ -3,21 +3,20 @@
import glob
import json
import os
import random
import string
import subprocess
import sys
import time

from ramalama.huggingface import Huggingface
from ramalama.common import (
in_container,
container_manager,
exec_cmd,
run_cmd,
default_image,
exec_cmd,
find_working_directory,
genname,
in_container,
perror,
run_cmd,
)
from ramalama.oci import OCI
from ramalama.ollama import Ollama
@@ -409,10 +408,6 @@ def push_cli(args):
model.push(source, args)


def _name():
return "ramalama_" + "".join(random.choices(string.ascii_letters + string.digits, k=10))


def run_parser(subparsers):
parser = subparsers.add_parser("run", help="run specified AI Model as a chatbot")
parser.add_argument("-n", "--name", dest="name", help="name of container in which the Model will be run")
@@ -435,7 +430,7 @@ def serve_parser(subparsers):
parser.add_argument("-p", "--port", default="8080", help="port for AI Model server to listen on")
parser.add_argument(
"--generate",
choices=["quadlet"],
choices=["quadlet", "kube"],
help="generate specified configuration format for running the AI Model as a service",
)
parser.add_argument("MODEL") # positional argument
@@ -578,7 +573,7 @@ def run_container(args):
if hasattr(args, "name") and args.name:
name = args.name
else:
name = _name()
name = genname()

wd = find_working_directory()
conman_args = [
6 changes: 6 additions & 0 deletions ramalama/common.py
@@ -2,7 +2,9 @@

import hashlib
import os
import random
import shutil
import string
import subprocess
import sys

@@ -148,3 +150,7 @@ def default_image():
    if image:
        return image
    return "quay.io/ramalama/ramalama:latest"


def genname():
    return "ramalama_" + "".join(random.choices(string.ascii_letters + string.digits, k=10))
91 changes: 85 additions & 6 deletions ramalama/model.py
@@ -1,6 +1,7 @@
import os
import sys
from ramalama.common import exec_cmd, default_image, in_container
from ramalama.common import exec_cmd, default_image, in_container, genname
from ramalama.version import version


file_not_found = """\
@@ -12,8 +13,8 @@
file_not_found_in_container = """\
RamaLama requires the "%s" command to be installed inside of the container.
RamaLama requires the server application be installed in the container images.
Either install a package containing the "%s" command in the container or run with the default
RamaLama image.
Either install a package containing the "%s" command in the container or run
with the default RamaLama image.
"""


@@ -128,13 +129,16 @@ def run(self, args):

def serve(self, args):
symlink_path = self.pull(args)
exec_args = ["llama-server", "--port", args.port, "-m", symlink_path]
exec_args = ["llama-server", "--port", args.port, "-m", "/run/model"]
if args.runtime == "vllm":
exec_args = ["vllm", "serve", "--port", args.port, symlink_path]
exec_args = ["vllm", "serve", "--port", args.port, "/run/model"]

if args.generate == "quadlet":
return self.quadlet(symlink_path, args, exec_args)

if args.generate == "kube":
return self.kube(symlink_path, args, exec_args)

try:
exec_cmd(exec_args)
except FileNotFoundError as e:
@@ -162,7 +166,7 @@ def quadlet(self, model, args, exec_args):
AddDevice=-/dev/kfd
Exec={" ".join(exec_args)}
Image={default_image()}
Volume={model}:{model}:ro,z
Volume={model}:/run/model:ro,z
{name_string}
{port_string}
@@ -171,3 +175,78 @@ def quadlet(self, model, args, exec_args):
WantedBy=multi-user.target default.target
"""
)

    def _gen_ports(self, args):
        if not hasattr(args, "port"):
            return ""

        ports = args.port.split(":", 2)
        container_port = ports[0]
        host_port = ports[0]
        if len(ports) > 1:
            host_port = ports[1]
        return f"""\
    ports:
    - containerPort: {container_port}
      hostPort: {host_port}"""
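
A hypothetical standalone helper, mirroring the split logic in `_gen_ports()` above, to make the port mapping explicit (not part of the commit):

```python
# Mirror of the _gen_ports() split: "8080" -> (8080, 8080), "8080:80" -> (8080, 80).
# The first field becomes containerPort, the optional second field hostPort.
def split_port(port_arg):
    parts = port_arg.split(":", 2)
    container_port = parts[0]
    host_port = parts[1] if len(parts) > 1 else parts[0]
    return container_port, host_port

print(split_port("8080"))     # ('8080', '8080')
print(split_port("8080:80"))  # ('8080', '80')
```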

    def _gen_volumes(self, model, args):
        mounts = """\
    volumeMounts:
    - mountPath: /run/model
      name: model"""

        volumes = f"""
  volumes:
  - name: model
    hostPath:
      path: {model}"""

        for dev in ["dri", "kfd"]:
            if os.path.exists("/dev/" + dev):
                mounts = (
                    mounts
                    + f"""
    - mountPath: /dev/{dev}
      name: {dev}"""
                )
                volumes = (
                    volumes
                    + f"""
  - name: {dev}
    hostPath:
      path: /dev/{dev}"""
                )

        return mounts + volumes
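
The device handling above only mounts GPU device nodes that actually exist on the host. A standalone sketch of the same idea with a hypothetical helper (not part of the commit):

```python
# Probe /dev/dri and /dev/kfd and build volumeMount/volume pairs only for the
# devices that are present -- the same filtering _gen_volumes() performs.
import os

def device_mounts(devices=("dri", "kfd")):
    mounts, volumes = [], []
    for dev in devices:
        path = "/dev/" + dev
        if os.path.exists(path):
            mounts.append({"mountPath": path, "name": dev})
            volumes.append({"name": dev, "hostPath": {"path": path}})
    return mounts, volumes

print(device_mounts())  # e.g. ([{'mountPath': '/dev/dri', 'name': 'dri'}], [...])
```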

    def kube(self, model, args, exec_args):
        port_string = self._gen_ports(args)
        volume_string = self._gen_volumes(model, args)
        _version = version()
        if hasattr(args, "name") and args.name:
            name = args.name
        else:
            name = genname()

        print(
            f"""\
# Save the output of this file and use kubectl create -f to import
# it into Kubernetes.
#
# Created with ramalama-{_version}
apiVersion: v1
kind: Pod
metadata:
  labels:
    app: {name}-pod
  name: {name}-pod
spec:
  containers:
  - name: {name}
    image: {args.image}
    command: ["{exec_args[0]}"]
    args: {exec_args[1:]}
{port_string}
{volume_string}"""
        )
12 changes: 12 additions & 0 deletions test/system/040-serve.bats
@@ -121,6 +121,18 @@ verify_begin=".*run --rm -i --label RAMALAMA --security-opt=label=disable -e RAM
is "$output" ".*PublishPort=1234" "PublishPort should match"
is "$output" ".*Name=${name}" "Quadlet should have name field"
is "$output" ".*Exec=llama-server --port 1234 -m .*" "Exec line should be correct"
run_ramalama 2 serve --name=${name} --port 1234 --generate=bogus ${model}
is "$output" ".*error: argument --generate: invalid choice: 'bogus' (choose from 'quadlet', 'kube')" "Should fail"
}

@test "ramalama serve --generate=kube" {
model=tiny
name=c_$(safename)
run_ramalama pull ${model}
run_ramalama serve --name=${name} --port 1234 --generate=kube ${model}
is "$output" ".*image: quay.io/ramalama/ramalama:latest" "Should container image"
is "$output" ".*command: \[\"llama-server\"\]" "Should command"
is "$output" ".*containerPort: 1234" "Should container container port"
}

# vim: filetype=sh
