diff --git a/docs/index.md b/docs/index.md
index a67bfb480..027058e5c 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -120,6 +120,7 @@ auto_examples/mmcloud_agent/index
 auto_examples/modin_plugin/index
 auto_examples/kfmpi_plugin/index
 auto_examples/nim_plugin/index
+auto_examples/ollama_plugin/index
 auto_examples/onnx_plugin/index
 auto_examples/openai_batch_agent/index
 auto_examples/papermill_plugin/index
diff --git a/docs/integrations.md b/docs/integrations.md
index f4ff3e08f..b763017ad 100644
--- a/docs/integrations.md
+++ b/docs/integrations.md
@@ -44,6 +44,8 @@ Flytekit functionality. These plugins can be anything and for comparison can be
   - `wandb`: Machine learning platform to build better models faster.
 * - {doc}`NIM <auto_examples/nim_plugin/index>`
   - Serve optimized model containers with NIM.
+* - {doc}`Ollama <auto_examples/ollama_plugin/index>`
+  - Serve fine-tuned LLMs with Ollama in a Flyte workflow.
 ```

 :::{dropdown} {fa}`info-circle` Using flytekit plugins
diff --git a/examples/nim_plugin/README.md b/examples/nim_plugin/README.md
index 506c9eab9..36011695b 100644
--- a/examples/nim_plugin/README.md
+++ b/examples/nim_plugin/README.md
@@ -29,7 +29,7 @@ pip install flytekitplugins-inference
 For a usage example, see {doc}`NIM example usage `.

 ```{note}
-NIM can only be run in a Flyte cluster, not locally, as it must be deployed as a sidecar service in a Kubernetes pod.
+NIM can only be run in a Flyte cluster as it must be deployed as a sidecar service in a Kubernetes pod.
 ```

 ```{toctree}
diff --git a/examples/ollama_plugin/Dockerfile b/examples/ollama_plugin/Dockerfile
new file mode 100644
index 000000000..0c46be23a
--- /dev/null
+++ b/examples/ollama_plugin/Dockerfile
@@ -0,0 +1,23 @@
+########################
+# NOTE: For CI/CD only #
+########################
+FROM python:3.11-slim-buster
+LABEL org.opencontainers.image.source=https://github.com/flyteorg/flytesnacks
+
+WORKDIR /root
+ENV VENV /opt/venv
+ENV LANG C.UTF-8
+ENV LC_ALL C.UTF-8
+ENV PYTHONPATH /root
+
+# Install Python dependencies
+COPY requirements.in /root
+RUN pip install -r /root/requirements.in
+
+# Copy the actual code
+COPY . /root/
+
+# This tag is supplied by the build script and will be used to determine the version
+# when registering tasks, workflows, and launch plans
+ARG tag
+ENV FLYTE_INTERNAL_IMAGE $tag
diff --git a/examples/ollama_plugin/README.md b/examples/ollama_plugin/README.md
new file mode 100644
index 000000000..75b97f0fb
--- /dev/null
+++ b/examples/ollama_plugin/README.md
@@ -0,0 +1,36 @@
+(ollama_plugin)=
+
+# Ollama
+
+```{eval-rst}
+.. tags:: Inference, LLM
+```
+
+Serve large language models (LLMs) in a Flyte task.
+
+[Ollama](https://ollama.com/) simplifies the process of serving fine-tuned LLMs.
+Whether you're generating predictions from a customized model or deploying it across different hardware setups,
+Ollama enables you to encapsulate the entire workflow in a single pipeline.
+
+## Installation
+
+To use the Ollama plugin, run the following command:
+
+```
+pip install flytekitplugins-inference
+```
+
+## Example usage
+
+For a usage example, see {doc}`Ollama example usage <serve_llm>`.
+
+```{note}
+Ollama can only be run in a Flyte cluster as it must be deployed as a sidecar service in a Kubernetes pod.
+```
+
+```{toctree}
+:maxdepth: -1
+:hidden:
+
+serve_llm
+```
diff --git a/examples/ollama_plugin/ollama_plugin/__init__.py b/examples/ollama_plugin/ollama_plugin/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/ollama_plugin/ollama_plugin/serve_llm.py b/examples/ollama_plugin/ollama_plugin/serve_llm.py
new file mode 100644
index 000000000..f96ef7252
--- /dev/null
+++ b/examples/ollama_plugin/ollama_plugin/serve_llm.py
@@ -0,0 +1,99 @@
+# %% [markdown]
+# (serve_llm)=
+#
+# # Serve LLMs with Ollama
+#
+# In this guide, you'll learn how to locally serve Gemma2 and fine-tuned Llama3 models using Ollama within a Flyte task.
+#
+# Start by importing Ollama from the `flytekitplugins.inference` package and specifying the desired model name.
+#
+# Below is a straightforward example of serving a Gemma2 model:
+# %%
+from flytekit import ImageSpec, Resources, task
+from flytekit.extras.accelerators import A10G
+from flytekitplugins.inference import Model, Ollama
+from openai import OpenAI
+
+image = ImageSpec(
+    name="ollama_serve",
+    registry="ghcr.io/flyteorg",
+    packages=["flytekitplugins-inference"],
+    builder="default",
+)
+
+ollama_instance = Ollama(model=Model(name="gemma2"), gpu="1")
+
+
+@task(
+    container_image=image,
+    pod_template=ollama_instance.pod_template,
+    accelerator=A10G,
+    requests=Resources(gpu="0"),
+)
+def model_serving(user_prompt: str) -> str:
+    client = OpenAI(base_url=f"{ollama_instance.base_url}/v1", api_key="ollama")  # api key required but ignored
+
+    completion = client.chat.completions.create(
+        model="gemma2",
+        messages=[
+            {
+                "role": "user",
+                "content": user_prompt,
+            }
+        ],
+        temperature=0.5,
+        top_p=1,
+        max_tokens=1024,
+    )
+
+    return completion.choices[0].message.content
+
+
+# %% [markdown]
+# :::{important}
+# Replace `ghcr.io/flyteorg` with a container registry to which you can publish.
+# To upload the image to the local registry in the demo cluster, indicate the registry as `localhost:30000`.
+# :::
+#
+# The `model_serving` task initiates a sidecar service to serve the model, making it accessible on localhost via the `base_url` property.
+# You can use either the chat or chat completion endpoints.
+#
+# By default, Ollama initializes the server with `cpu`, `gpu`, and `mem` set to `1`, `1`, and `15Gi`, respectively.
+# You can adjust these settings to meet your requirements.
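+#
+# For example, a minimal sketch of requesting more resources for a larger model — this assumes `cpu` and `mem` are accepted as keyword arguments in the same way as `gpu` above, and the values shown are only illustrative:
+#
+# ```python
+# # Hypothetical resource overrides for the Ollama sidecar; adjust to your cluster's capacity.
+# ollama_bigger_instance = Ollama(
+#     model=Model(name="gemma2"),
+#     cpu="2",
+#     gpu="1",
+#     mem="20Gi",
+# )
+# ```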
+#
+# To serve a fine-tuned model, provide the model configuration as `modelfile` within the `Model` dataclass.
+#
+# Below is an example of specifying a fine-tuned LoRA adapter for a Llama3 Mario model:
+# %%
+from flytekit.types.file import FlyteFile
+
+finetuned_ollama_instance = Ollama(
+    model=Model(
+        name="llama3-mario",
+        modelfile="FROM llama3\nADAPTER {inputs.ggml}\nPARAMETER temperature 1\nPARAMETER num_ctx 4096\nSYSTEM {inputs.system_prompt}",
+    ),
+    gpu="1",
+)
+
+
+@task(
+    container_image=image,
+    pod_template=finetuned_ollama_instance.pod_template,
+    accelerator=A10G,
+    requests=Resources(gpu="0"),
+)
+def finetuned_model_serving(ggml: FlyteFile, system_prompt: str):
+    ...
+
+
+# %% [markdown]
+# `{inputs.ggml}` and `{inputs.system_prompt}` are materialized at run time, with `ggml` and `system_prompt` available as inputs to the task.
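+#
+# The task body is left as a stub above; a minimal sketch of querying the fine-tuned model from within it, mirroring `model_serving` (the model name must match the one given in the `Model` dataclass, and the prompt is only illustrative):
+#
+# ```python
+# # The sidecar exposes an OpenAI-compatible endpoint, so the same client pattern applies.
+# client = OpenAI(base_url=f"{finetuned_ollama_instance.base_url}/v1", api_key="ollama")
+# completion = client.chat.completions.create(
+#     model="llama3-mario",
+#     messages=[{"role": "user", "content": "Who are you?"}],
+#     temperature=0.5,
+#     max_tokens=256,
+# )
+# mario_reply = completion.choices[0].message.content
+# ```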
+#
+# Ollama models can be integrated into different stages of your AI workflow, including data pre-processing,
+# model inference, and post-processing. Flyte also allows serving multiple Ollama models simultaneously
+# on various instances.
+#
+# This integration enables you to self-host and serve AI models on your own infrastructure,
+# ensuring full control over costs and data security.
+#
+# For more detailed information on the models natively supported by Ollama, visit the [Ollama models library](https://ollama.com/library).
diff --git a/examples/ollama_plugin/requirements.in b/examples/ollama_plugin/requirements.in
new file mode 100644
index 000000000..a4a684ce6
--- /dev/null
+++ b/examples/ollama_plugin/requirements.in
@@ -0,0 +1 @@
+flytekitplugins-inference>=1.13.6b1