add ollama docs #1722

Merged · 9 commits · Sep 19, 2024
Changes from 6 commits
1 change: 1 addition & 0 deletions docs/index.md
@@ -120,6 +120,7 @@ auto_examples/mmcloud_agent/index
auto_examples/modin_plugin/index
auto_examples/kfmpi_plugin/index
auto_examples/nim_plugin/index
auto_examples/ollama_plugin/index
auto_examples/onnx_plugin/index
auto_examples/openai_batch_agent/index
auto_examples/papermill_plugin/index
2 changes: 2 additions & 0 deletions docs/integrations.md
@@ -44,6 +44,8 @@ Flytekit functionality. These plugins can be anything and for comparison can be
- `wandb`: Machine learning platform to build better models faster.
* - {doc}`NIM <auto_examples/nim_plugin/index>`
- Serve optimized model containers with NIM.
* - {doc}`Ollama <auto_examples/ollama_plugin/index>`
- Serve fine-tuned LLMs with Ollama in a Flyte workflow.
```

:::{dropdown} {fa}`info-circle` Using flytekit plugins
2 changes: 1 addition & 1 deletion examples/nim_plugin/README.md
@@ -29,7 +29,7 @@ pip install flytekitplugins-inference
For a usage example, see {doc}`NIM example usage <serve_nim_container>`.

```{note}
- NIM can only be run in a Flyte cluster, not locally, as it must be deployed as a sidecar service in a Kubernetes pod.
+ NIM can only be run in a Flyte cluster as it must be deployed as a sidecar service in a Kubernetes pod.
```

```{toctree}
23 changes: 23 additions & 0 deletions examples/ollama_plugin/Dockerfile
@@ -0,0 +1,23 @@
########################
# NOTE: For CI/CD only #
########################
FROM python:3.11-slim-buster
LABEL org.opencontainers.image.source=https://github.com/flyteorg/flytesnacks

WORKDIR /root
ENV VENV /opt/venv
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
ENV PYTHONPATH /root

# Install Python dependencies
COPY requirements.in /root
RUN pip install -r /root/requirements.in

# Copy the actual code
COPY . /root/

# This tag is supplied by the build script and will be used to determine the version
# when registering tasks, workflows, and launch plans
ARG tag
ENV FLYTE_INTERNAL_IMAGE $tag
36 changes: 36 additions & 0 deletions examples/ollama_plugin/README.md
@@ -0,0 +1,36 @@
(ollama_plugin)=

# Ollama

```{eval-rst}
.. tags:: Inference, LLM
```

Serve large language models (LLMs) in a Flyte task.

[Ollama](https://ollama.com/) simplifies the process of serving fine-tuned LLMs.
Whether you're generating predictions from a customized model or deploying it across different hardware setups,
Ollama enables you to encapsulate the entire workflow in a single pipeline.

## Installation

To use the Ollama plugin, run the following command:

```
pip install flytekitplugins-inference
```

## Example usage

For a usage example, see {doc}`Ollama example usage <serve_llm>`.

```{note}
Ollama can only be run in a Flyte cluster as it must be deployed as a sidecar service in a Kubernetes pod.
```

```{toctree}
:maxdepth: -1
:hidden:

serve_llm
```
Empty file: examples/ollama_plugin/ollama_plugin/__init__.py
99 changes: 99 additions & 0 deletions examples/ollama_plugin/ollama_plugin/serve_llm.py
@@ -0,0 +1,99 @@
# %% [markdown]
# (serve_llm)=
#
# # Serve LLMs with Ollama
#
# In this guide, you'll learn how to locally serve Gemma2 and fine-tuned Llama3 models using Ollama within a Flyte task.
#
# Start by importing Ollama from the `flytekitplugins.inference` package and specifying the desired model name.
#
# Below is a straightforward example of serving a Gemma2 model:
# %%
from flytekit import ImageSpec, Resources, task
from flytekit.extras.accelerators import A10G
from flytekitplugins.inference import Model, Ollama
from openai import OpenAI

image = ImageSpec(
name="ollama_serve",
registry="ghcr.io/flyteorg",
packages=["flytekitplugins-inference"],
builder="default",
)

ollama_instance = Ollama(model=Model(name="gemma2"), gpu="1")


@task(
container_image=image,
pod_template=ollama_instance.pod_template,
accelerator=A10G,
requests=Resources(gpu="0"),
)
def model_serving(user_prompt: str) -> str:
client = OpenAI(base_url=f"{ollama_instance.base_url}/v1", api_key="ollama") # api key required but ignored

completion = client.chat.completions.create(
model="gemma2",
messages=[
{
"role": "user",
"content": user_prompt,
}
],
temperature=0.5,
top_p=1,
max_tokens=1024,
)

return completion.choices[0].message.content


# %% [markdown]
# :::{important}
# Replace `ghcr.io/flyteorg` with a container registry to which you can publish.
# To upload the image to the local registry in the demo cluster, indicate the registry as `localhost:30000`.
# :::
#
# The `model_serving` task initiates a sidecar service to serve the model, making it accessible on localhost via the `base_url` property.
# You can use either the chat or chat completion endpoints.
#
# By default, Ollama initializes the server with `cpu`, `gpu`, and `mem` set to `1`, `1`, and `15Gi`, respectively.
# You can adjust these settings to meet your requirements.
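#
# For instance, a hypothetical instance that requests more memory might look like the
# following sketch (the `cpu` and `mem` keyword names are assumed from the defaults
# listed above rather than taken from this example):
#
# ```python
# bigger_ollama_instance = Ollama(model=Model(name="gemma2"), cpu="2", gpu="1", mem="24Gi")
# ```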
#
# To serve a fine-tuned model, provide the model configuration as `modelfile` within the `Model` dataclass.
#
# Below is an example of specifying a fine-tuned LoRA adapter for a Llama3 Mario model:
# %%
from flytekit.types.file import FlyteFile

finetuned_ollama_instance = Ollama(
model=Model(
name="llama3-mario",
modelfile="FROM llama3\nADAPTER {inputs.ggml}\nPARAMETER temperature 1\nPARAMETER num_ctx 4096\nSYSTEM {inputs.system_prompt}",
),
gpu="1",
)


@task(
container_image=image,
pod_template=finetuned_ollama_instance.pod_template,
accelerator=A10G,
requests=Resources(gpu="0"),
)
def finetuned_model_serving(ggml: FlyteFile, system_prompt: str) -> str:
...
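    # A minimal sketch of what the task body might contain (not part of the original
    # commit, which leaves it as `...`): query the fine-tuned model through the sidecar's
    # OpenAI-compatible endpoint, assuming it is registered under the `llama3-mario` name
    # given above.
    client = OpenAI(base_url=f"{finetuned_ollama_instance.base_url}/v1", api_key="ollama")  # api key required but ignored

    completion = client.chat.completions.create(
        model="llama3-mario",
        messages=[{"role": "user", "content": "Who's the most famous plumber in the Mushroom Kingdom?"}],
        temperature=0.5,
        top_p=1,
        max_tokens=1024,
    )

    return completion.choices[0].message.content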


# %% [markdown]
# `{inputs.ggml}` and `{inputs.system_prompt}` are materialized at run time, with `ggml` and `system_prompt` available as inputs to the task.
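#
# For instance, with a `system_prompt` of "You're a kitty. Answer using kitty sounds.",
# the rendered modelfile would look roughly like the following (the adapter path is an
# illustrative local path to which the `FlyteFile` is downloaded at run time):
#
# ```
# FROM llama3
# ADAPTER /root/llama3-mario.ggml
# PARAMETER temperature 1
# PARAMETER num_ctx 4096
# SYSTEM You're a kitty. Answer using kitty sounds.
# ```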
#
# Ollama models can be integrated into different stages of your AI workflow, including data pre-processing,
# model inference, and post-processing. Flyte also allows serving multiple Ollama models simultaneously
# on various instances.
#
# This integration enables you to self-host and serve AI models on your own infrastructure,
# ensuring full control over costs and data security.
#
# For more detailed information on the models natively supported by Ollama, visit the [Ollama models library](https://ollama.com/library).
1 change: 1 addition & 0 deletions examples/ollama_plugin/requirements.in
@@ -0,0 +1 @@
flytekitplugins-inference
13 changes: 13 additions & 0 deletions modelfile
@@ -0,0 +1,13 @@
FROM llama3:latest
TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>

{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>

{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>

{{ .Response }}<|eot_id|>"""
SYSTEM "You're a kitty. Answer using kitty sounds."
PARAMETER stop "<|start_header_id|>"
PARAMETER stop "<|end_header_id|>"
PARAMETER stop "<|eot_id|>"
PARAMETER stop "<|reserved_special_token"