From 50c49f58dc0900960a44779a4281c9d8686fc7da Mon Sep 17 00:00:00 2001 From: Swarnim Arun Date: Mon, 29 Apr 2024 15:29:58 +0530 Subject: [PATCH 1/2] docs: update use llama-server instead I created llama-server image over the weekend, it's very small, simple and entirely static. No python or interpreted languages here, also it uses the .cache dir, as long as it's volume mounted you can easily cache it. --- docs/guides/langchain.md | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/docs/guides/langchain.md b/docs/guides/langchain.md index 432c7ec..7bdbfb5 100644 --- a/docs/guides/langchain.md +++ b/docs/guides/langchain.md @@ -42,9 +42,17 @@ spec: template: spec: containers: - - name: "ai-model-7b" - image: modelzai/llm-llama-7b:latest # GPU requirements: A100(40GB) - # alternatively use, modelzai/llm-bloomz-560m:latest (even works on CPU) + - name: "ai-model" + image: swarnimarun/llama-server:latest-cuda + # GPU requirements: T4(16GB) + # For CPU : "swarnimarun/llama-server:latest" - 16GB + args: + - "-m" + - "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF" + - "-g" + - "33" + - "-q" + - "q8" accelerator: interface: "CUDA" minVersion: @@ -52,13 +60,13 @@ spec: resources: limits: cpu: "1" - memory: "16Gi" # a decent amount of RAM is required for loading the model as well + memory: "2Gi" # some amount of RAM is required for loading the model as well, for cpu use atleast 16GB of RAM ``` - Port forward the deployment service. If you don't have a proper ingress setup for your cluster. ```bash -kubectl port-forward service/aideployment 8000:8000 +kubectl port-forward service/aideployment 80:8000 ``` - Now locally, install the required libraries. 
@@ -73,11 +81,11 @@ pip install langchain openai import os import openai -# note: we port-forwarded the service to 8000 -openai.api_base="http://localhost:8000" +# note: we port-forwarded the service to 80 aka http +openai.api_base="http://localhost" # if you have ingress setup then use your domain name # you can also modify the port to use http(s) port itself -# openai.api_base="https://.tld:8000" +# openai.api_base="https://.tld" openai.api_key = "any" ``` From 6d5be25edd3eea2d0003703f0c399491f097fbc2 Mon Sep 17 00:00:00 2001 From: Swarnim Arun Date: Mon, 6 May 2024 10:50:26 +0530 Subject: [PATCH 2/2] docs: explain issues with low RAM --- docs/guides/langchain.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/guides/langchain.md b/docs/guides/langchain.md index 7bdbfb5..503e3f4 100644 --- a/docs/guides/langchain.md +++ b/docs/guides/langchain.md @@ -60,7 +60,9 @@ spec: resources: limits: cpu: "1" - memory: "2Gi" # some amount of RAM is required for loading the model as well, for cpu use atleast 16GB of RAM + memory: "2Gi" # loading of model maybe slow or buggy for large models with low RAM + # for faster initial loading of large models increase to at least 8GB of RAM + # if you want to use CPU inference, use at least 16GB of RAM for 7B models ``` - Port forward the deployment service. If you don't have a proper ingress setup for your cluster.