diff --git a/260_private_aks_bastion/Readme.md b/260_private_aks_bastion/Readme.md
index e89dacb..b27d60b 100644
--- a/260_private_aks_bastion/Readme.md
+++ b/260_private_aks_bastion/Readme.md
@@ -53,7 +53,7 @@ Once you are connected to the Azure VM, run the following command to connect to
 az login --identity
 
 # get the credentials of the AKS cluster
-az aks get-credentials -g rg-private-aks-bastion-260 -n aks-private-260
+az aks get-credentials -g rg-private-aks-bastion-260 -n aks-cluster
 
 # verify the connection
 kubectl get nodes
diff --git a/260_private_aks_bastion/aks.tf b/260_private_aks_bastion/aks.tf
index 0434920..64943f1 100644
--- a/260_private_aks_bastion/aks.tf
+++ b/260_private_aks_bastion/aks.tf
@@ -30,14 +30,4 @@ resource "azurerm_kubernetes_cluster" "aks" {
       default_node_pool.0.upgrade_settings
     ]
   }
-}
-
-resource "terraform_data" "aks-get-credentials" {
-  triggers_replace = [
-    azurerm_kubernetes_cluster.aks.id
-  ]
-
-  provisioner "local-exec" {
-    command = "az aks get-credentials -n ${azurerm_kubernetes_cluster.aks.name} -g ${azurerm_kubernetes_cluster.aks.resource_group_name} --overwrite-existing"
-  }
-}
+}
\ No newline at end of file
diff --git a/260_private_aks_bastion/install-tools.sh b/260_private_aks_bastion/install-tools.sh
index a581907..5ab52cd 100644
--- a/260_private_aks_bastion/install-tools.sh
+++ b/260_private_aks_bastion/install-tools.sh
@@ -13,6 +13,6 @@ snap install kubectl --classic
 
 # az aks list -o table
 
-# az aks get-credentials -n aks-cluster -g rg-spoke-202 --overwrite-existing
+# az aks get-credentials -g rg-private-aks-bastion-260 -n aks-private-260
 
 # kubectl get nodes
\ No newline at end of file
diff --git a/510_ai_ollama_k8s/Readme.md b/510_ai_ollama_k8s/Readme.md
new file mode 100644
index 0000000..7666002
--- /dev/null
+++ b/510_ai_ollama_k8s/Readme.md
@@ -0,0 +1,51 @@
+# Ollama AI model deployment on Azure Kubernetes Service (AKS)
+
+https://github.com/open-webui/open-webui/tree/main/kubernetes/manifest/base
+
+```sh
+AKS_RG="rg-aks-ollama-llm"
+AKS_NAME="aks-cluster"
+
+# create resource group
+az group create -n $AKS_RG -l swedencentral
+
+# create an AKS cluster
+az aks create -n $AKS_NAME -g $AKS_RG --network-plugin azure --network-plugin-mode overlay -k 1.30.3 --node-vm-size Standard_D4s_v5
+
+# get credentials
+az aks get-credentials -n $AKS_NAME -g $AKS_RG --overwrite-existing
+
+# deploy Ollama server and client app (Open-WebUI) into AKS
+kubectl apply -f .
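+
+# (optional) wait for the Ollama server and the Open WebUI client to become ready
+kubectl rollout status statefulset/ollama -n ollama --timeout=300s
+kubectl rollout status deployment/open-webui-deployment -n ollama --timeout=300s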
+
+# check the install
+kubectl get all -n ollama
+
+# install an LLM model like phi3 or llama3.1 into the ollama server
+kubectl exec ollama-0 -n ollama -it -- ollama run phi3
+
+# get the public IP of the client service
+kubectl get svc -n ollama
+```
+
+Here are some example models that can be used with `ollama` [available here](https://github.com/ollama/ollama/blob/main/README.md#model-library):
+
+| Model              | Parameters | Size  | Download                       |
+| ------------------ | ---------- | ----- | ------------------------------ |
+| Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`          |
+| Llama 3.1          | 70B        | 40GB  | `ollama run llama3.1:70b`      |
+| Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`     |
+| Phi 3 Mini         | 3.8B       | 2.3GB | `ollama run phi3`              |
+| Phi 3 Medium       | 14B        | 7.9GB | `ollama run phi3:medium`       |
+| Gemma 2            | 2B         | 1.6GB | `ollama run gemma2:2b`         |
+| Gemma 2            | 9B         | 5.5GB | `ollama run gemma2`            |
+| Gemma 2            | 27B        | 16GB  | `ollama run gemma2:27b`        |
+| Mistral            | 7B         | 4.1GB | `ollama run mistral`           |
+| Moondream 2        | 1.4B       | 829MB | `ollama run moondream`         |
+| Neural Chat        | 7B         | 4.1GB | `ollama run neural-chat`       |
+| Starling           | 7B         | 4.1GB | `ollama run starling-lm`       |
+| Code Llama         | 7B         | 3.8GB | `ollama run codellama`         |
+| Llama 2 Uncensored | 7B         | 3.8GB | `ollama run llama2-uncensored` |
+| LLaVA              | 7B         | 4.5GB | `ollama run llava`             |
+| Solar              | 10.7B      | 6.1GB | `ollama run solar`             |
+
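+As an optional check, you can also call the Ollama REST API directly, without going through Open-WebUI. The example below assumes the `phi3` model was already pulled with the `ollama run phi3` command above:
+
+```sh
+# forward the in-cluster Ollama service to localhost
+kubectl port-forward svc/ollama-service -n ollama 11434:11434
+
+# in another terminal: list the models known to the server
+curl http://localhost:11434/api/tags
+
+# generate a completion with the phi3 model
+curl http://localhost:11434/api/generate -d '{"model": "phi3", "prompt": "What is Kubernetes?", "stream": false}'
+```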
diff --git a/510_ai_ollama_k8s/namespace.yaml b/510_ai_ollama_k8s/namespace.yaml
new file mode 100644
index 0000000..3a29c01
--- /dev/null
+++ b/510_ai_ollama_k8s/namespace.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: ollama
\ No newline at end of file
diff --git a/510_ai_ollama_k8s/ollama-service.yaml b/510_ai_ollama_k8s/ollama-service.yaml
new file mode 100644
index 0000000..919510d
--- /dev/null
+++ b/510_ai_ollama_k8s/ollama-service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: ollama-service
+  namespace: ollama
+spec:
+  type: ClusterIP
+  selector:
+    app: ollama
+  ports:
+    - protocol: TCP
+      port: 11434
+      targetPort: 11434
\ No newline at end of file
diff --git a/510_ai_ollama_k8s/ollama-statefulset.yaml b/510_ai_ollama_k8s/ollama-statefulset.yaml
new file mode 100644
index 0000000..e413195
--- /dev/null
+++ b/510_ai_ollama_k8s/ollama-statefulset.yaml
@@ -0,0 +1,41 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: ollama
+  namespace: ollama
+spec:
+  serviceName: ollama
+  replicas: 1
+  selector:
+    matchLabels:
+      app: ollama
+  template:
+    metadata:
+      labels:
+        app: ollama
+    spec:
+      containers:
+        - name: ollama
+          image: ollama/ollama:latest
+          ports:
+            - containerPort: 11434
+          resources:
+            requests:
+              cpu: "2000m"
+              memory: "2Gi"
+            limits:
+              cpu: "4000m"
+              memory: "16Gi"
+              nvidia.com/gpu: "0"
+          volumeMounts:
+            - name: ollama-volume
+              mountPath: /root/.ollama
+          tty: true
+  volumeClaimTemplates:
+    - metadata:
+        name: ollama-volume
+      spec:
+        accessModes: [ "ReadWriteOnce" ]
+        resources:
+          requests:
+            storage: 90Gi
\ No newline at end of file
diff --git a/510_ai_ollama_k8s/webui-deployment.yaml b/510_ai_ollama_k8s/webui-deployment.yaml
new file mode 100644
index 0000000..6267b78
--- /dev/null
+++ b/510_ai_ollama_k8s/webui-deployment.yaml
@@ -0,0 +1,40 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: open-webui-deployment
+  namespace: ollama
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: open-webui
+  template:
+    metadata:
+      labels:
+        app: open-webui
+    spec:
+      containers:
+        - name: open-webui
+          image: ghcr.io/open-webui/open-webui:main
+          ports:
+            - containerPort: 8080
+          resources:
+            requests:
+              cpu: "500m"
+              memory: "500Mi"
+            limits:
+              cpu: "1000m"
+              memory: "1Gi"
+          env:
+            - name: OLLAMA_BASE_URL
+              value: "http://ollama-service.ollama.svc.cluster.local:11434"
+            - name: WEBUI_AUTH
+              value: "False"
+          tty: true
+          volumeMounts:
+            - name: webui-volume
+              mountPath: /app/backend/data
+      volumes:
+        - name: webui-volume
+          persistentVolumeClaim:
+            claimName: open-webui-pvc
\ No newline at end of file
diff --git a/510_ai_ollama_k8s/webui-ingress.yaml b/510_ai_ollama_k8s/webui-ingress.yaml
new file mode 100644
index 0000000..d9a85ff
--- /dev/null
+++ b/510_ai_ollama_k8s/webui-ingress.yaml
@@ -0,0 +1,20 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: open-webui-ingress
+  namespace: ollama
+  #annotations:
+    # Use appropriate annotations for your Ingress controller, e.g., for NGINX:
+    # nginx.ingress.kubernetes.io/rewrite-target: /
+spec:
+  rules:
+    - host: open-webui.minikube.local
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: open-webui-service
+                port:
+                  number: 80
\ No newline at end of file
diff --git a/510_ai_ollama_k8s/webui-pvc.yaml b/510_ai_ollama_k8s/webui-pvc.yaml
new file mode 100644
index 0000000..8c2da1f
--- /dev/null
+++ b/510_ai_ollama_k8s/webui-pvc.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  labels:
+    app: open-webui
+  name: open-webui-pvc
+  namespace: ollama
+spec:
+  accessModes: ["ReadWriteOnce"]
+  resources:
+    requests:
+      storage: 2Gi
\ No newline at end of file
diff --git a/510_ai_ollama_k8s/webui-service.yaml b/510_ai_ollama_k8s/webui-service.yaml
new file mode 100644
index 0000000..163515c
--- /dev/null
+++ b/510_ai_ollama_k8s/webui-service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: open-webui-service
+  namespace: ollama
+spec:
+  type: LoadBalancer # NodePort # Use LoadBalancer if you're on a cloud that supports it
+  selector:
+    app: open-webui
+  ports:
+    - protocol: TCP
+      port: 80
+      targetPort: 8080
+      # If using NodePort, you can optionally specify the nodePort:
+      # nodePort: 30000
diff --git a/_kaito/app/app.py b/_kaito/app/app.py
new file mode 100644
index 0000000..7fbac4d
--- /dev/null
+++ b/_kaito/app/app.py
@@ -0,0 +1,57 @@
+from openai import AzureOpenAI
+# from openai import OpenAI
+import streamlit as st
+
+with st.sidebar:
+    openai_api_key = st.text_input(
+        "OpenAI API Key", key="chatbot_api_key", type="password"
+    )
+    "[Get an OpenAI API key](https://platform.openai.com/account/api-keys)"
+    "[View the source code](https://github.com/streamlit/llm-examples/blob/main/Chatbot.py)"
+    "[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/streamlit/llm-examples?quickstart=1)"
+
+st.title("💬 Chatbot")
+st.caption("🚀 A Streamlit chatbot powered by OpenAI")
+
+if "messages" not in st.session_state:
+    st.session_state["messages"] = [
+        {"role": "assistant", "content": "How can I help you?"}
+    ]
+
+for msg in st.session_state.messages:
+    st.chat_message(msg["role"]).write(msg["content"])
+
+if prompt := st.chat_input():
+    if not openai_api_key:
+        st.info("Please add your OpenAI API key to continue.")
+        st.stop()
+
+    openai_client = AzureOpenAI(
+        azure_endpoint="https://swedencentral.api.cognitive.microsoft.com",
+        api_key=openai_api_key,
+        api_version="2024-06-01",
+    )
+    # client = OpenAI(api_key=openai_api_key)
+
+    st.session_state.messages.append({"role": "user", "content": prompt})
+
+    st.chat_message("user").write(prompt)
+
+    response = openai_client.chat.completions.create(
+        model="gpt-4o",
+        messages=st.session_state.messages
+        # messages=[
+        #     {"role": "system", "content": "You are a helpful assistant."},
+        #     {"role": "user", "content": "Who are you ?"},
+        # ],
+    )
+
+    # response = client.chat.completions.create(
+    #     model="gpt-3.5-turbo", messages=st.session_state.messages
+    # )
+
+    msg = response.choices[0].message.content
+
+    st.session_state.messages.append({"role": "assistant", "content": msg})
+
+    st.chat_message("assistant").write(msg)
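+
+# To try the chatbot locally (example):
+#   pip install -r requirements.txt
+#   streamlit run app.py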
diff --git a/_kaito/app/flaskapp.py b/_kaito/app/flaskapp.py
new file mode 100644
index 0000000..7fad2b4
--- /dev/null
+++ b/_kaito/app/flaskapp.py
@@ -0,0 +1,23 @@
+from flask import Flask, request, jsonify
+from openai import AzureOpenAI
+
+app = Flask(__name__)
+
+# Set your Azure OpenAI API key (endpoint and api_version mirror app.py)
+client = AzureOpenAI(
+    azure_endpoint="https://swedencentral.api.cognitive.microsoft.com",
+    api_key='YOUR_AZURE_OPENAI_API_KEY',
+    api_version="2024-06-01",
+)
+
+@app.route('/chat', methods=['POST'])
+def chat():
+    user_input = request.json.get('message')
+    response = client.chat.completions.create(
+        model="gpt-4o",  # Updated to use GPT-4o
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": user_input}
+        ],
+        max_tokens=150
+    )
+    return jsonify(response.choices[0].message.content.strip())
+
+if __name__ == '__main__':
+    app.run(debug=True)
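+
+# Example request against the local dev server (Flask default port 5000):
+#   curl -X POST http://localhost:5000/chat -H "Content-Type: application/json" -d '{"message": "What is AKS?"}'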
diff --git a/_kaito/app/requirements.txt b/_kaito/app/requirements.txt
new file mode 100644
index 0000000..b03829e
--- /dev/null
+++ b/_kaito/app/requirements.txt
@@ -0,0 +1,8 @@
+streamlit>=1.28
+langchain>=0.0.217
+openai>=1.2
+duckduckgo-search
+anthropic>=0.3.0
+trubrics>=1.4.3
+streamlit-feedback
+langchain-community
\ No newline at end of file
diff --git a/_kaito/commands.ps1 b/_kaito/commands.ps1
index 58cadb3..24863f8 100644
--- a/_kaito/commands.ps1
+++ b/_kaito/commands.ps1
@@ -1,6 +1,8 @@
+# https://learn.microsoft.com/en-us/azure/aks/ai-toolchain-operator
+
 $AZURE_SUBSCRIPTION_ID=$(az account show --query id -o tsv)
-$AZURE_RESOURCE_GROUP="rg-kaito"
-$AZURE_LOCATION="swedencentral"
+$AZURE_RESOURCE_GROUP="rg-aks-kaito-frc"
+$AZURE_LOCATION="francecentral" # "swedencentral"
 $CLUSTER_NAME="aks-cluster"
 
 az group create --name $AZURE_RESOURCE_GROUP --location $AZURE_LOCATION
@@ -34,6 +36,6 @@ kubectl rollout restart deployment/kaito-gpu-provisioner -n kube-system
 kubectl get deployment -n kube-system | grep kaito
 
 # Deploy the Falcon 7B-instruct model from the KAITO model repository using the kubectl apply command.
-kubectl apply -f https://raw.githubusercontent.com/Azure/kaito/main/examples/kaito_workspace_falcon_7b-instruct.yaml
+kubectl apply -f https://raw.githubusercontent.com/Azure/kaito/main/examples/inference/kaito_workspace_falcon_7b-instruct.yaml
 
 kubectl get workspace workspace-falcon-7b-instruct -w
\ No newline at end of file
diff --git a/_kaito/kaito_workspace_falcon_7b-instruct-d4s-v5.yaml b/_kaito/kaito_workspace_falcon_7b-instruct-d4s-v5.yaml
new file mode 100644
index 0000000..76d7ad4
Binary files /dev/null and b/_kaito/kaito_workspace_falcon_7b-instruct-d4s-v5.yaml differ
diff --git a/_kaito/kaito_workspace_falcon_7b-instruct.yaml b/_kaito/kaito_workspace_falcon_7b-instruct.yaml
index 1b1efba..651c6b0 100644
Binary files a/_kaito/kaito_workspace_falcon_7b-instruct.yaml and b/_kaito/kaito_workspace_falcon_7b-instruct.yaml differ
diff --git a/_kaito/ollama-deploy-cpu.yaml b/_kaito/ollama-deploy-cpu.yaml
new file mode 100644
index 0000000..ea861d6
--- /dev/null
+++ b/_kaito/ollama-deploy-cpu.yaml
@@ -0,0 +1,43 @@
+# https://github.com/ollama/ollama/blob/main/examples/kubernetes/cpu.yaml
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: ollama
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ollama
+  namespace: ollama
+spec:
+  selector:
+    matchLabels:
+      name: ollama
+  template:
+    metadata:
+      labels:
+        name: ollama
+    spec:
+      containers:
+      - name: ollama
+        image: ollama/ollama:latest
+        ports:
+        - name: http
+          containerPort: 11434
+          protocol: TCP
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: ollama
+  namespace: ollama
+spec:
+  type: ClusterIP
+  selector:
+    name: ollama
+  ports:
+  - port: 80
+    name: http
+    targetPort: http
+    protocol: TCP
\ No newline at end of file
diff --git a/_kaito/ollama-deploy-gpu.yaml b/_kaito/ollama-deploy-gpu.yaml
new file mode 100644
index 0000000..8af28aa
--- /dev/null
+++ b/_kaito/ollama-deploy-gpu.yaml
@@ -0,0 +1,59 @@
+# https://github.com/ollama/ollama/blob/main/examples/kubernetes/gpu.yaml
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: ollama
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ollama
+  namespace: ollama
+spec:
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      name: ollama
+  template:
+    metadata:
+      labels:
+        name: ollama
+    spec:
+      containers:
+      - name: ollama
+        image: ollama/ollama:latest
+        env:
+        - name: PATH
+          value: /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+        - name: LD_LIBRARY_PATH
+          value: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
+        - name: NVIDIA_DRIVER_CAPABILITIES
+          value: compute,utility
+        ports:
+        - name: http
+          containerPort: 11434
+          protocol: TCP
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+      tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: ollama
+  namespace: ollama
+spec:
+  type: ClusterIP
+  selector:
+    name: ollama
+  ports:
+  - port: 80
+    name: http
+    targetPort: http
+    protocol: TCP
\ No newline at end of file
diff --git a/_kaito/ollama.ps1 b/_kaito/ollama.ps1
new file mode 100644
index 0000000..154fe29
--- /dev/null
+++ b/_kaito/ollama.ps1
@@ -0,0 +1,7 @@
+# https://ollama.com/blog/ollama-is-now-available-as-an-official-docker-image
+
+# CPU only
+docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
+
+# Now you can run a model like Llama 2 inside the container.
+docker exec -it ollama ollama run llama2
\ No newline at end of file