From 47d0f4ab00bf342b99cce233c3632cadc47cb7a0 Mon Sep 17 00:00:00 2001 From: Swarnim Arun Date: Thu, 4 Apr 2024 16:54:05 +0530 Subject: [PATCH] format markdown and remove redundant files --- controllers/aideployment/scheduling.go | 24 ++++---- docs/contributing.md | 28 +++++----- docs/deployment.md | 15 +++-- docs/developer_guide.md | 68 +++++++++++++++-------- docs/vllm.md | 77 ++++++++++++-------------- examples/cpu.yaml | 20 +++++++ examples/generic.yaml | 2 - examples/models.yaml | 5 +- examples/test.yaml | 31 ----------- examples/testingress.yaml | 46 --------------- hack/.keep | 1 + script/install_gpu_operator_k3s.sh | 2 +- 12 files changed, 142 insertions(+), 177 deletions(-) create mode 100644 examples/cpu.yaml delete mode 100644 examples/test.yaml delete mode 100644 examples/testingress.yaml create mode 100644 hack/.keep diff --git a/controllers/aideployment/scheduling.go b/controllers/aideployment/scheduling.go index 5b5cfe5..13da1f5 100644 --- a/controllers/aideployment/scheduling.go +++ b/controllers/aideployment/scheduling.go @@ -35,33 +35,31 @@ func addTopologySpread(tmpl *v1.PodTemplateSpec) { }) } -func neededGPUs(deploy a1.Deployment) (resource.Quantity, error) { - gpus := resource.MustParse("0") +func neededNvidiaGPUs(deploy a1.Deployment) (resource.Quantity, error) { + zerogpus := resource.MustParse("0") if deploy.Accelerator == nil { if deploy.Resources.Requests == nil { - return gpus, nil + return zerogpus, nil } if _, ok := deploy.Resources.Requests[constants.NvidiaGPULabel]; ok { - return gpus, fmt.Errorf("deployment requests Nvidia GPU but no accelerator is specified") + return zerogpus, fmt.Errorf("deployment requests Nvidia GPU but no accelerator is specified") } else { - return gpus, nil + return zerogpus, nil } } // If you add non-Nvidia accelerators, remember to set the pod runtime class name if deploy.Accelerator.Interface != a1.AcceleratorInterfaceCUDA { - return gpus, fmt.Errorf("unsupported accelerator interface: %s", deploy.Accelerator.Interface) + return zerogpus, fmt.Errorf("unsupported accelerator interface: %s", deploy.Accelerator.Interface) } if gpusSpec, ok := deploy.Resources.Requests[constants.NvidiaGPULabel]; ok { return gpusSpec, nil } - gpus.Add(resource.MustParse("1")) - - return gpus, nil + return resource.MustParse("1"), nil } func findContainerEngine(appDeployment *appsv1.Deployment) (engineContainer *v1.Container) { @@ -80,7 +78,7 @@ func AddSchedulingProperties(appDeployment *appsv1.Deployment, AIDeployment a1.A pod := &appDeployment.Spec.Template.Spec pod.NodeSelector = utils.MergeMaps(pod.NodeSelector, AIDeployment.Deployment.NodeSelector) - gpus, err := neededGPUs(AIDeployment.Deployment) + nvidiaGpus, err := neededNvidiaGPUs(AIDeployment.Deployment) if err != nil { return err } @@ -100,9 +98,9 @@ func AddSchedulingProperties(appDeployment *appsv1.Deployment, AIDeployment a1.A AIDeployment.Deployment.Resources.Limits, ) - if !gpus.IsZero() { - engineContainer.Resources.Requests[constants.NvidiaGPULabel] = gpus - engineContainer.Resources.Limits[constants.NvidiaGPULabel] = gpus + if !nvidiaGpus.IsZero() { + engineContainer.Resources.Requests[constants.NvidiaGPULabel] = nvidiaGpus + engineContainer.Resources.Limits[constants.NvidiaGPULabel] = nvidiaGpus if pod.RuntimeClassName == nil { runtimeClassName := "nvidia" diff --git a/docs/contributing.md b/docs/contributing.md index e809be8..94510f0 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -18,31 +18,33 @@ We welcome all contributions, including bug fixes, feature 
requests, documentati - This approach ensures that your efforts are in line with the project's goals and have a higher chance of being accepted. - However, if you prefer to submit a PR directly, please be aware that it is possible not be reviewed. - ## Getting Started 1. **Fork the repository** - Start by forking the project repository to your GitHub account. This creates a personal copy for you to work on. 2. **Clone the forked repository** - Clone your fork to your local machine to start making changes. -```bash - git clone https://github.com/YOUR_USERNAME/YOUR_FORKED_REPO.git -``` + ```bash + git clone https://github.com/YOUR_USERNAME/YOUR_FORKED_REPO.git + ``` + 3. **Create a branch** - Create a new branch for the changes you want to make. Use a descriptive branch name. -```bash - git checkout -b branch-name -``` + ```bash + git checkout -b branch-name + ``` + 4. **Make changes** - Make your changes to the codebase. 5. **Commit changes** - Commit your changes with a descriptive commit message. -```bash - git commit -m 'commit message' -``` + ```bash + git commit -m 'commit message' + ``` + 6. **Push changes** - Push your changes to your forked repository. -```bash - git push origin branch-name -``` + ```bash + git push origin branch-name + ``` Before making any changes, please make sure to check out our [Developer Guide](./developer_guide.md) for detailed instructions on code standards, testing procedures, and more to ensure your contributions align with our project's standards. Check out detailed instructions on [GitHub Workflow Guide.](https://github.com/kubernetes/community/blob/master/contributors/guide/github-workflow.md) diff --git a/docs/deployment.md b/docs/deployment.md index 2c8758f..92d9792 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -5,22 +5,29 @@ - K8s cluster The Operator can be run without the following, but some features will be absent: + - Helm - Ingress controller (e.g. Traefik) - Nvidia GPU Operator ## Artifacts + Prem-Operator has three artifacts: + 1. [Source Code](https://github.com/premAI-io/saas-controller) -1. [Prem-Operator Docker image](#container-images) -1. [Prem-Operator Helm chart](https://hub.docker.com/r/premai/prem-operator-chart) +2. [Prem-Operator Docker image](#container-images) +3. [Prem-Operator Helm chart](https://hub.docker.com/r/premai/prem-operator-chart) ## Installation + After setting up K8s cluster, you can optionally install [Nvidia GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html) and [Traefik](https://doc.traefik.io/traefik/getting-started/install-traefik/#use-the-helm-chart) as an ingress controller. -Note that Nvidia GPU Operator is required for GPU support and Traefik can be used for handling ingress traffic. + +Note that Nvidia GPU Operator is required for GPU support as they are considered extended resource, [docs](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#extended-resources). Also for non-nvidia gpus you will require separate device plugins. +We use Traefik for handling ingress traffic for tests and deployments, but any ingress controller should work. 
 Now install Prem-Operator using Helm:
+
 ```bash
-$ helm install oci://registry-1.docker.io/premai/prem-operator-chart
+helm install prem-operator oci://registry-1.docker.io/premai/prem-operator-chart
 ```
 
 ### Flux
diff --git a/docs/developer_guide.md b/docs/developer_guide.md
index 13bf05b..edc06af 100644
--- a/docs/developer_guide.md
+++ b/docs/developer_guide.md
@@ -3,38 +3,43 @@
 ## Project Overview
 
 This project aims to follow the Kubernetes [Operator pattern](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/)
-It uses [Controllers](https://kubernetes.io/docs/concepts/architecture/controller/)
-which provides a reconcile function responsible for synchronizing resources until the desired state is reached on the cluster
+It consists of [Controllers](https://kubernetes.io/docs/concepts/architecture/controller/) which provide a reconcile function
+responsible for synchronizing resources until the desired state is reached on the cluster.
+
 The major components are:
-1. **AI Deployment Custom Resource with Controller**:
- AIDeployment is a custom Kubernetes resource that encapsulates the configuration necessary for deploying and managing AI models within a Kubernetes cluster. It allows users to specify details about the AI engine, model parameters, computational resource requirements, networking settings like endpoints, services, and ingresses, as well as environmental variables and arguments for model deployment. This resource aims to streamline the deployment process of AI models, making it easier to manage, scale, and update AI deployments in a cloud-native ecosystem. -2. **AIModelMap Custom Resource with Controller**:
- The AIModelMap Custom Resource (CR) is a Kubernetes resource defined to facilitate the mapping and management of artificial intelligence (AI) model specifications across various execution engines, such as TensorRT, DeepSpeed-Mii, LocalAI, and VLLM. It allows for the specification of key details like the model's data type, engine configuration, quantization settings, and access URIs, alongside variant-specific configurations. By serving as a centralized repository for model specifications, AIModelMap enables consistent, efficient, and scalable deployment of AI models across multiple Kubernetes deployments, streamlining the management of model configurations and fostering reuse and flexibility in AI deployments within the Kubernetes ecosystem. -3. **AutoNodeLabeler with Controller**:
- The AutoNodeLabeler is a Kubernetes custom resource (CR) designed to automatically apply labels to nodes based on specified criteria, such as hardware configurations. It enables precise and dynamic scheduling of workloads by labeling nodes with specific attributes like GPU types and sizes. This CR facilitates efficient resource utilization, cluster segmentation, and automation in node labeling, improving both cluster management and workload performance.
+
+1. **AI Deployment Custom Resource (CR) with Controller**:
+   AIDeployment is a custom Kubernetes resource that encapsulates the configuration necessary for deploying and managing AI models within a Kubernetes cluster. It allows users to specify details about the AI engine, model weights, computational resource requirements, networking settings like endpoints, services, and ingresses, as well as environmental variables and arguments for model deployment. This resource aims to streamline the deployment process of AI models, making it easier to manage, scale, and update AI deployments in a cloud-native ecosystem.
+2. **AIModelMap Custom Resource with Controller**:
+   The AIModelMap is a Kubernetes resource that facilitates the management of AI model specifications across various execution engines
+   for AIDeployment(s) to use, such as TensorRT, DeepSpeed-MII, LocalAI, and vLLM.
+   It captures key details about the model, such as the data type, engine configuration, quantization settings, and access URIs,
+   alongside variant-specific configurations that can be pre-defined per model variant and reused across deployments.
+3. **AutoNodeLabeler with Controller**:
+   The AutoNodeLabeler is a Kubernetes resource designed to automatically apply labels to nodes based on specified criteria, such as hardware configurations. It helps with precise scheduling of workloads by labeling nodes with specific attributes like GPU types and sizes, which is often useful for setting up features such as MIG on GPU-accelerated nodes.
 
 ## Requirements
 
-- Docker
-- Kind
+- Go (>= 1.21)
+- Docker Engine (>= v23.0)
+- Kind (>= v0.20.0)
 - Kubectl
 - Kustomize
-- Go
 - Helm
 
 ## Development and Testing
 
-The Prem-Operator needs to be developed and tested in a Kubernetes environment. For local testing, we recommend using [KIND](https://sigs.k8s.io/kind), which allows you to run a Kubernetes cluster within Docker containers.
+The Prem-Operator needs to be developed and tested in a Kubernetes environment. For local testing, we recommend using [KIND](https://sigs.k8s.io/kind), which allows you to run a Kubernetes cluster within Docker containers. Note that you should prefer testing with CPU-based models, as KIND does not properly support GPUs at the time of writing.
 
-To facilitate the development process, we provide various Makefile targets leveraging KIND as the Kubernetes cluster. Run make --help to see all available make targets.
+To facilitate the development process, we provide various Makefile targets leveraging KIND as the Kubernetes cluster. Run make `--help` to see all available targets. Please note that certain components, like vLLM engines, require GPU support for execution. As KIND does not offer GPU support, alternatives like K3s or any Kubernetes cluster with GPU capabilities should be considered for these cases. For detailed instructions on running Mistral on K3s with vLLM from the Prem-Operator source code, refer to [this guide](./vllm.md) for more information. -#### Installing Prem-Operator: Process Overview +### Installing Prem-Operator: Process Overview Installing the Prem-Operator involves a series of steps designed to prepare your local development environment for both development and testing. Here’s what happens at each step: -- **Install KIND:** KIND (Kubernetes IN Docker) is a tool for running local Kubernetes clusters using Docker container “nodes.” KIND is particularly useful for development and testing of Kubernetes applications like Prem-Operator. The installation sets up KIND on your machine, enabling you to create a local cluster. +- **Install KIND:** KIND (Kubernetes "in" Docker) is a tool for running local Kubernetes clusters using Docker container “nodes.” KIND is particularly useful for development and testing of Kubernetes applications like Prem-Operator. The installation sets up KIND on your machine, enabling you to create a local cluster. - **Create a KIND Cluster:** This step initializes a new Kubernetes cluster on your local machine using KIND. The cluster simulates a real Kubernetes environment, allowing you to test Prem-Operator in conditions resembling its intended runtime environment. @@ -46,33 +51,46 @@ Installing the Prem-Operator involves a series of steps designed to prepare your - **Deploy the Prem-Operator Controller:** Finally, this step deploys the Prem-Operator controller into your Kubernetes cluster. The controller is the core component that monitors for changes to resources and applies the necessary logic to react to these changes, effectively managing the Prem-Operator's operational logic within the cluster. +#### Install Kind -#### Install Kind To install KIND, go to [official page](https://kind.sigs.k8s.io/docs/user/quick-start/#installation) and follow the instructions for your operating system, or run the following script for Ubuntu: + ```bash ./../script/install_kind.sh ``` #### Create a Kind cluster with Traefik, build prem-operator Docker image and load it to Kind cluster + ```bash make kind-setup ``` #### Install CRDs into the cluster + ```bash make install ``` #### Deploy prem-operator controller + ```bash make deploy ``` -#### Run test +#### Run test + ```bash make test ``` +#### Regenerate boilerplate code for types + +Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations for the types to be used in the manifest. + +```sh +make generate +``` + #### Modifying the API definitions If you are editing the API definitions, generate the manifests such as CRs or CRDs using: @@ -81,7 +99,6 @@ If you are editing the API definitions, generate the manifests such as CRs or CR make manifests ``` - #### Uninstall CRDs To delete the CRDs from the cluster: @@ -99,19 +116,22 @@ make undeploy ``` ## Run AI Model inside Engine + Check [examples](./../examples) of AI Model deployment inside different Engines. 
Below is an example of deploying tinyllama inside the LocalAI engine:
-```
+
+```bash
 kubectl apply -f examples/cpu.yaml
 ```
 
 After deploying you can infer the model using the following curl command:
-```
+
+```bash
 curl http://foo.127.0.0.1.nip.io:8080/v1/completions -H "Content-Type: application/json" -d '{
-"model": "tinyllama-chat",
-"prompt": "Do you always repeat yourself?",
-"temperature": 0.1,
-"max_tokens": 50
+  "model": "tinyllama-chat",
+  "prompt": "Do you always repeat yourself?",
+  "temperature": 0.1,
+  "max_tokens": 50
 }'
 ```
diff --git a/docs/vllm.md b/docs/vllm.md
index 4e4828e..4e01bdd 100644
--- a/docs/vllm.md
+++ b/docs/vllm.md
@@ -1,10 +1,12 @@
-## Run Mistral on K3s with vLLM from prem-operator source code
+# Run Mistral on K3s with vLLM from prem-operator source code
+
+## Overview
 
-### Overview
 This guide will help you to run Mistral on K3s with vLLM from prem-operator source code.
 Ubuntu 22.04.3 LTS was used as a host OS.
 
-### Prerequisites
+## Prerequisites
+
 - K3s cluster
 - Helm
 - Git
@@ -14,61 +16,52 @@ Ubuntu 22.04.3 LTS was used as a host OS.
 - K9s (optional)
 - Nvidia GPU Operator
 
-### Steps
-1. Install K3s cluster
+## Steps
+
 ```bash
+# 1. Install K3s cluster
 ./../script/install_k3s.sh
-```
-2. Install Helm
-```bash
+
+# 2. Install Helm
 ./../script/install_helm.sh
-```
-3. Install Nvidia GPU Operator
-```bash
+
+# 3. Install Nvidia GPU Operator
 ./../script/install_gpu_operator_k3s.sh
-```
-4. Install tools: make, curl, jq
-```bash
+
+# 4. Install tools: make, curl, jq
 ./../script/install_make_curl_jq.sh
-```
-5. Install Go
-```bash
+
+# 5. Install Go
 ./../script/install_go.sh
-```
-6. Install Docker
-```bash
+
+# 6. Install Docker
 ./../script/install_docker.sh
-```
-7. Install K9s(optional)
-```bash
+
+# 7. Install K9s (optional)
 ./../script/install_k9s.sh
-```
-8. Clone prem-operator repository
-```bash
+
+# 8. Clone prem-operator repository
 git clone git@github.com:premAI-io/prem-operator.git
-```
-9. Deploy AIDeployment CRD
-```bash
+
+# 9. Deploy AIDeployment CRD
 sudo make install
-```
-10. Build prem-operator Docker image
-```bash
+
+# 10. Build prem-operator Docker image
 sudo make docker-build
-```
-11. Load Docker image to K3s cluster
-```bash
+
+# 11. Load Docker image to K3s cluster
 sudo docker save -o ./controller controller:latest
 sudo k3s ctr images import controller
-```
-12. Deploy prem-operator
-```bash
+
+# 12. Deploy prem-operator
 sudo make deploy
-```
-13. Deploy vLLM
-```bash
+
+# 13. Deploy vLLM
 sudo kubectl apply -f ./../examples/vllm.yaml
 ```
-14. 
Send request to vLLM + +### Send request to vLLM using curl and process the response with jq + ```bash curl -X 'POST' http://vllm.127.0.0.1.nip.io/v1/completions \ -H 'accept: application/json' \ diff --git a/examples/cpu.yaml b/examples/cpu.yaml new file mode 100644 index 0000000..2a5afd0 --- /dev/null +++ b/examples/cpu.yaml @@ -0,0 +1,20 @@ +apiVersion: premlabs.io/v1alpha1 +kind: AIDeployment +metadata: + name: simple + namespace: default +spec: + engine: + name: "localai" + endpoint: + - domain: "foo.127.0.0.1.nip.io" + port: 8080 + models: + - uri: tinyllama-chat + deployment: + resources: + requests: + # cpu only deployment, + # as gpu resource is part of requests + cpu: 4 + memory: "4Gi" diff --git a/examples/generic.yaml b/examples/generic.yaml index 324b5fd..ab15ad2 100644 --- a/examples/generic.yaml +++ b/examples/generic.yaml @@ -19,8 +19,6 @@ metadata: name: generic-test namespace: default spec: - # single node instance - # no dashboard support engine: name: "generic" endpoint: diff --git a/examples/models.yaml b/examples/models.yaml index 7964b42..023ab0b 100644 --- a/examples/models.yaml +++ b/examples/models.yaml @@ -30,10 +30,13 @@ metadata: spec: engine: name: "localai" + # multiple models can be loaded together models: - - modelMapRef: + - modelMapRef: + # name specified here is used by the model name: tinyllama-1.1b-chat-v0.1 variant: q4-k-m + # uri is the name of the model as well - uri: "phi-2" endpoint: - domain: "tinyllama.127.0.0.1.nip.io" diff --git a/examples/test.yaml b/examples/test.yaml deleted file mode 100644 index eb794a0..0000000 --- a/examples/test.yaml +++ /dev/null @@ -1,31 +0,0 @@ -apiVersion: premlabs.io/v1alpha1 -kind: AIDeployment -metadata: - name: simple - namespace: default -spec: - # single node instance - # no dashboard support - engine: - name: "localai" - endpoint: - - domain: "foo.127.0.0.1.nip.io" - port: 8080 - models: - - uri: tinyllama-chat - service: - annotations: - labels: - ingress: - annotations: - labels: - deployment: - annotations: - labels: - resources: - requests: - cpu: 8 - memory: "3Gi" - env: - - name: "DEBUG" - value: "true" diff --git a/examples/testingress.yaml b/examples/testingress.yaml deleted file mode 100644 index a6a5252..0000000 --- a/examples/testingress.yaml +++ /dev/null @@ -1,46 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: whoami-deployment -spec: - replicas: 1 - selector: - matchLabels: - app: whoami - template: - metadata: - labels: - app: whoami - spec: - containers: - - name: whoami-container - image: containous/whoami ---- -apiVersion: v1 -kind: Service -metadata: - name: whoami-service -spec: - ports: - - name: http - targetPort: 80 - port: 80 - selector: - app: whoami ---- -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: whoami-ingress -spec: - rules: - - host: whoami.192.168.68.115.nip.io - http: - paths: - - path: / - pathType: Exact - backend: - service: - name: whoami-service - port: - number: 80 diff --git a/hack/.keep b/hack/.keep new file mode 100644 index 0000000..544584c --- /dev/null +++ b/hack/.keep @@ -0,0 +1 @@ +USED BY Makefile AS HEADER FILE FOR GENERATED CODE diff --git a/script/install_gpu_operator_k3s.sh b/script/install_gpu_operator_k3s.sh index 4484986..0cedddd 100644 --- a/script/install_gpu_operator_k3s.sh +++ b/script/install_gpu_operator_k3s.sh @@ -4,7 +4,7 @@ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update # Install GPU Operator (arguments are for k3s) -helm install --wait nvidiagpu -n gpu-operator 
--create-namespace \ +helm install --wait nvidia-gpu-operator -n nvidia-gpu-operator --create-namespace \ --set toolkit.env[0].name=CONTAINERD_CONFIG \ --set toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml \ --set toolkit.env[1].name=CONTAINERD_SOCKET \
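
With the GPU Operator chart installed under the renamed release and namespace, it is worth sanity-checking that the operator pods come up and that the nodes advertise the `nvidia.com/gpu` extended resource, which is what the operator's scheduling code requests for CUDA-accelerated AIDeployments. Below is a minimal sketch, assuming the `nvidia-gpu-operator` release and namespace names used above; the exact pod list and GPU counts will vary with your cluster.

```bash
# Confirm the Helm release deployed successfully.
helm status nvidia-gpu-operator -n nvidia-gpu-operator

# All pods in the namespace should eventually be Running or Completed
# (the validator pods run to completion).
kubectl get pods -n nvidia-gpu-operator

# Each GPU node should report a non-zero allocatable nvidia.com/gpu quantity,
# the extended resource that AIDeployment pods request.
kubectl get nodes -o custom-columns='NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu'
```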