diff --git a/charts/all/gpu-operator/Chart.yaml b/charts/all/gpu-operator/Chart.yaml
new file mode 100644
index 0000000..6f711fc
--- /dev/null
+++ b/charts/all/gpu-operator/Chart.yaml
@@ -0,0 +1,15 @@
+apiVersion: v2
+description: Helm chart that applies the NVIDIA GPU Operator ClusterPolicy on OpenShift
+name: gpu-operator
+type: application
+
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: 0.1.0
+
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+# It is recommended to use it with quotes.
+appVersion: "1.0.0"
diff --git a/charts/all/gpu-operator/templates/cluster-policy.yaml b/charts/all/gpu-operator/templates/cluster-policy.yaml
new file mode 100644
index 0000000..dc821d9
--- /dev/null
+++ b/charts/all/gpu-operator/templates/cluster-policy.yaml
@@ -0,0 +1,103 @@
+# ClusterPolicy custom resource (nvidia.com/v1) named gpu-cluster-policy.
+# Static manifest: no Helm values are templated into it.
+apiVersion: nvidia.com/v1
+kind: ClusterPolicy
+metadata:
+  name: gpu-cluster-policy
+spec:
+  vgpuDeviceManager:
+    config:
+      default: default
+    enabled: true
+  migManager:
+    config:
+      default: all-disabled
+      name: default-mig-parted-config
+    enabled: true
+  operator:
+    defaultRuntime: crio
+    initContainer: {}
+    runtimeClass: nvidia
+    use_ocp_driver_toolkit: true
+  dcgm:
+    enabled: true
+  gfd:
+    enabled: true
+  dcgmExporter:
+    config:
+      name: ''
+    serviceMonitor:
+      enabled: true
+    enabled: true
+  cdi:
+    default: false
+    enabled: false
+  driver:
+    licensingConfig:
+      nlsEnabled: true
+      configMapName: ''
+    certConfig:
+      name: ''
+    kernelModuleConfig:
+      name: ''
+    upgradePolicy:
+      autoUpgrade: true
+      drain:
+        deleteEmptyDir: false
+        enable: false
+        force: false
+        timeoutSeconds: 300
+      maxParallelUpgrades: 1
+      maxUnavailable: 25%
+      podDeletion:
+        deleteEmptyDir: false
+        force: false
+        timeoutSeconds: 300
+      waitForCompletion:
+        timeoutSeconds: 0
+    repoConfig:
+      configMapName: ''
+    virtualTopology:
+      config: ''
+    enabled: true
+    useNvidiaDriverCRD: false
+    useOpenKernelModules: false
+  devicePlugin:
+    config:
+      name: ''
+      default: ''
+    mps:
+      root: /run/nvidia/mps
+    enabled: true
+  gdrcopy:
+    enabled: false
+  kataManager:
+    config:
+      artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses
+  mig:
+    strategy: mixed
+  sandboxDevicePlugin:
+    enabled: true
+  validator:
+    plugin:
+      env:
+        - name: WITH_WORKLOAD
+          value: 'false'
+  nodeStatusExporter:
+    enabled: true
+  daemonsets:
+    rollingUpdate:
+      maxUnavailable: '1'
+    updateStrategy: RollingUpdate
+  sandboxWorkloads:
+    defaultWorkload: container
+    enabled: false
+  gds:
+    enabled: false
+  vgpuManager:
+    enabled: false
+  vfioManager:
+    enabled: true
+  toolkit:
+    installDir: /usr/local/nvidia
+    enabled: true
diff --git a/charts/all/gpu-operator/values.yaml b/charts/all/gpu-operator/values.yaml
new file mode 100644
index 0000000..a1521c9
--- /dev/null
+++ b/charts/all/gpu-operator/values.yaml
@@ -0,0 +1,16 @@
+
+
+
+secretStore:
+  name: vault-backend
+  kind: ClusterSecretStore
+
+# Secret provisioned for the AWS Controller for Kubernetes - S3
+# Begin global parameters
+
+
+dsp:
+  name: test-project
+
+# https://github.com/openshift-ai-examples/openshift-ai-examples/blob/main/openshift-ai-deploy-llm/manifests/3-notebook-template.yaml
+