fix: GPU operator configuration

Signed-off-by: Chris Butler <[email protected]>
opendatahub-io-contrib · May 17, 2024 · 24790a6 · 24790a6
1 parent d837b81
commit 24790a6
Show file tree

Hide file tree

Showing 3 changed files with 132 additions and 0 deletions.
diff --git a/charts/all/gpu-operator/Chart.yaml b/charts/all/gpu-operator/Chart.yaml
@@ -0,0 +1,15 @@
+apiVersion: v2
+description: Helm chart that applies the NVIDIA GPU Operator ClusterPolicy on OpenShift
+name: gpu-operator
+type: application
+
+# Chart version: bump it whenever the chart or any of its templates change,
+# including a change of appVersion.
+# Must follow Semantic Versioning (https://semver.org/).
+version: 0.1.0
+
+# Version of the application this chart deploys. Bump it whenever the deployed
+# application changes. It does not have to follow Semantic Versioning; it should
+# simply mirror the version the application itself reports.
+# Keep it quoted so it is always read as a string.
+appVersion: "1.0.0"
diff --git a/charts/all/gpu-operator/templates/cluster-policy.yaml b/charts/all/gpu-operator/templates/cluster-policy.yaml
@@ -0,0 +1,101 @@
+apiVersion: nvidia.com/v1
+kind: ClusterPolicy
+metadata:
+  name: gpu-cluster-policy
+spec:  # sections sorted alphabetically; `enabled` comes first where present
+  cdi:
+    enabled: false
+    default: false
+  daemonsets:
+    updateStrategy: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: "1"  # kept as a string, not an integer
+  dcgm:
+    enabled: true
+  dcgmExporter:
+    enabled: true
+    config:
+      name: ""
+    serviceMonitor:
+      enabled: true
+  devicePlugin:
+    enabled: true
+    config:
+      default: ""
+      name: ""
+    mps:
+      root: /run/nvidia/mps
+  driver:
+    enabled: true
+    certConfig:
+      name: ""
+    kernelModuleConfig:
+      name: ""
+    licensingConfig:
+      configMapName: ""
+      nlsEnabled: true
+    repoConfig:
+      configMapName: ""
+    upgradePolicy:
+      autoUpgrade: true
+      maxParallelUpgrades: 1
+      maxUnavailable: "25%"
+      drain:
+        enable: false
+        deleteEmptyDir: false
+        force: false
+        timeoutSeconds: 300
+      podDeletion:
+        deleteEmptyDir: false
+        force: false
+        timeoutSeconds: 300
+      waitForCompletion:
+        timeoutSeconds: 0
+    useNvidiaDriverCRD: false
+    useOpenKernelModules: false
+    virtualTopology:
+      config: ""
+  gdrcopy:
+    enabled: false
+  gds:
+    enabled: false
+  gfd:
+    enabled: true
+  kataManager:
+    config:
+      artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses
+  mig:
+    strategy: mixed
+  migManager:
+    enabled: true
+    config:
+      default: all-disabled
+      name: default-mig-parted-config
+  nodeStatusExporter:
+    enabled: true
+  operator:
+    defaultRuntime: crio
+    initContainer: {}
+    runtimeClass: nvidia
+    use_ocp_driver_toolkit: true
+  sandboxDevicePlugin:
+    enabled: true
+  sandboxWorkloads:
+    enabled: false
+    defaultWorkload: container
+  toolkit:
+    enabled: true
+    installDir: /usr/local/nvidia
+  validator:
+    plugin:
+      env:
+        - name: WITH_WORKLOAD
+          value: "false"  # quoted so it stays a string
+  vfioManager:
+    enabled: true
+  vgpuDeviceManager:
+    enabled: true
+    config:
+      default: default
+  vgpuManager:
+    enabled: false
diff --git a/charts/all/gpu-operator/values.yaml b/charts/all/gpu-operator/values.yaml
@@ -0,0 +1,16 @@
+# Values for the chart in charts/all/gpu-operator.
+# NOTE(review): cluster-policy.yaml, the only template visible in this change,
+# contains no template expressions, so these values look unused -- confirm
+# whether they are still needed.
+
+# Secret store reference (name and kind).
+secretStore:
+  kind: ClusterSecretStore
+  name: vault-backend
+
+# NOTE(review): this file defines no value for an AWS Controller for
+# Kubernetes (S3) secret -- confirm whether one is expected here.
+# Reference:
+# https://github.com/openshift-ai-examples/openshift-ai-examples/blob/main/openshift-ai-deploy-llm/manifests/3-notebook-template.yaml
+dsp:
+  name: test-project