Skip to content

Commit

Permalink
fix: GPU operator configuration
Browse files Browse the repository at this point in the history
Signed-off-by: Chris Butler <[email protected]>
  • Loading branch information
butler54 committed May 17, 2024
1 parent d837b81 commit 24790a6
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 0 deletions.
15 changes: 15 additions & 0 deletions charts/all/gpu-operator/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
apiVersion: v2
description: Example helm chart for configuring a OpenShift AI project
name: test-dsp
type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.0.0"
101 changes: 101 additions & 0 deletions charts/all/gpu-operator/templates/cluster-policy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
name: gpu-cluster-policy
spec:
vgpuDeviceManager:
config:
default: default
enabled: true
migManager:
config:
default: all-disabled
name: default-mig-parted-config
enabled: true
operator:
defaultRuntime: crio
initContainer: {}
runtimeClass: nvidia
use_ocp_driver_toolkit: true
dcgm:
enabled: true
gfd:
enabled: true
dcgmExporter:
config:
name: ''
serviceMonitor:
enabled: true
enabled: true
cdi:
default: false
enabled: false
driver:
licensingConfig:
nlsEnabled: true
configMapName: ''
certConfig:
name: ''
kernelModuleConfig:
name: ''
upgradePolicy:
autoUpgrade: true
drain:
deleteEmptyDir: false
enable: false
force: false
timeoutSeconds: 300
maxParallelUpgrades: 1
maxUnavailable: 25%
podDeletion:
deleteEmptyDir: false
force: false
timeoutSeconds: 300
waitForCompletion:
timeoutSeconds: 0
repoConfig:
configMapName: ''
virtualTopology:
config: ''
enabled: true
useNvidiaDriverCRD: false
useOpenKernelModules: false
devicePlugin:
config:
name: ''
default: ''
mps:
root: /run/nvidia/mps
enabled: true
gdrcopy:
enabled: false
kataManager:
config:
artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses
mig:
strategy: mixed
sandboxDevicePlugin:
enabled: true
validator:
plugin:
env:
- name: WITH_WORKLOAD
value: 'false'
nodeStatusExporter:
enabled: true
daemonsets:
rollingUpdate:
maxUnavailable: '1'
updateStrategy: RollingUpdate
sandboxWorkloads:
defaultWorkload: container
enabled: false
gds:
enabled: false
vgpuManager:
enabled: false
vfioManager:
enabled: true
toolkit:
installDir: /usr/local/nvidia
enabled: true
16 changes: 16 additions & 0 deletions charts/all/gpu-operator/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@



secretStore:
name: vault-backend
kind: ClusterSecretStore

# Secret provisioned for the AWS Controller for Kubernetes - S3
# Begin global parameters


dsp:
name: test-project

# https://github.com/openshift-ai-examples/openshift-ai-examples/blob/main/openshift-ai-deploy-llm/manifests/3-notebook-template.yaml

0 comments on commit 24790a6

Please sign in to comment.