Skip to content

Commit

Permalink
Merge pull request #600 from IBM/fk-misc
Browse files Browse the repository at this point in the history
watsonx.ai and various other changes
  • Loading branch information
fketelaars authored Dec 15, 2023
2 parents 2767371 + bf2878d commit 834ba4e
Show file tree
Hide file tree
Showing 29 changed files with 522 additions and 53 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -63,5 +63,11 @@
# Run the openshift-mcg role, handing it current_openshift_cluster as
# _p_openshift_cluster.
- name: Configure Multi-Cloud Object Gateway
  vars:
    _p_openshift_cluster: "{{ current_openshift_cluster }}"
  include_role:
    name: openshift-mcg

# Run the openshift-gpu role, handing it current_openshift_cluster as
# _p_openshift_cluster.
- name: Configure GPU for the OpenShift cluster
  vars:
    _p_openshift_cluster: "{{ current_openshift_cluster }}"
  include_role:
    name: openshift-gpu
51 changes: 51 additions & 0 deletions automation-roles/40-configure-infra/nfd-operator/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
---
# Ensure the openshift-nfd namespace exists.
# The manifest is rendered client-side and applied, so an already existing
# namespace is accepted while genuine failures (login expired, API down,
# missing permissions) fail the task instead of being hidden by "|| true".
- name: Create openshift-nfd OpenShift project
  shell: |
    oc create ns openshift-nfd --dry-run=client -o yaml | oc apply -f -
# Read .status.defaultChannel of the nfd packagemanifest; the command output is
# registered as _nfd_packagemanifest.
- name: Retrieve default channel for Node Feature Discovery manifest
  register: _nfd_packagemanifest
  shell: >-
    oc get packagemanifest nfd
    -o jsonpath='{.status.defaultChannel}'

# Store the stdout of the packagemanifest query as _nfd_channel.
# The task is named so it is identifiable in the play output.
- name: Set NFD operator channel from the package manifest
  set_fact:
    _nfd_channel: "{{ _nfd_packagemanifest.stdout }}"

# Render nfd-operator.j2 to a per-cluster file below {{ status_dir }}/openshift.
- name: Generate NFD operator file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml
  template:
    dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml"
    src: nfd-operator.j2

# Apply the rendered per-cluster NFD operator file to the cluster.
- name: Create NFD operator
  shell: |
    oc apply -f {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml
# Poll the CSVs in openshift-nfd that carry the nfd label and count the lines
# containing "succeeded" (case-insensitive). The task is retried up to 30
# times, 30 seconds apart, until that count is exactly 1.
- name: Wait until NFD Operator CSV has status Succeeded
  shell: |
    oc get csv -n openshift-nfd \
    -l operators.coreos.com/nfd.openshift-nfd \
    --no-headers \
    -o custom-columns='name:metadata.name,phase:status.phase' | \
    grep -i succeeded | wc -l
  register: _nfd_csv_status
  until: _nfd_csv_status.stdout == "1"
  delay: 30
  retries: 30
  vars:
    # Retry message showing the task name and the number of retries left.
    # NOTE(review): presumably consumed by the "diy" callback plugin - confirm.
    ansible_callback_diy_runner_retry_msg: >-
      {%- set result = ansible_callback_diy.result.output -%}
      {%- set retries_left = result.retries - result.attempts -%}
      Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ...
# Run the openshift-get-version role.
# NOTE(review): presumably this provides _p_current_ocp_version, which the
# nfd-cr.j2 template uses for the operand image tag - confirm in that role.
- name: Get OpenShift version
  include_role:
    name: openshift-get-version

# Render nfd-cr.j2 to a per-cluster file below {{ status_dir }}/openshift.
- name: Generate NodeFeatureDiscovery CR file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml
  template:
    dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml"
    src: nfd-cr.j2

# Apply the rendered per-cluster NodeFeatureDiscovery CR file to the cluster.
- name: Create NFD CR
  shell: |
    oc apply -f {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml
120 changes: 120 additions & 0 deletions automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
---
# NodeFeatureDiscovery custom resource "nfd-instance" in openshift-nfd.
# The operand image tag is built from _p_current_ocp_version at render time.
# Everything below "configData: |" is a literal block handed over as the
# worker configuration, so the "#" lines inside it are part of that text.
apiVersion: nfd.openshift.io/v1
kind: NodeFeatureDiscovery
metadata:
  name: nfd-instance
  namespace: openshift-nfd
spec:
  instance: ""  # instance is empty by default
  topologyupdater: false  # False by default
  operand:
    # Quoted so the rendered value is always read as one string.
    image: "registry.redhat.io/openshift4/ose-node-feature-discovery:v{{ _p_current_ocp_version }}"
    servicePort: 12000
  workerConfig:
    configData: |
      core:
      #  labelWhiteList:
      #  noPublish: false
        sleepInterval: 60s
      #  sources: [all]
      #  klog:
      #    addDirHeader: false
      #    alsologtostderr: false
      #    logBacktraceAt:
      #    logtostderr: true
      #    skipHeaders: false
      #    stderrthreshold: 2
      #    v: 0
      #    vmodule:
      ##   NOTE: the following options are not dynamically run-time
      ##         configurable and require a nfd-worker restart to take effect
      ##         after being changed
      #    logDir:
      #    logFile:
      #    logFileMaxSize: 1800
      #    skipLogHeaders: false
      sources:
      #  cpu:
      #    cpuid:
      ##     NOTE: whitelist has priority over blacklist
      #      attributeBlacklist:
      #        - "BMI1"
      #        - "BMI2"
      #        - "CLMUL"
      #        - "CMOV"
      #        - "CX16"
      #        - "ERMS"
      #        - "F16C"
      #        - "HTT"
      #        - "LZCNT"
      #        - "MMX"
      #        - "MMXEXT"
      #        - "NX"
      #        - "POPCNT"
      #        - "RDRAND"
      #        - "RDSEED"
      #        - "RDTSCP"
      #        - "SGX"
      #        - "SSE"
      #        - "SSE2"
      #        - "SSE3"
      #        - "SSE4.1"
      #        - "SSE4.2"
      #        - "SSSE3"
      #      attributeWhitelist:
      #  kernel:
      #    kconfigFile: "/path/to/kconfig"
      #    configOpts:
      #      - "NO_HZ"
      #      - "X86"
      #      - "DMI"
        pci:
          deviceClassWhitelist:
            - "0200"
            - "03"
            - "12"
          deviceLabelFields:
      #      - "class"
            - "vendor"
      #      - "device"
      #      - "subsystem_vendor"
      #      - "subsystem_device"
      #  usb:
      #    deviceClassWhitelist:
      #      - "0e"
      #      - "ef"
      #      - "fe"
      #      - "ff"
      #    deviceLabelFields:
      #      - "class"
      #      - "vendor"
      #      - "device"
      #  custom:
      #    - name: "my.kernel.feature"
      #      matchOn:
      #        - loadedKMod: ["example_kmod1", "example_kmod2"]
      #    - name: "my.pci.feature"
      #      matchOn:
      #        - pciId:
      #            class: ["0200"]
      #            vendor: ["15b3"]
      #            device: ["1014", "1017"]
      #        - pciId :
      #            vendor: ["8086"]
      #            device: ["1000", "1100"]
      #    - name: "my.usb.feature"
      #      matchOn:
      #        - usbId:
      #            class: ["ff"]
      #            vendor: ["03e7"]
      #            device: ["2485"]
      #        - usbId:
      #            class: ["fe"]
      #            vendor: ["1a6e"]
      #            device: ["089a"]
      #    - name: "my.combined.feature"
      #      matchOn:
      #        - pciId:
      #            vendor: ["15b3"]
      #            device: ["1014", "1017"]
      #          loadedKMod : ["vendor_kmod1", "vendor_kmod2"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
---
# OperatorGroup "openshift-nfd-og" in the openshift-nfd namespace.
apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
  namespace: openshift-nfd
  name: openshift-nfd-og
spec:
  upgradeStrategy: Default
---
# Subscription to the "nfd" package from the redhat-operators catalog source
# in openshift-marketplace, with automatic install plan approval.
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  labels:
    operators.coreos.com/nfd.openshift-nfd: ""
  name: nfd
  namespace: openshift-nfd
spec:
  # Quoted so that a numeric-looking channel (for example 4.10) is rendered as
  # a string rather than being typed as a float by the YAML parser.
  channel: "{{ _nfd_channel }}"
  installPlanApproval: Automatic
  name: nfd
  source: redhat-operators
  sourceNamespace: openshift-marketplace
63 changes: 63 additions & 0 deletions automation-roles/40-configure-infra/nvidia-operator/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
---
# Ensure the nvidia-gpu-operator namespace exists.
# The manifest is rendered client-side and applied, so an already existing
# namespace is accepted while genuine failures (login expired, API down,
# missing permissions) fail the task instead of being hidden by "|| true".
- name: Create nvidia-gpu-operator OpenShift project
  shell: |
    oc create ns nvidia-gpu-operator --dry-run=client -o yaml | oc apply -f -
# Read .status.defaultChannel of the gpu-operator-certified packagemanifest;
# the command output is registered as _nvidia_packagemanifest.
- name: Retrieve default channel for the NVIDIA GPU manifest
  register: _nvidia_packagemanifest
  shell: >-
    oc get packagemanifest gpu-operator-certified
    -o jsonpath='{.status.defaultChannel}'

# Store the stdout of the packagemanifest query as _nvidia_channel.
# The task is named so it is identifiable in the play output.
- name: Set NVIDIA GPU operator channel from the package manifest
  set_fact:
    _nvidia_channel: "{{ _nvidia_packagemanifest.stdout }}"

# Render nvidia-operator.j2 to a per-cluster file below
# {{ status_dir }}/openshift.
- name: Generate NVIDIA operator file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml
  template:
    dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml"
    src: nvidia-operator.j2

# Apply the rendered per-cluster NVIDIA operator file to the cluster.
- name: Create NVIDIA operator
  shell: |
    oc apply -f {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml
# Poll the CSVs in nvidia-gpu-operator that carry the gpu-operator-certified
# label and count the lines containing "succeeded" (case-insensitive). The
# task is retried up to 30 times, 30 seconds apart, until that count is
# exactly 1.
- name: Wait until NVIDIA Operator CSV has status Succeeded
  shell: |
    oc get csv -n nvidia-gpu-operator \
    -l operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator \
    --no-headers \
    -o custom-columns='name:metadata.name,phase:status.phase' | \
    grep -i succeeded | wc -l
  register: _nvidia_csv_status
  until: _nvidia_csv_status.stdout == "1"
  delay: 30
  retries: 30
  vars:
    # Retry message showing the task name and the number of retries left.
    # NOTE(review): presumably consumed by the "diy" callback plugin - confirm.
    ansible_callback_diy_runner_retry_msg: >-
      {%- set result = ansible_callback_diy.result.output -%}
      {%- set retries_left = result.retries - result.attempts -%}
      Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ...
# Render nvidia-cluster-policy-cr.j2 to a per-cluster file below
# {{ status_dir }}/openshift.
- name: Generate NVIDIA ClusterPolicy CR file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml
  template:
    dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml"
    src: nvidia-cluster-policy-cr.j2

# Apply the rendered per-cluster NVIDIA ClusterPolicy CR file to the cluster.
- name: Create NVIDIA ClusterPolicy CR
  shell: |
    oc apply -f {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml
# Poll the gpu-cluster-policy ClusterPolicy and count the output lines whose
# status.state column is "ready". The task is retried up to 30 times, 30
# seconds apart, until that count is exactly 1.
- name: Wait until NVIDIA ClusterPolicy has status Ready
  shell: |
    oc get clusterpolicies.nvidia.com gpu-cluster-policy \
    --no-headers \
    -o custom-columns='name:metadata.name,phase:status.state' | \
    grep -iw ready | wc -l
  # grep -w matches "ready" only as a whole word: a plain "grep -i ready" also
  # matches the "notReady" state and would end the wait before the policy is
  # actually ready.
  register: _nvidia_cluster_policy_status
  retries: 30
  delay: 30
  until: _nvidia_cluster_policy_status.stdout == "1"
  vars:
    # Retry message showing the task name and the number of retries left.
    # NOTE(review): presumably consumed by the "diy" callback plugin - confirm.
    ansible_callback_diy_runner_retry_msg: >-
      {%- set result = ansible_callback_diy.result.output -%}
      {%- set retries_left = result.retries - result.attempts -%}
      Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ...
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
---
# NVIDIA GPU operator ClusterPolicy "gpu-cluster-policy" (cluster-scoped: no
# namespace is set). Each key directly under spec configures one operator
# component; nesting follows the alphabetical grouping of the settings.
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
  name: gpu-cluster-policy
spec:
  cdi:
    default: false
    enabled: false
  daemonsets:
    rollingUpdate:
      maxUnavailable: "1"
    updateStrategy: RollingUpdate
  dcgm:
    enabled: true
  dcgmExporter:
    config:
      name: ""
    enabled: true
    serviceMonitor:
      enabled: true
  devicePlugin:
    config:
      default: ""
      name: ""
    enabled: true
  driver:
    certConfig:
      name: ""
    enabled: true
    kernelModuleConfig:
      name: ""
    licensingConfig:
      configMapName: ""
      nlsEnabled: true
    repoConfig:
      configMapName: ""
    upgradePolicy:
      autoUpgrade: true
      drain:
        deleteEmptyDir: false
        enable: false
        force: false
        timeoutSeconds: 300
      maxParallelUpgrades: 1
      # Quoted to keep the percentage an explicit string.
      maxUnavailable: "25%"
      podDeletion:
        deleteEmptyDir: false
        force: false
        timeoutSeconds: 300
      waitForCompletion:
        timeoutSeconds: 0
    useNvidiaDriverCRD: false
    useOpenKernelModules: false
    virtualTopology:
      config: ""
  gds:
    enabled: false
  gfd:
    enabled: true
  kataManager:
    config:
      artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses
  mig:
    strategy: single
  migManager:
    config:
      default: all-disabled
      name: default-mig-parted-config
    enabled: true
  nodeStatusExporter:
    enabled: true
  operator:
    defaultRuntime: crio
    initContainer: {}
    runtimeClass: nvidia
    use_ocp_driver_toolkit: true
  sandboxDevicePlugin:
    enabled: true
  sandboxWorkloads:
    defaultWorkload: container
    enabled: false
  toolkit:
    enabled: true
    installDir: /usr/local/nvidia
  validator:
    plugin:
      env:
        - name: WITH_WORKLOAD
          value: "false"
  vfioManager:
    enabled: true
  vgpuDeviceManager:
    config:
      default: default
    enabled: true
  vgpuManager:
    enabled: false
Loading

0 comments on commit 834ba4e

Please sign in to comment.