Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

watsonx.ai and various other changes #600

Merged
merged 18 commits into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -63,5 +63,11 @@
# Hand the cluster currently being processed to the openshift-mcg role.
- name: Configure Multi-Cloud Object Gateway
  vars:
    _p_openshift_cluster: "{{ current_openshift_cluster }}"
  include_role:
    name: openshift-mcg

# Hand the cluster currently being processed to the openshift-gpu role.
- name: Configure GPU for the OpenShift cluster
  vars:
    _p_openshift_cluster: "{{ current_openshift_cluster }}"
  include_role:
    name: openshift-gpu
51 changes: 51 additions & 0 deletions automation-roles/40-configure-infra/nfd-operator/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
---
# Create the namespace used for the Node Feature Discovery operator.
# Only an "AlreadyExists" response is tolerated, which keeps re-runs
# idempotent; any other failure (not logged in, missing permissions,
# unreachable API server) now fails this task instead of being swallowed.
- name: Create openshift-nfd OpenShift project
  shell: |
    oc create ns openshift-nfd
  register: _nfd_create_ns
  failed_when:
    - _nfd_create_ns.rc != 0
    - "'AlreadyExists' not in _nfd_create_ns.stderr"
  # Report "changed" only when the namespace was actually created.
  changed_when: _nfd_create_ns.rc == 0

# Read .status.defaultChannel from the nfd package manifest; the value ends
# up in the stdout of the registered result.
- name: Retrieve default channel for Node Feature Discovery manifest
  register: _nfd_packagemanifest
  shell: >-
    oc get packagemanifest nfd
    -o jsonpath='{.status.defaultChannel}'

# Keep the channel name returned by the package manifest lookup in
# _nfd_channel. The task is named so it is identifiable in play output.
- name: Store default channel for Node Feature Discovery in _nfd_channel
  set_fact:
    _nfd_channel: "{{ _nfd_packagemanifest.stdout }}"

# Render nfd-operator.j2 into a per-cluster file under the status directory.
- name: Generate NFD operator file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml
  template:
    dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml"
    src: nfd-operator.j2

# Apply the rendered NFD operator manifest with oc.
- name: Create NFD operator
  shell: >
    oc apply -f
    {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml

# Poll the ClusterServiceVersions labelled for the nfd subscription in
# openshift-nfd and count the lines whose phase column matches "succeeded".
# The task is retried up to 30 times, 30 seconds apart, until that count is
# exactly "1".
- name: Wait until NFD Operator CSV has status Succeeded
  register: _nfd_csv_status
  until: _nfd_csv_status.stdout == "1"
  retries: 30
  delay: 30
  vars:
    # Message template read by the diy callback on every retry.
    ansible_callback_diy_runner_retry_msg: >-
      {%- set result = ansible_callback_diy.result.output -%}
      {%- set retries_left = result.retries - result.attempts -%}
      Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ...
  shell: |
    oc get csv -n openshift-nfd \
      -l operators.coreos.com/nfd.openshift-nfd \
      --no-headers \
      -o custom-columns='name:metadata.name,phase:status.phase' | \
      grep -i succeeded | wc -l

# Run the openshift-get-version role before the CR is rendered.
# NOTE(review): nfd-cr.j2 reads _p_current_ocp_version for the operand image
# tag; presumably this role sets it - confirm against the role.
- name: Get OpenShift version
  include_role:
    name: openshift-get-version

# Render nfd-cr.j2 into a per-cluster file under the status directory.
- name: Generate NodeFeatureDiscovery CR file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml
  template:
    dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml"
    src: nfd-cr.j2

# Apply the rendered NodeFeatureDiscovery custom resource with oc.
- name: Create NFD CR
  shell: >
    oc apply -f
    {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml
120 changes: 120 additions & 0 deletions automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
---
apiVersion: nfd.openshift.io/v1
kind: NodeFeatureDiscovery
metadata:
  name: nfd-instance
  namespace: openshift-nfd
spec:
  # instance is empty by default
  instance: ""
  # False by default
  topologyupdater: false
  operand:
    # The image tag is built from the _p_current_ocp_version template variable.
    image: registry.redhat.io/openshift4/ose-node-feature-discovery:v{{ _p_current_ocp_version }}
    servicePort: 12000
  workerConfig:
    # Embedded worker configuration. Only core.sleepInterval and the pci
    # source are active; every other option is left commented out as a sample.
    configData: |
      core:
      #  labelWhiteList:
      #  noPublish: false
        sleepInterval: 60s
      #  sources: [all]
      #  klog:
      #    addDirHeader: false
      #    alsologtostderr: false
      #    logBacktraceAt:
      #    logtostderr: true
      #    skipHeaders: false
      #    stderrthreshold: 2
      #    v: 0
      #    vmodule:
      ##   NOTE: the following options are not dynamically run-time
      ##         configurable and require a nfd-worker restart to take effect
      ##         after being changed
      #    logDir:
      #    logFile:
      #    logFileMaxSize: 1800
      #    skipLogHeaders: false
      sources:
      #  cpu:
      #    cpuid:
      ##     NOTE: whitelist has priority over blacklist
      #      attributeBlacklist:
      #        - "BMI1"
      #        - "BMI2"
      #        - "CLMUL"
      #        - "CMOV"
      #        - "CX16"
      #        - "ERMS"
      #        - "F16C"
      #        - "HTT"
      #        - "LZCNT"
      #        - "MMX"
      #        - "MMXEXT"
      #        - "NX"
      #        - "POPCNT"
      #        - "RDRAND"
      #        - "RDSEED"
      #        - "RDTSCP"
      #        - "SGX"
      #        - "SSE"
      #        - "SSE2"
      #        - "SSE3"
      #        - "SSE4.1"
      #        - "SSE4.2"
      #        - "SSSE3"
      #      attributeWhitelist:
      #  kernel:
      #    kconfigFile: "/path/to/kconfig"
      #    configOpts:
      #      - "NO_HZ"
      #      - "X86"
      #      - "DMI"
        pci:
          deviceClassWhitelist:
            - "0200"
            - "03"
            - "12"
          deviceLabelFields:
      #      - "class"
            - "vendor"
      #      - "device"
      #      - "subsystem_vendor"
      #      - "subsystem_device"
      #  usb:
      #    deviceClassWhitelist:
      #      - "0e"
      #      - "ef"
      #      - "fe"
      #      - "ff"
      #    deviceLabelFields:
      #      - "class"
      #      - "vendor"
      #      - "device"
      #  custom:
      #    - name: "my.kernel.feature"
      #      matchOn:
      #        - loadedKMod: ["example_kmod1", "example_kmod2"]
      #    - name: "my.pci.feature"
      #      matchOn:
      #        - pciId:
      #            class: ["0200"]
      #            vendor: ["15b3"]
      #            device: ["1014", "1017"]
      #        - pciId :
      #            vendor: ["8086"]
      #            device: ["1000", "1100"]
      #    - name: "my.usb.feature"
      #      matchOn:
      #        - usbId:
      #            class: ["ff"]
      #            vendor: ["03e7"]
      #            device: ["2485"]
      #        - usbId:
      #            class: ["fe"]
      #            vendor: ["1a6e"]
      #            device: ["089a"]
      #    - name: "my.combined.feature"
      #      matchOn:
      #        - pciId:
      #            vendor: ["15b3"]
      #            device: ["1014", "1017"]
      #          loadedKMod : ["vendor_kmod1", "vendor_kmod2"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
---
# OperatorGroup for the openshift-nfd namespace; no targetNamespaces are set.
apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
  name: openshift-nfd-og
  namespace: openshift-nfd
spec:
  # NOTE(review): newer OLM releases appear to model upgradeStrategy as an
  # object with a `name` field - confirm this scalar form is accepted on the
  # targeted OpenShift versions.
  upgradeStrategy: Default
---
# Subscription to the nfd package from the redhat-operators catalog source,
# with install plans approved automatically.
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  labels:
    operators.coreos.com/nfd.openshift-nfd: ""
  name: nfd
  namespace: openshift-nfd
spec:
  # Quoted so that a numeric-looking channel name (for example 4.10) stays a
  # string after rendering instead of being parsed as the float 4.1, and so
  # that an empty value renders as "" rather than null.
  channel: "{{ _nfd_channel }}"
  installPlanApproval: Automatic
  name: nfd
  source: redhat-operators
  sourceNamespace: openshift-marketplace
63 changes: 63 additions & 0 deletions automation-roles/40-configure-infra/nvidia-operator/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
---
# Create the namespace used for the NVIDIA GPU operator.
# Only an "AlreadyExists" response is tolerated, which keeps re-runs
# idempotent; any other failure (not logged in, missing permissions,
# unreachable API server) now fails this task instead of being swallowed.
- name: Create nvidia-gpu-operator OpenShift project
  shell: |
    oc create ns nvidia-gpu-operator
  register: _nvidia_create_ns
  failed_when:
    - _nvidia_create_ns.rc != 0
    - "'AlreadyExists' not in _nvidia_create_ns.stderr"
  # Report "changed" only when the namespace was actually created.
  changed_when: _nvidia_create_ns.rc == 0

# Read .status.defaultChannel from the gpu-operator-certified package
# manifest; the value ends up in the stdout of the registered result.
- name: Retrieve default channel for the NVIDIA GPU manifest
  register: _nvidia_packagemanifest
  shell: >-
    oc get packagemanifest gpu-operator-certified
    -o jsonpath='{.status.defaultChannel}'

# Keep the channel name returned by the package manifest lookup in
# _nvidia_channel. The task is named so it is identifiable in play output.
- name: Store default channel for the NVIDIA GPU operator in _nvidia_channel
  set_fact:
    _nvidia_channel: "{{ _nvidia_packagemanifest.stdout }}"

# Render nvidia-operator.j2 into a per-cluster file under the status directory.
- name: Generate NVIDIA operator file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml
  template:
    dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml"
    src: nvidia-operator.j2

# Apply the rendered NVIDIA operator manifest with oc.
- name: Create NVIDIA operator
  shell: >
    oc apply -f
    {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml

# Poll the ClusterServiceVersions labelled for the gpu-operator-certified
# subscription in nvidia-gpu-operator and count the lines whose phase column
# matches "succeeded". The task is retried up to 30 times, 30 seconds apart,
# until that count is exactly "1".
- name: Wait until NVIDIA Operator CSV has status Succeeded
  register: _nvidia_csv_status
  until: _nvidia_csv_status.stdout == "1"
  retries: 30
  delay: 30
  vars:
    # Message template read by the diy callback on every retry.
    ansible_callback_diy_runner_retry_msg: >-
      {%- set result = ansible_callback_diy.result.output -%}
      {%- set retries_left = result.retries - result.attempts -%}
      Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ...
  shell: |
    oc get csv -n nvidia-gpu-operator \
      -l operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator \
      --no-headers \
      -o custom-columns='name:metadata.name,phase:status.phase' | \
      grep -i succeeded | wc -l

# Render nvidia-cluster-policy-cr.j2 into a per-cluster file under the status
# directory.
- name: Generate NVIDIA ClusterPolicy CR file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml
  template:
    dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml"
    src: nvidia-cluster-policy-cr.j2

# Apply the rendered ClusterPolicy custom resource with oc.
- name: Create NVIDIA ClusterPolicy CR
  shell: >
    oc apply -f
    {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml

# Poll the gpu-cluster-policy ClusterPolicy until its status.state column
# reads "ready". The task is retried up to 30 times, 30 seconds apart.
# grep uses -w (whole-word match): a plain case-insensitive search for
# "ready" also matches any state that merely contains the word, such as
# "notReady", which would end the wait before the policy is actually ready.
# stdout is still the number of matching lines, so the until condition and
# the registered variable keep their original meaning.
- name: Wait until NVIDIA ClusterPolicy has status Ready
  shell: |
    oc get clusterpolicies.nvidia.com gpu-cluster-policy \
      --no-headers \
      -o custom-columns='name:metadata.name,phase:status.state' | \
      grep -iw ready | wc -l
  register: _nvidia_cluster_policy_status
  retries: 30
  delay: 30
  until: _nvidia_cluster_policy_status.stdout == "1"
  vars:
    # Message template read by the diy callback on every retry.
    ansible_callback_diy_runner_retry_msg: >-
      {%- set result = ansible_callback_diy.result.output -%}
      {%- set retries_left = result.retries - result.attempts -%}
      Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ...
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
---
# ClusterPolicy custom resource for the NVIDIA GPU operator. The name
# gpu-cluster-policy is the one the role's wait task queries.
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
  name: gpu-cluster-policy
spec:
  cdi:
    default: false
    enabled: false
  daemonsets:
    rollingUpdate:
      maxUnavailable: "1"
    updateStrategy: RollingUpdate
  dcgm:
    enabled: true
  dcgmExporter:
    config:
      name: ""
    enabled: true
    serviceMonitor:
      enabled: true
  devicePlugin:
    config:
      default: ""
      name: ""
    enabled: true
  driver:
    certConfig:
      name: ""
    enabled: true
    kernelModuleConfig:
      name: ""
    licensingConfig:
      configMapName: ""
      nlsEnabled: true
    repoConfig:
      configMapName: ""
    upgradePolicy:
      autoUpgrade: true
      drain:
        deleteEmptyDir: false
        enable: false
        force: false
        timeoutSeconds: 300
      maxParallelUpgrades: 1
      # Percentage kept as an explicit string.
      maxUnavailable: "25%"
      podDeletion:
        deleteEmptyDir: false
        force: false
        timeoutSeconds: 300
      waitForCompletion:
        timeoutSeconds: 0
    useNvidiaDriverCRD: false
    useOpenKernelModules: false
    virtualTopology:
      config: ""
  gds:
    enabled: false
  gfd:
    enabled: true
  kataManager:
    config:
      artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses
  mig:
    strategy: single
  migManager:
    config:
      default: all-disabled
      name: default-mig-parted-config
    enabled: true
  nodeStatusExporter:
    enabled: true
  operator:
    defaultRuntime: crio
    initContainer: {}
    runtimeClass: nvidia
    use_ocp_driver_toolkit: true
  sandboxDevicePlugin:
    enabled: true
  sandboxWorkloads:
    defaultWorkload: container
    enabled: false
  toolkit:
    enabled: true
    installDir: /usr/local/nvidia
  validator:
    plugin:
      env:
        # Environment values are strings, hence the quoted "false".
        - name: WITH_WORKLOAD
          value: "false"
  vfioManager:
    enabled: true
  vgpuDeviceManager:
    config:
      default: default
    enabled: true
  vgpuManager:
    enabled: false
Loading
Loading