-
Notifications
You must be signed in to change notification settings - Fork 72
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #600 from IBM/fk-misc
watsonx.ai and various other changes
- Loading branch information
Showing
29 changed files
with
522 additions
and
53 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
51 changes: 51 additions & 0 deletions
51
automation-roles/40-configure-infra/nfd-operator/tasks/main.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
---
# Installs the Node Feature Discovery (NFD) operator in the openshift-nfd
# namespace, waits for its CSV to reach phase Succeeded and then applies the
# NodeFeatureDiscovery custom resource rendered from nfd-cr.j2.

- name: Create openshift-nfd OpenShift project
  shell: |
    oc create ns openshift-nfd
  register: _nfd_create_ns
  # Re-runs must tolerate an existing namespace, but any other error
  # (for example an expired login) should stop the play here instead of
  # being hidden behind "|| true".
  failed_when:
    - _nfd_create_ns.rc != 0
    - "'AlreadyExists' not in _nfd_create_ns.stderr"
  changed_when: _nfd_create_ns.rc == 0

- name: Retrieve default channel for Node Feature Discovery manifest
  shell: |
    oc get packagemanifest nfd -o jsonpath='{.status.defaultChannel}'
  register: _nfd_packagemanifest
  changed_when: false
  # An empty channel would render an invalid Subscription further down
  failed_when: >-
    _nfd_packagemanifest.rc != 0 or
    (_nfd_packagemanifest.stdout | trim | length) == 0

- name: Set the NFD subscription channel to the package manifest default
  set_fact:
    _nfd_channel: "{{ _nfd_packagemanifest.stdout }}"

- name: Generate NFD operator file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml
  template:
    src: nfd-operator.j2
    dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml"

- name: Create NFD operator
  shell: |
    oc apply -f "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml"

# Counts the CSVs carrying the NFD operator label whose phase is Succeeded;
# polls up to 30 times, 30 seconds apart, until exactly one is reported.
- name: Wait until NFD Operator CSV has status Succeeded
  shell: |
    oc get csv -n openshift-nfd \
      -l operators.coreos.com/nfd.openshift-nfd \
      --no-headers \
      -o custom-columns='name:metadata.name,phase:status.phase' | \
      grep -i succeeded | wc -l
  register: _nfd_csv_status
  retries: 30
  delay: 30
  # Compare numerically: some wc implementations pad the count with spaces
  until: (_nfd_csv_status.stdout | trim | int) == 1
  changed_when: false
  vars:
    ansible_callback_diy_runner_retry_msg: >-
      {%- set result = ansible_callback_diy.result.output -%}
      {%- set retries_left = result.retries - result.attempts -%}
      Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ...

# NOTE(review): presumably this role sets _p_current_ocp_version, which
# nfd-cr.j2 uses for the operand image tag - confirm against the role.
- name: Get OpenShift version
  include_role:
    name: openshift-get-version

- name: Generate NodeFeatureDiscovery CR file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml
  template:
    src: nfd-cr.j2
    dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml"

- name: Create NFD CR
  shell: |
    oc apply -f "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml"
120 changes: 120 additions & 0 deletions
120
automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
---
# NodeFeatureDiscovery custom resource for the NFD operator.
# The operand image tag is rendered from _p_current_ocp_version.
apiVersion: nfd.openshift.io/v1
kind: NodeFeatureDiscovery
metadata:
  name: nfd-instance
  namespace: openshift-nfd
spec:
  # instance is empty by default
  instance: ""
  # False by default
  topologyupdater: false
  operand:
    image: "registry.redhat.io/openshift4/ose-node-feature-discovery:v{{ _p_current_ocp_version }}"
    servicePort: 12000
  workerConfig:
    # configData is passed as one literal string; the commented-out entries
    # inside it are kept as a reference of the available worker options.
    configData: |
      core:
      #  labelWhiteList:
      #  noPublish: false
        sleepInterval: 60s
      #  sources: [all]
      #  klog:
      #    addDirHeader: false
      #    alsologtostderr: false
      #    logBacktraceAt:
      #    logtostderr: true
      #    skipHeaders: false
      #    stderrthreshold: 2
      #    v: 0
      #    vmodule:
      ##   NOTE: the following options are not dynamically run-time
      ##          configurable and require a nfd-worker restart to take effect
      ##          after being changed
      #    logDir:
      #    logFile:
      #    logFileMaxSize: 1800
      #    skipLogHeaders: false
      sources:
      #  cpu:
      #    cpuid:
      ##     NOTE: whitelist has priority over blacklist
      #      attributeBlacklist:
      #        - "BMI1"
      #        - "BMI2"
      #        - "CLMUL"
      #        - "CMOV"
      #        - "CX16"
      #        - "ERMS"
      #        - "F16C"
      #        - "HTT"
      #        - "LZCNT"
      #        - "MMX"
      #        - "MMXEXT"
      #        - "NX"
      #        - "POPCNT"
      #        - "RDRAND"
      #        - "RDSEED"
      #        - "RDTSCP"
      #        - "SGX"
      #        - "SSE"
      #        - "SSE2"
      #        - "SSE3"
      #        - "SSE4.1"
      #        - "SSE4.2"
      #        - "SSSE3"
      #      attributeWhitelist:
      #  kernel:
      #    kconfigFile: "/path/to/kconfig"
      #    configOpts:
      #      - "NO_HZ"
      #      - "X86"
      #      - "DMI"
        pci:
          deviceClassWhitelist:
            - "0200"
            - "03"
            - "12"
          deviceLabelFields:
      #      - "class"
            - "vendor"
      #      - "device"
      #      - "subsystem_vendor"
      #      - "subsystem_device"
      #  usb:
      #    deviceClassWhitelist:
      #      - "0e"
      #      - "ef"
      #      - "fe"
      #      - "ff"
      #    deviceLabelFields:
      #      - "class"
      #      - "vendor"
      #      - "device"
      #  custom:
      #    - name: "my.kernel.feature"
      #      matchOn:
      #        - loadedKMod: ["example_kmod1", "example_kmod2"]
      #    - name: "my.pci.feature"
      #      matchOn:
      #        - pciId:
      #            class: ["0200"]
      #            vendor: ["15b3"]
      #            device: ["1014", "1017"]
      #        - pciId :
      #            vendor: ["8086"]
      #            device: ["1000", "1100"]
      #    - name: "my.usb.feature"
      #      matchOn:
      #        - usbId:
      #            class: ["ff"]
      #            vendor: ["03e7"]
      #            device: ["2485"]
      #        - usbId:
      #            class: ["fe"]
      #            vendor: ["1a6e"]
      #            device: ["089a"]
      #    - name: "my.combined.feature"
      #      matchOn:
      #        - pciId:
      #            vendor: ["15b3"]
      #            device: ["1014", "1017"]
      #          loadedKMod : ["vendor_kmod1", "vendor_kmod2"]
22 changes: 22 additions & 0 deletions
22
automation-roles/40-configure-infra/nfd-operator/templates/nfd-operator.j2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
---
# OperatorGroup and Subscription that install the NFD operator from the
# redhat-operators catalog into the openshift-nfd namespace.
apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
  name: openshift-nfd-og
  namespace: openshift-nfd
spec:
  upgradeStrategy: Default
---
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  labels:
    operators.coreos.com/nfd.openshift-nfd: ""
  name: nfd
  namespace: openshift-nfd
spec:
  # Quoted so that a numeric-looking channel (for example 4.12) or an empty
  # value is still rendered as a YAML string rather than a float or null.
  channel: "{{ _nfd_channel }}"
  installPlanApproval: Automatic
  name: nfd
  source: redhat-operators
  sourceNamespace: openshift-marketplace
63 changes: 63 additions & 0 deletions
63
automation-roles/40-configure-infra/nvidia-operator/tasks/main.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
---
# Installs the NVIDIA GPU operator in the nvidia-gpu-operator namespace,
# waits for its CSV to reach phase Succeeded, applies the ClusterPolicy
# rendered from nvidia-cluster-policy-cr.j2 and waits for it to become ready.

- name: Create nvidia-gpu-operator OpenShift project
  shell: |
    oc create ns nvidia-gpu-operator
  register: _nvidia_create_ns
  # Re-runs must tolerate an existing namespace, but any other error
  # (for example an expired login) should stop the play here instead of
  # being hidden behind "|| true".
  failed_when:
    - _nvidia_create_ns.rc != 0
    - "'AlreadyExists' not in _nvidia_create_ns.stderr"
  changed_when: _nvidia_create_ns.rc == 0

- name: Retrieve default channel for the NVIDIA GPU manifest
  shell: |
    oc get packagemanifest gpu-operator-certified -o jsonpath='{.status.defaultChannel}'
  register: _nvidia_packagemanifest
  changed_when: false
  # An empty channel would render an invalid Subscription further down
  failed_when: >-
    _nvidia_packagemanifest.rc != 0 or
    (_nvidia_packagemanifest.stdout | trim | length) == 0

- name: Set the NVIDIA subscription channel to the package manifest default
  set_fact:
    _nvidia_channel: "{{ _nvidia_packagemanifest.stdout }}"

- name: Generate NVIDIA operator file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml
  template:
    src: nvidia-operator.j2
    dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml"

- name: Create NVIDIA operator
  shell: |
    oc apply -f "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml"

# Counts the CSVs carrying the GPU operator label whose phase is Succeeded;
# polls up to 30 times, 30 seconds apart, until exactly one is reported.
- name: Wait until NVIDIA Operator CSV has status Succeeded
  shell: |
    oc get csv -n nvidia-gpu-operator \
      -l operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator \
      --no-headers \
      -o custom-columns='name:metadata.name,phase:status.phase' | \
      grep -i succeeded | wc -l
  register: _nvidia_csv_status
  retries: 30
  delay: 30
  # Compare numerically: some wc implementations pad the count with spaces
  until: (_nvidia_csv_status.stdout | trim | int) == 1
  changed_when: false
  vars:
    ansible_callback_diy_runner_retry_msg: >-
      {%- set result = ansible_callback_diy.result.output -%}
      {%- set retries_left = result.retries - result.attempts -%}
      Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ...

- name: Generate NVIDIA ClusterPolicy CR file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml
  template:
    src: nvidia-cluster-policy-cr.j2
    dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml"

- name: Create NVIDIA ClusterPolicy CR
  shell: |
    oc apply -f "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml"

# Polls up to 30 times, 30 seconds apart, until status.state is "ready".
- name: Wait until NVIDIA ClusterPolicy has status Ready
  shell: |
    oc get clusterpolicies.nvidia.com gpu-cluster-policy \
      --no-headers \
      -o custom-columns='name:metadata.name,phase:status.state' | \
      grep -iw ready | wc -l
  # -w is required: a plain "grep -i ready" also matches the state
  # "notReady", which would end the wait before the policy is ready.
  register: _nvidia_cluster_policy_status
  retries: 30
  delay: 30
  until: (_nvidia_cluster_policy_status.stdout | trim | int) == 1
  changed_when: false
  vars:
    ansible_callback_diy_runner_retry_msg: >-
      {%- set result = ansible_callback_diy.result.output -%}
      {%- set retries_left = result.retries - result.attempts -%}
      Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ...
98 changes: 98 additions & 0 deletions
98
automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-cluster-policy-cr.j2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
---
# ClusterPolicy custom resource for the NVIDIA GPU operator.
# The template contains no Jinja2 expressions; every value is static.
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
  name: gpu-cluster-policy
spec:
  cdi:
    default: false
    enabled: false

  daemonsets:
    rollingUpdate:
      maxUnavailable: '1'
    updateStrategy: RollingUpdate

  dcgm:
    enabled: true

  dcgmExporter:
    config:
      name: ''
    enabled: true
    serviceMonitor:
      enabled: true

  devicePlugin:
    config:
      default: ''
      name: ''
    enabled: true

  driver:
    certConfig:
      name: ''
    enabled: true
    kernelModuleConfig:
      name: ''
    licensingConfig:
      configMapName: ''
      nlsEnabled: true
    repoConfig:
      configMapName: ''
    upgradePolicy:
      autoUpgrade: true
      drain:
        deleteEmptyDir: false
        enable: false
        force: false
        timeoutSeconds: 300
      maxParallelUpgrades: 1
      maxUnavailable: '25%'
      podDeletion:
        deleteEmptyDir: false
        force: false
        timeoutSeconds: 300
      waitForCompletion:
        timeoutSeconds: 0
    useNvidiaDriverCRD: false
    useOpenKernelModules: false
    virtualTopology:
      config: ''

  gds:
    enabled: false

  gfd:
    enabled: true

  kataManager:
    config:
      artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses

  mig:
    strategy: single

  migManager:
    config:
      default: all-disabled
      name: default-mig-parted-config
    enabled: true

  nodeStatusExporter:
    enabled: true

  operator:
    defaultRuntime: crio
    initContainer: {}
    runtimeClass: nvidia
    use_ocp_driver_toolkit: true

  sandboxDevicePlugin:
    enabled: true

  sandboxWorkloads:
    defaultWorkload: container
    enabled: false

  toolkit:
    enabled: true
    installDir: /usr/local/nvidia

  validator:
    plugin:
      env:
        # Kept as a string: environment variable values must not be booleans
        - name: WITH_WORKLOAD
          value: 'false'

  vfioManager:
    enabled: true

  vgpuDeviceManager:
    config:
      default: default
    enabled: true

  vgpuManager:
    enabled: false
Oops, something went wrong.