Merge pull request #600 from IBM/fk-misc

watsonx.ai and various other changes
IBM · Dec 15, 2023 · 834ba4e · 834ba4e
2 parents 2767371 + bf2878d
commit 834ba4e
Show file tree

Hide file tree

Showing 29 changed files with 522 additions and 53 deletions.
diff --git a/automation-roles/40-configure-infra/configure-openshift/tasks/main.yml b/automation-roles/40-configure-infra/configure-openshift/tasks/main.yml
@@ -63,5 +63,11 @@
 - name: Configure Multi-Cloud Object Gateway
   include_role:
     name: openshift-mcg
+  vars:
+    _p_openshift_cluster: "{{ current_openshift_cluster }}"
+
+- name: Configure GPU for the OpenShift cluster
+  include_role:
+    name: openshift-gpu  # receives the same _p_openshift_cluster value as openshift-mcg
   vars:
     _p_openshift_cluster: "{{ current_openshift_cluster }}"
diff --git a/automation-roles/40-configure-infra/nfd-operator/tasks/main.yml b/automation-roles/40-configure-infra/nfd-operator/tasks/main.yml
@@ -0,0 +1,51 @@
+---
+- name: Create openshift-nfd OpenShift project  # idempotent; real oc errors now fail the task
+  shell: |
+    oc create ns openshift-nfd --dry-run=client -o yaml | oc apply -f -
+
+- name: Retrieve default channel for Node Feature Discovery manifest
+  shell:
+    oc get packagemanifest nfd -o jsonpath='{.status.defaultChannel}'
+  register: _nfd_packagemanifest  # stdout holds .status.defaultChannel of the nfd package manifest
+
+- set_fact:
+    _nfd_channel: "{{ _nfd_packagemanifest.stdout }}"  # read by templates/nfd-operator.j2 as the Subscription channel
+
+- name: Generate NFD operator file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml
+  template:
+    src: nfd-operator.j2  # renders an OperatorGroup and a Subscription in openshift-nfd
+    dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml"
+
+- name: Create NFD operator  # applies the manifest rendered by the previous task
+  shell: |
+    oc apply -f {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml
+
+- name: Wait until NFD Operator CSV has status Succeeded
+  shell: |
+     oc get csv -n openshift-nfd \
+      -l operators.coreos.com/nfd.openshift-nfd \
+      --no-headers \
+      -o custom-columns='name:metadata.name,phase:status.phase' | \
+      grep -i succeeded | wc -l
+  register: _nfd_csv_status  # stdout is the count of labelled CSV rows containing "succeeded"
+  retries: 30
+  delay: 30  # 30 retries with a 30 s delay: gives up after roughly 15 minutes
+  until: _nfd_csv_status.stdout == "1"
+  vars:  # retry message template; presumably consumed by the diy callback plugin - confirm it is enabled
+    ansible_callback_diy_runner_retry_msg: >-
+      {%- set result = ansible_callback_diy.result.output -%}
+      {%- set retries_left = result.retries - result.attempts -%}
+      Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ...
+
+- name: Get OpenShift version
+  include_role:
+    name: openshift-get-version  # presumably sets _p_current_ocp_version, which nfd-cr.j2 reads - confirm
+
+- name: Generate NodeFeatureDiscovery CR file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml
+  template:
+    src: nfd-cr.j2
+    dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml"
+
+- name: Create NFD CR  # runs only after the operator CSV wait earlier in this file has passed
+  shell: |
+    oc apply -f {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml
diff --git a/automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2 b/automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2
@@ -0,0 +1,120 @@
+---
+apiVersion: nfd.openshift.io/v1
+kind: NodeFeatureDiscovery
+metadata:
+  name: nfd-instance
+  namespace: openshift-nfd
+spec:
+  instance: "" # instance is empty by default
+  topologyupdater: false # False by default
+  operand:
+    image: registry.redhat.io/openshift4/ose-node-feature-discovery:v{{ _p_current_ocp_version }}  # tag is "v" + _p_current_ocp_version
+    servicePort: 12000
+  workerConfig:  # configData is an embedded YAML document; only core.sleepInterval and sources.pci are uncommented
+    configData: |
+      core:
+      #  labelWhiteList:
+      #  noPublish: false
+        sleepInterval: 60s
+      #  sources: [all]
+      #  klog:
+      #    addDirHeader: false
+      #    alsologtostderr: false
+      #    logBacktraceAt:
+      #    logtostderr: true
+      #    skipHeaders: false
+      #    stderrthreshold: 2
+      #    v: 0
+      #    vmodule:
+      ##   NOTE: the following options are not dynamically run-time 
+      ##          configurable and require a nfd-worker restart to take effect
+      ##          after being changed
+      #    logDir:
+      #    logFile:
+      #    logFileMaxSize: 1800
+      #    skipLogHeaders: false
+      sources:
+      #  cpu:
+      #    cpuid:
+      ##     NOTE: whitelist has priority over blacklist
+      #      attributeBlacklist:
+      #        - "BMI1"
+      #        - "BMI2"
+      #        - "CLMUL"
+      #        - "CMOV"
+      #        - "CX16"
+      #        - "ERMS"
+      #        - "F16C"
+      #        - "HTT"
+      #        - "LZCNT"
+      #        - "MMX"
+      #        - "MMXEXT"
+      #        - "NX"
+      #        - "POPCNT"
+      #        - "RDRAND"
+      #        - "RDSEED"
+      #        - "RDTSCP"
+      #        - "SGX"
+      #        - "SSE"
+      #        - "SSE2"
+      #        - "SSE3"
+      #        - "SSE4.1"
+      #        - "SSE4.2"
+      #        - "SSSE3"
+      #      attributeWhitelist:
+      #  kernel:
+      #    kconfigFile: "/path/to/kconfig"
+      #    configOpts:
+      #      - "NO_HZ"
+      #      - "X86"
+      #      - "DMI"
+        pci:
+          deviceClassWhitelist:
+            - "0200"
+            - "03"
+            - "12"
+          deviceLabelFields:
+      #      - "class"
+            - "vendor"
+      #      - "device"
+      #      - "subsystem_vendor"
+      #      - "subsystem_device"
+      #  usb:
+      #    deviceClassWhitelist:
+      #      - "0e"
+      #      - "ef"
+      #      - "fe"
+      #      - "ff"
+      #    deviceLabelFields:
+      #      - "class"
+      #      - "vendor"
+      #      - "device"
+      #  custom:
+      #    - name: "my.kernel.feature"
+      #      matchOn:
+      #        - loadedKMod: ["example_kmod1", "example_kmod2"]
+      #    - name: "my.pci.feature"
+      #      matchOn:
+      #        - pciId:
+      #            class: ["0200"]
+      #            vendor: ["15b3"]
+      #            device: ["1014", "1017"]
+      #        - pciId :
+      #            vendor: ["8086"]
+      #            device: ["1000", "1100"]
+      #    - name: "my.usb.feature"
+      #      matchOn:
+      #        - usbId:
+      #          class: ["ff"]
+      #          vendor: ["03e7"]
+      #          device: ["2485"]
+      #        - usbId:
+      #          class: ["fe"]
+      #          vendor: ["1a6e"]
+      #          device: ["089a"]
+      #    - name: "my.combined.feature"
+      #      matchOn:
+      #        - pciId:
+      #            vendor: ["15b3"]
+      #            device: ["1014", "1017"]
+      #          loadedKMod : ["vendor_kmod1", "vendor_kmod2"]
diff --git a/automation-roles/40-configure-infra/nfd-operator/templates/nfd-operator.j2 b/automation-roles/40-configure-infra/nfd-operator/templates/nfd-operator.j2
@@ -0,0 +1,22 @@
+---
+apiVersion: operators.coreos.com/v1
+kind: OperatorGroup
+metadata:
+  name: openshift-nfd-og
+  namespace: openshift-nfd
+spec:  # NOTE(review): no targetNamespaces are listed - confirm the intended install scope
+  upgradeStrategy: Default
+---
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  labels:
+    operators.coreos.com/nfd.openshift-nfd: ""
+  name: nfd
+  namespace: openshift-nfd
+spec:
+  channel: "{{ _nfd_channel }}"  # quoted so a numeric-looking channel such as 4.10 stays a string
+  installPlanApproval: Automatic
+  name: nfd
+  source: redhat-operators
+  sourceNamespace: openshift-marketplace
diff --git a/automation-roles/40-configure-infra/nvidia-operator/tasks/main.yml b/automation-roles/40-configure-infra/nvidia-operator/tasks/main.yml
@@ -0,0 +1,63 @@
+---
+- name: Create nvidia-gpu-operator OpenShift project  # idempotent; real oc errors now fail the task
+  shell: |
+    oc create ns nvidia-gpu-operator --dry-run=client -o yaml | oc apply -f -
+
+- name: Retrieve default channel for the NVIDIA GPU manifest
+  shell:
+    oc get packagemanifest gpu-operator-certified -o jsonpath='{.status.defaultChannel}'
+  register: _nvidia_packagemanifest  # stdout holds .status.defaultChannel of gpu-operator-certified
+
+- set_fact:
+    _nvidia_channel: "{{ _nvidia_packagemanifest.stdout }}"  # presumably read by nvidia-operator.j2 (not visible here) - confirm
+
+- name: Generate NVIDIA operator file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml
+  template:
+    src: nvidia-operator.j2
+    dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml"
+
+- name: Create NVIDIA operator  # applies the manifest rendered by the previous task
+  shell: |
+    oc apply -f {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml
+
+- name: Wait until NVIDIA Operator CSV has status Succeeded
+  shell: |
+     oc get csv -n nvidia-gpu-operator \
+      -l operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator \
+      --no-headers \
+      -o custom-columns='name:metadata.name,phase:status.phase' | \
+      grep -i succeeded | wc -l
+  register: _nvidia_csv_status  # stdout is the count of labelled CSV rows containing "succeeded"
+  retries: 30
+  delay: 30  # 30 retries with a 30 s delay: gives up after roughly 15 minutes
+  until: _nvidia_csv_status.stdout == "1"
+  vars:  # retry message template; presumably consumed by the diy callback plugin - confirm it is enabled
+    ansible_callback_diy_runner_retry_msg: >-
+      {%- set result = ansible_callback_diy.result.output -%}
+      {%- set retries_left = result.retries - result.attempts -%}
+      Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ...
+
+- name: Generate NVIDIA ClusterPolicy CR file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml
+  template:
+    src: nvidia-cluster-policy-cr.j2  # defines the ClusterPolicy named gpu-cluster-policy
+    dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml"
+
+- name: Create NVIDIA ClusterPolicy CR  # runs only after the operator CSV wait earlier in this file has passed
+  shell: |
+    oc apply -f {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml
+
+- name: Wait until NVIDIA ClusterPolicy has status Ready
+  shell: |
+    oc get clusterpolicies.nvidia.com gpu-cluster-policy \
+     --no-headers \
+     -o custom-columns='name:metadata.name,phase:status.state' | \
+     grep -iw ready | wc -l
+  register: _nvidia_cluster_policy_status  # grep -w matches whole words only, so state "notReady" is not counted
+  retries: 30
+  delay: 30  # 30 retries with a 30 s delay: gives up after roughly 15 minutes
+  until: _nvidia_cluster_policy_status.stdout == "1"
+  vars:  # retry message template; presumably consumed by the diy callback plugin - confirm it is enabled
+    ansible_callback_diy_runner_retry_msg: >-
+      {%- set result = ansible_callback_diy.result.output -%}
+      {%- set retries_left = result.retries - result.attempts -%}
+      Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ...
diff --git a/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-cluster-policy-cr.j2 b/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-cluster-policy-cr.j2
@@ -0,0 +1,98 @@
+---
+apiVersion: nvidia.com/v1
+kind: ClusterPolicy
+metadata:
+  name: gpu-cluster-policy  # no namespace is set; tasks/main.yml polls this object by name without -n
+spec:
+  cdi:
+    default: false
+    enabled: false
+  daemonsets:
+    rollingUpdate:
+      maxUnavailable: "1"
+    updateStrategy: RollingUpdate
+  dcgm:
+    enabled: true
+  dcgmExporter:
+    config:
+      name: ""
+    enabled: true
+    serviceMonitor:
+      enabled: true
+  devicePlugin:
+    config:
+      default: ""
+      name: ""
+    enabled: true
+  driver:
+    certConfig:
+      name: ""
+    enabled: true
+    kernelModuleConfig:
+      name: ""
+    licensingConfig:
+      configMapName: ""
+      nlsEnabled: true
+    repoConfig:
+      configMapName: ""
+    upgradePolicy:
+      autoUpgrade: true
+      drain:
+        deleteEmptyDir: false
+        enable: false
+        force: false
+        timeoutSeconds: 300
+      maxParallelUpgrades: 1
+      maxUnavailable: 25%
+      podDeletion:
+        deleteEmptyDir: false
+        force: false
+        timeoutSeconds: 300
+      waitForCompletion:
+        timeoutSeconds: 0
+    useNvidiaDriverCRD: false
+    useOpenKernelModules: false
+    virtualTopology:
+      config: ""
+  gds:
+    enabled: false
+  gfd:
+    enabled: true
+  kataManager:
+    config:
+      artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses
+  mig:
+    strategy: single
+  migManager:
+    config:
+      default: all-disabled
+      name: default-mig-parted-config
+    enabled: true
+  nodeStatusExporter:
+    enabled: true
+  operator:
+    defaultRuntime: crio
+    initContainer: {}
+    runtimeClass: nvidia
+    use_ocp_driver_toolkit: true  # NOTE(review): snake_case unlike sibling keys; presumably the CRD's own field name - confirm
+  sandboxDevicePlugin:
+    enabled: true
+  sandboxWorkloads:
+    defaultWorkload: container
+    enabled: false
+  toolkit:
+    enabled: true
+    installDir: /usr/local/nvidia
+  validator:
+    plugin:
+      env:
+      - name: WITH_WORKLOAD
+        value: "false"
+  vfioManager:
+    enabled: true
+  vgpuDeviceManager:
+    config:
+      default: default
+    enabled: true
+  vgpuManager:
+    enabled: false