Skip to content

Commit

Permalink
Habana v2 - updated habana role for OCP 4.17 and RHOAI 2.16 and above (
Browse files Browse the repository at this point in the history
…#9018)

* Updated NFD as per the changes required to deploy NFD

* Updated for NFD

* Updated for NFD

* Updated for starting csv to nil

* Updated with pauses

* Updated NFD to operator install role

* Updated habana deviceconfig to clusterpolicy as per the new requirements

---------

Co-authored-by: Ritesh <[email protected]>
  • Loading branch information
ritzshah and rshah-redhat authored Jan 27, 2025
1 parent fb22b83 commit ac4d637
Show file tree
Hide file tree
Showing 6 changed files with 254 additions and 51 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,13 @@ kmm_ignition_version: "3.2.0"
# Node Feature Discovery (NFD) operator defaults.
nfd_operator_namespace: "openshift-nfd"
nfd_operator_channel: "stable"
nfd_operator_automatic_install_plan_approval: true
# Intentionally empty (previously pinned to "nfd.4.12.0-202307182142").
# NOTE(review): presumably an empty value lets the install role pick the CSV
# offered by the channel - confirm against the install_operator role.
nfd_operator_starting_csv: ""
nfd_operator_wait_for_deploy: true
# Optional catalog snapshot settings; all are forwarded to the install role.
nfd_operator_use_catalog_snapshot: false
nfd_operator_catalogsource_name: ""
nfd_operator_catalog_snapshot_image: ""
nfd_operator_catalog_snapshot_image_tag: ""
# Operand image referenced by the NodeFeatureDiscovery custom resource template.
nfd_operator_operand_image: "registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9:v4.17"

# ------------------------------------------------
# Habana Gaudi Operator
Expand All @@ -40,6 +41,7 @@ habana_gaudi_operator_use_catalog_snapshot: false
# Optional catalog snapshot settings for the Habana Gaudi operator.
habana_gaudi_operator_catalogsource_name: ""
habana_gaudi_operator_catalog_snapshot_image: ""
habana_gaudi_operator_catalog_snapshot_image_tag: ""
# Tag used for every image entry in the Habana ClusterPolicy template.
habana_gaudi_image_tag: "1.19.1-26"
# NOTE(review): the three 1.10.0 values below are not referenced by the
# ClusterPolicy template shown in this change - confirm whether they are
# still consumed elsewhere.
habana_gaudi_image_version: "1.10.0"
habana_gaudi_deviceplugin_version: "1.10.0"
habana_gaudi_driver_version: "1.10.0-494"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
retries: 10
delay: 5

# Fixed 60 s wait placed before the Habana operatorgroup is created.
- name: 60 second pause for Habana Gaudi GPU namespace check
  ansible.builtin.pause:
    seconds: 60

- name: Create Habana Gaudi GPU operatorgroup
kubernetes.core.k8s:
state: present
Expand All @@ -19,6 +23,10 @@
retries: 25
delay: 5

# Fixed 60 s wait placed before the Habana subscription is created.
- name: 60 second pause for Habana Gaudi GPU operator setup
  ansible.builtin.pause:
    seconds: 60

- name: Create Habana Gaudi GPU subscription
kubernetes.core.k8s:
state: present
Expand All @@ -28,15 +36,15 @@
retries: 50
delay: 5

# Fixed 120 s wait after the Habana subscription is created and before the
# ClusterPolicy is applied.
- name: 120 second pause for Habana Gaudi GPU subscription
  ansible.builtin.pause:
    seconds: 120

# Render the Habana ClusterPolicy template and apply it to the cluster.
# The apply is retried up to 300 times, 5 s apart, until it succeeds.
# NOTE(review): the long retry window presumably covers the time the operator
# needs to register its CRD - confirm.
- name: Setup Habana Gaudi Cluster policy setup
  kubernetes.core.k8s:
    state: present
    definition: "{{ lookup('template', 'habana-gaudi/habana_gpu_clusterpolicy.yaml.j2') | from_yaml }}"
  register: devconfig_clusterpolicy_result
  until: devconfig_clusterpolicy_result is successful
  retries: 300
  delay: 5
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
retries: 25
delay: 5

# Fixed 60 s wait placed before the KMM subscription is created.
- name: 60 second pause for KMM operatorgroup
  ansible.builtin.pause:
    seconds: 60

- name: Create KMM subscription
kubernetes.core.k8s:
state: present
Expand All @@ -25,7 +29,7 @@
retries: 25
delay: 5

# Fixed 60 s wait after the KMM subscription is created.
- name: 60 second pause for KMM subscription
  ansible.builtin.pause:
    seconds: 60

Expand Down
Original file line number Diff line number Diff line change
@@ -1,35 +1,28 @@
---
# Make sure the namespace that hosts the NFD operator exists.
# The former `delay` keyword was dropped: it only takes effect together with
# `until`/`retries`, neither of which this task sets.
- name: "Ensure nfd namespace exists"
  kubernetes.core.k8s:
    state: present
    api_version: v1
    kind: Namespace
    name: "{{ nfd_operator_namespace }}"

# Apply the NFD OperatorGroup rendered from its template, retrying up to
# 25 times at 5 s intervals for as long as the apply reports a failure.
- name: Create NodeFeatureDiscovery operatorgroup
  register: operatorgroup_result
  retries: 25
  delay: 5
  until: operatorgroup_result is not failed
  kubernetes.core.k8s:
    definition: "{{ lookup('template', 'nfd/nodefeature_discovery_operatorgroup.yaml.j2') | from_yaml }}"
    state: present

# Apply the NFD Subscription rendered from its template, retrying up to
# 25 times at 5 s intervals for as long as the apply reports a failure.
- name: Create NodeFeaturEDiscovery subscription
  register: subscription_result
  retries: 25
  delay: 5
  until: subscription_result is not failed
  kubernetes.core.k8s:
    definition: "{{ lookup('template', 'nfd/nodefeature_discovery_sub.yaml.j2') | from_yaml }}"
    state: present
# Install the NFD operator by delegating to the shared install_operator role.
# Every nfd_operator_* setting is mapped onto the matching install_operator_*
# input; optional settings fall back to a default of the same type as the
# value they replace (boolean for flags, empty string for names and images).
- name: Install NFD operator
  ansible.builtin.include_role:
    name: install_operator
  vars:
    install_operator_action: install
    install_operator_name: nfd
    install_operator_namespace: "{{ nfd_operator_namespace }}"
    install_operator_channel: "{{ nfd_operator_channel }}"
    install_operator_catalog: redhat-operators
    install_operator_automatic_install_plan_approval: "{{ nfd_operator_automatic_install_plan_approval | default(true) }}"
    install_operator_starting_csv: "{{ nfd_operator_starting_csv | default('') }}"
    install_operator_catalogsource_setup: "{{ nfd_operator_use_catalog_snapshot | default(false) }}"
    install_operator_catalogsource_name: "{{ nfd_operator_catalogsource_name | default('') }}"
    install_operator_catalogsource_namespace: "{{ nfd_operator_namespace }}"
    install_operator_catalogsource_image: "{{ nfd_operator_catalog_snapshot_image | default('') }}"
    install_operator_catalogsource_image_tag: "{{ nfd_operator_catalog_snapshot_image_tag | default('') }}"
    install_operator_manage_namespaces:
      - "{{ nfd_operator_namespace }}"

# Apply the NodeFeatureDiscovery custom resource rendered from its template.
# Retried up to 30 times, 20 s apart, for as long as the apply reports a
# failure.
- name: Create NodeFeatureDiscovery Custom Resource
  kubernetes.core.k8s:
    state: present
    definition: "{{ lookup('template', 'nfd/nodefeature_discovery_cr.yaml.j2') | from_yaml }}"
  register: result
  until: result is not failed
  retries: 30
  delay: 20
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
---
# Habana AI operator ClusterPolicy (Jinja2 template).
# Every image tag below is taken from the habana_gaudi_image_tag variable, so
# all operator components are deployed at the same release.
kind: ClusterPolicy
apiVersion: habanalabs.habana.ai/v1
metadata:
  name: habana-ai
spec:
  bmc_monitoring:
    image:
      repository: vault.habana.ai/habana-bmc-exporter/bmc-exporter
      tag: "{{ habana_gaudi_image_tag }}"
    resources:
      limits:
        cpu: 250m
        memory: 250Mi
      requests:
        cpu: 150m
        memory: 100Mi
  device_plugin:
    image:
      repository: vault.habana.ai/docker-k8s-device-plugin/docker-k8s-device-plugin
      tag: "{{ habana_gaudi_image_tag }}"
    resources:
      limits:
        cpu: 20m
        memory: 64Mi
      requests:
        cpu: 10m
        memory: 32Mi
  driver:
    driver_loader:
      # One driver-installer image per supported node operating system.
      images:
        rhel_8.6:
          repository: vault.habana.ai/habana-ai-operator/driver/rhel8.6/driver-installer
          tag: "{{ habana_gaudi_image_tag }}"
        rhel_9.2:
          repository: vault.habana.ai/habana-ai-operator/driver/rhel9.2/driver-installer
          tag: "{{ habana_gaudi_image_tag }}"
        rhel_9.4:
          repository: vault.habana.ai/habana-ai-operator/driver/rhel9.4/driver-installer
          tag: "{{ habana_gaudi_image_tag }}"
        tencentos_3.1:
          repository: vault.habana.ai/habana-ai-operator/driver/tencentos3.1/driver-installer
          tag: "{{ habana_gaudi_image_tag }}"
        ubuntu_22.04:
          repository: vault.habana.ai/habana-ai-operator/driver/ubuntu22.04/driver-installer
          tag: "{{ habana_gaudi_image_tag }}"
      mlnx_ofed_repo_path: artifactory/gaudi-installer/deps
      mlnx_ofed_version: mlnx-ofed-5.8-2.0.3.0-rhel8.4-x86_64.tar.gz
      repo_path: artifactory/gaudi-installer/repos
      repo_server: vault.habana.ai
      resources:
        limits:
          cpu: 4000m
          memory: 16Gi
        requests:
          cpu: 2000m
          memory: 8Gi
    driver_runner:
      image:
        repository: vault.habana.ai/habana-ai-operator/driver/ubuntu22.04/driver-installer
        tag: "{{ habana_gaudi_image_tag }}"
      resources:
        limits:
          cpu: 20m
          memory: 64Mi
        requests:
          cpu: 10m
          memory: 32Mi
  feature_discovery:
    nfd_plugin: false
    runner:
      image:
        repository: vault.habana.ai/habana-ai-operator/habanalabs-feature-discovery
        tag: "{{ habana_gaudi_image_tag }}"
      resources:
        limits:
          cpu: 20m
          memory: 64Mi
        requests:
          cpu: 10m
          memory: 32Mi
  image_registry: vault.habana.ai
  metric_exporter:
    interval: 20
    port: 41611
    runner:
      image:
        repository: vault.habana.ai/gaudi-metric-exporter/metric-exporter
        tag: "{{ habana_gaudi_image_tag }}"
      resources:
        limits:
          cpu: 150m
          memory: 120Mi
        requests:
          cpu: 100m
          memory: 100Mi
  runtime:
    configuration:
      container_engine: crio
    runner:
      image:
        repository: vault.habana.ai/habana-ai-operator/habana-container-runtime
        tag: "{{ habana_gaudi_image_tag }}"
      resources:
        limits:
          cpu: 20m
          memory: 64Mi
        requests:
          cpu: 10m
          memory: 32Mi
Original file line number Diff line number Diff line change
@@ -1,37 +1,124 @@
---
# NodeFeatureDiscovery custom resource (Jinja2 template).
# The namespace and operand image come from the nfd_operator_namespace and
# nfd_operator_operand_image variables. Labels in the habana.ai namespace are
# allowed via extraLabelNs. Lines starting with '#' inside the two configData
# blocks are part of the embedded configuration text (commented-out examples),
# not YAML comments of this manifest.
kind: NodeFeatureDiscovery
apiVersion: nfd.openshift.io/v1
metadata:
  name: nfd-instance
  namespace: "{{ nfd_operator_namespace }}"
spec:
  customConfig:
    configData: |
      #    - name: "more.kernel.features"
      #      matchOn:
      #      - loadedKMod: ["example_kmod3"]
      #    - name: "more.features.by.nodename"
      #      value: customValue
      #      matchOn:
      #      - nodename: ["special-.*-node-.*"]
  enableTaints: false
  extraLabelNs:
    - habana.ai
  instance: ''
  operand:
    image: '{{ nfd_operator_operand_image }}'
    imagePullPolicy: IfNotPresent
    servicePort: 12000
  prunerOnDelete: false
  topologyUpdater: false
  workerConfig:
    configData: |
      core:
      #  labelWhiteList:
      #  noPublish: false
        sleepInterval: 60s
      #  sources: [all]
      #  klog:
      #    addDirHeader: false
      #    alsologtostderr: false
      #    logBacktraceAt:
      #    logtostderr: true
      #    skipHeaders: false
      #    stderrthreshold: 2
      #    v: 0
      #    vmodule:
      ## NOTE: the following options are not dynamically run-time
      ## configurable and require a nfd-worker restart to take effect
      ## after being changed
      #    logDir:
      #    logFile:
      #    logFileMaxSize: 1800
      #    skipLogHeaders: false
      sources:
      #  cpu:
      #    cpuid:
      ## NOTE: whitelist has priority over blacklist
      #      attributeBlacklist:
      #        - "BMI1"
      #        - "BMI2"
      #        - "CLMUL"
      #        - "CMOV"
      #        - "CX16"
      #        - "ERMS"
      #        - "F16C"
      #        - "HTT"
      #        - "LZCNT"
      #        - "MMX"
      #        - "MMXEXT"
      #        - "NX"
      #        - "POPCNT"
      #        - "RDRAND"
      #        - "RDSEED"
      #        - "RDTSCP"
      #        - "SGX"
      #        - "SSE"
      #        - "SSE2"
      #        - "SSE3"
      #        - "SSE4.1"
      #        - "SSE4.2"
      #        - "SSSE3"
      #      attributeWhitelist:
      #  kernel:
      #    kconfigFile: "/path/to/kconfig"
      #    configOpts:
      #      - "NO_HZ"
      #      - "X86"
      #      - "DMI"
        pci:
          deviceClassWhitelist:
            - "0200"
            - "03"
            - "12"
          deviceLabelFields:
      #      - "class"
            - "vendor"

      #      - "device"
      #      - "subsystem_vendor"
      #      - "subsystem_device"
      #  usb:
      #    deviceClassWhitelist:
      #      - "0e"
      #      - "ef"
      #      - "fe"
      #      - "ff"
      #    deviceLabelFields:
      #      - "class"
      #      - "vendor"
      #      - "device"
      #  custom:
      #    - name: "my.kernel.feature"
      #      matchOn:
      #        - loadedKMod: ["example_kmod1", "example_kmod2"]
      #    - name: "my.pci.feature"
      #      matchOn:
      #        - pciId:
      #            class: ["0200"]
      #            vendor: ["15b3"]
      #            device: ["1014", "1017"]
      #        - pciId :
      #            vendor: ["8086"]
      #            device: ["1000", "1100"]
      #    - name: "my.usb.feature"
      #      matchOn:
      #        - usbId:
      #            class: ["ff"]
      #            vendor: ["03e7"]
      #            device: ["2485"]
      #        - usbId:
      #            class: ["fe"]
      #            vendor: ["1a6e"]
      #            device: ["089a"]
      #    - name: "my.combined.feature"
      #      matchOn:
      #        - pciId:
      #            vendor: ["15b3"]
      #            device: ["1014", "1017"]
      #          loadedKMod : ["vendor_kmod1", "vendor_kmod2"]

0 comments on commit ac4d637

Please sign in to comment.