Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Habana v2 - updated habana role for OCP 4.17 and RHOAI 2.16 and above #9018

Merged
merged 7 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,13 @@ kmm_ignition_version: "3.2.0"
nfd_operator_namespace: "openshift-nfd"
nfd_operator_channel: "stable"
nfd_operator_automatic_install_plan_approval: true
nfd_operator_starting_csv: "nfd.4.12.0-202307182142"
nfd_operator_starting_csv: ""
nfd_operator_wait_for_deploy: true
nfd_operator_use_catalog_snapshot: false
nfd_operator_catalogsource_name: ""
nfd_operator_catalog_snapshot_image: ""
nfd_operator_catalog_snapshot_image_tag: ""
nfd_operator_operand_image: "registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9:v4.17"

# ------------------------------------------------
# Habana Gaudi Operator
Expand All @@ -40,6 +41,7 @@ habana_gaudi_operator_use_catalog_snapshot: false
habana_gaudi_operator_catalogsource_name: ""
habana_gaudi_operator_catalog_snapshot_image: ""
habana_gaudi_operator_catalog_snapshot_image_tag: ""
habana_gaudi_image_tag: "1.19.1-26"
habana_gaudi_image_version: "1.10.0"
habana_gaudi_deviceplugin_version: "1.10.0"
habana_gaudi_driver_version: "1.10.0-494"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
retries: 10
delay: 5

# Fixed 60-second wait before the Habana Gaudi operatorgroup is created.
- name: 60 second pause for Habana Gaudi GPU namespace check
  pause:
    seconds: 60

- name: Create Habana Gaudi GPU operatorgroup
kubernetes.core.k8s:
state: present
Expand All @@ -19,6 +23,10 @@
retries: 25
delay: 5

# Fixed 60-second wait before the Habana Gaudi subscription is created.
- name: 60 second pause for Habana Gaudi GPU operator setup
  pause:
    seconds: 60

- name: Create Habana Gaudi GPU subscription
kubernetes.core.k8s:
state: present
Expand All @@ -28,15 +36,15 @@
retries: 50
delay: 5

- name: 120 second pause for Habana Gaudi GPU operator setup
- name: 120 second pause for Habana Gaudi GPU subscription
pause:
seconds: 120

- name: Setup Habana Gaudi Device Config
- name: Setup Habana Gaudi Cluster policy setup
kubernetes.core.k8s:
state: present
definition: "{{ lookup('template', 'habana-gaudi/habana_gpu_deviceconfig.yaml.j2') | from_yaml }}"
register: devconfig_result
until: devconfig_result is successful
retries: 30
definition: "{{ lookup('template', 'habana-gaudi/habana_gpu_clusterpolicy.yaml.j2') | from_yaml }}"
register: devconfig_clusterpolicy_result
until: devconfig_clusterpolicy_result is successful
retries: 300
delay: 5
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
retries: 25
delay: 5

# Fixed 60-second wait before the KMM subscription is created.
- name: 60 second pause for KMM operatorgroup
  pause:
    seconds: 60

- name: Create KMM subscription
kubernetes.core.k8s:
state: present
Expand All @@ -25,7 +29,7 @@
retries: 25
delay: 5

- name: 60 second pause for Habana Gaudi GPU operator setup
- name: 60 second pause for KMM subscription
pause:
seconds: 60

Expand Down
Original file line number Diff line number Diff line change
@@ -1,35 +1,28 @@
---
- name: "Ensure nfd namespace exists"
kubernetes.core.k8s:
state: present
api_version: v1
kind: Namespace
name: "{{ nfd_operator_namespace }}"
delay: 5

- name: Create NodeFeatureDiscovery operatorgroup
kubernetes.core.k8s:
state: present
definition: "{{ lookup('template', 'nfd/nodefeature_discovery_operatorgroup.yaml.j2') | from_yaml }}"
register: operatorgroup_result
until: operatorgroup_result is not failed
retries: 25
delay: 5

- name: Create NodeFeaturEDiscovery subscription
kubernetes.core.k8s:
state: present
definition: "{{ lookup('template', 'nfd/nodefeature_discovery_sub.yaml.j2') | from_yaml }}"
register: subscription_result
until: subscription_result is not failed
retries: 25
delay: 5
# Installs the NFD operator by including the install_operator role and
# passing the nfd_operator_* settings through as install_operator_* variables.
- name: Install NFD operator
  ansible.builtin.include_role:
    name: install_operator
  vars:
    install_operator_action: install
    install_operator_name: nfd
    install_operator_namespace: "{{ nfd_operator_namespace }}"
    install_operator_channel: "{{ nfd_operator_channel }}"
    install_operator_catalog: redhat-operators
    install_operator_automatic_install_plan_approval: "{{ nfd_operator_automatic_install_plan_approval | default('true') }}"
    install_operator_starting_csv: "{{ nfd_operator_starting_csv | default('') }}"
    # A catalog source is only set up when a catalog snapshot is requested.
    install_operator_catalogsource_setup: "{{ nfd_operator_use_catalog_snapshot | default(false) }}"
    install_operator_catalogsource_name: "{{ nfd_operator_catalogsource_name }}"
    install_operator_catalogsource_namespace: "{{ nfd_operator_namespace }}"
    install_operator_catalogsource_image: "{{ nfd_operator_catalog_snapshot_image | default('') }}"
    install_operator_catalogsource_image_tag: "{{ nfd_operator_catalog_snapshot_image_tag | default('') }}"
    install_operator_manage_namespaces:
      - "{{ nfd_operator_namespace }}"

- name: Create NodeFeatureDiscovery Custom Resource
kubernetes.core.k8s:
state: present
definition: "{{ lookup('template', 'nfd/nodefeature_discovery_cr.yaml.j2') | from_yaml }}"
register: result
until: result is not failed
retries: 25
delay: 5
retries: 30
delay: 20
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
---
# Habana Gaudi ClusterPolicy custom resource (habanalabs.habana.ai/v1).
# Every image tag below is rendered from the habana_gaudi_image_tag variable,
# so all components use the same tag.
# NOTE(review): nesting was reconstructed from a flattened listing; verify it
# against the ClusterPolicy CRD shipped with the installed operator.
kind: ClusterPolicy
apiVersion: habanalabs.habana.ai/v1
metadata:
  name: habana-ai
spec:
  bmc_monitoring:
    image:
      repository: vault.habana.ai/habana-bmc-exporter/bmc-exporter
      tag: "{{ habana_gaudi_image_tag }}"
    resources:
      limits:
        cpu: 250m
        memory: 250Mi
      requests:
        cpu: 150m
        memory: 100Mi
  device_plugin:
    image:
      repository: vault.habana.ai/docker-k8s-device-plugin/docker-k8s-device-plugin
      tag: "{{ habana_gaudi_image_tag }}"
    resources:
      limits:
        cpu: 20m
        memory: 64Mi
      requests:
        cpu: 10m
        memory: 32Mi
  driver:
    driver_loader:
      # One driver-installer image per supported node operating system.
      images:
        rhel_8.6:
          repository: vault.habana.ai/habana-ai-operator/driver/rhel8.6/driver-installer
          tag: "{{ habana_gaudi_image_tag }}"
        rhel_9.2:
          repository: vault.habana.ai/habana-ai-operator/driver/rhel9.2/driver-installer
          tag: "{{ habana_gaudi_image_tag }}"
        rhel_9.4:
          repository: vault.habana.ai/habana-ai-operator/driver/rhel9.4/driver-installer
          tag: "{{ habana_gaudi_image_tag }}"
        tencentos_3.1:
          repository: vault.habana.ai/habana-ai-operator/driver/tencentos3.1/driver-installer
          tag: "{{ habana_gaudi_image_tag }}"
        ubuntu_22.04:
          repository: vault.habana.ai/habana-ai-operator/driver/ubuntu22.04/driver-installer
          tag: "{{ habana_gaudi_image_tag }}"
      mlnx_ofed_repo_path: artifactory/gaudi-installer/deps
      mlnx_ofed_version: mlnx-ofed-5.8-2.0.3.0-rhel8.4-x86_64.tar.gz
      repo_path: artifactory/gaudi-installer/repos
      repo_server: vault.habana.ai
      resources:
        limits:
          cpu: 4000m
          memory: 16Gi
        requests:
          cpu: 2000m
          memory: 8Gi
    driver_runner:
      image:
        repository: vault.habana.ai/habana-ai-operator/driver/ubuntu22.04/driver-installer
        tag: "{{ habana_gaudi_image_tag }}"
      resources:
        limits:
          cpu: 20m
          memory: 64Mi
        requests:
          cpu: 10m
          memory: 32Mi
  feature_discovery:
    nfd_plugin: false
    runner:
      image:
        repository: vault.habana.ai/habana-ai-operator/habanalabs-feature-discovery
        tag: "{{ habana_gaudi_image_tag }}"
      resources:
        limits:
          cpu: 20m
          memory: 64Mi
        requests:
          cpu: 10m
          memory: 32Mi
  image_registry: vault.habana.ai
  metric_exporter:
    interval: 20
    port: 41611
    runner:
      image:
        repository: vault.habana.ai/gaudi-metric-exporter/metric-exporter
        tag: "{{ habana_gaudi_image_tag }}"
      resources:
        limits:
          cpu: 150m
          memory: 120Mi
        requests:
          cpu: 100m
          memory: 100Mi
  runtime:
    configuration:
      container_engine: crio
    runner:
      image:
        repository: vault.habana.ai/habana-ai-operator/habana-container-runtime
        tag: "{{ habana_gaudi_image_tag }}"
      resources:
        limits:
          cpu: 20m
          memory: 64Mi
        requests:
          cpu: 10m
          memory: 32Mi
Original file line number Diff line number Diff line change
@@ -1,37 +1,124 @@
---
apiVersion: nfd.openshift.io/v1
kind: NodeFeatureDiscovery
apiVersion: nfd.openshift.io/v1
metadata:
name: nfd-instance
namespace: {{ nfd_operator_namespace }}
namespace: "{{ nfd_operator_namespace }}"
spec:
customConfig:
configData: |
# - name: "more.kernel.features"
# matchOn:
# - loadedKMod: ["example_kmod3"]
# - name: "more.features.by.nodename"
# value: customValue
# matchOn:
# - nodename: ["special-.*-node-.*"]
enableTaints: false
extraLabelNs:
- habana.ai
instance: ''
operand:
image: >-
registry.redhat.io/openshift4/ose-node-feature-discovery:v4.12
image: '{{ nfd_operator_operand_image }}'
imagePullPolicy: IfNotPresent
servicePort: 12000
topologyupdater: false
prunerOnDelete: false
topologyUpdater: false
workerConfig:
configData: |
core:
# labelWhiteList:
# noPublish: false
sleepInterval: 60s
# sources: [all]
# klog:
# addDirHeader: false
# alsologtostderr: false
# logBacktraceAt:
# logtostderr: true
# skipHeaders: false
# stderrthreshold: 2
# v: 0
# vmodule:
## NOTE: the following options are not dynamically run-time
## configurable and require a nfd-worker restart to take effect
## after being changed
# logDir:
# logFile:
# logFileMaxSize: 1800
# skipLogHeaders: false
sources:
# cpu:
# cpuid:
## NOTE: whitelist has priority over blacklist
# attributeBlacklist:
# - "BMI1"
# - "BMI2"
# - "CLMUL"
# - "CMOV"
# - "CX16"
# - "ERMS"
# - "F16C"
# - "HTT"
# - "LZCNT"
# - "MMX"
# - "MMXEXT"
# - "NX"
# - "POPCNT"
# - "RDRAND"
# - "RDSEED"
# - "RDTSCP"
# - "SGX"
# - "SSE"
# - "SSE2"
# - "SSE3"
# - "SSE4.1"
# - "SSE4.2"
# - "SSSE3"
# attributeWhitelist:
# kernel:
# kconfigFile: "/path/to/kconfig"
# configOpts:
# - "NO_HZ"
# - "X86"
# - "DMI"
pci:
deviceClassWhitelist:
- "0200"
- "03"
- "12"
deviceLabelFields:
# - "class"
- "vendor"

# - "device"
# - "subsystem_vendor"
# - "subsystem_device"
# usb:
# deviceClassWhitelist:
# - "0e"
# - "ef"
# - "fe"
# - "ff"
# deviceLabelFields:
# - "class"
# - "vendor"
# - "device"
# custom:
# - name: "my.kernel.feature"
# matchOn:
# - loadedKMod: ["example_kmod1", "example_kmod2"]
# - name: "my.pci.feature"
# matchOn:
# - pciId:
# class: ["0200"]
# vendor: ["15b3"]
# device: ["1014", "1017"]
# - pciId :
# vendor: ["8086"]
# device: ["1000", "1100"]
# - name: "my.usb.feature"
# matchOn:
# - usbId:
# class: ["ff"]
# vendor: ["03e7"]
# device: ["2485"]
# - usbId:
# class: ["fe"]
# vendor: ["1a6e"]
# device: ["089a"]
# - name: "my.combined.feature"
# matchOn:
# - pciId:
# vendor: ["15b3"]
# device: ["1014", "1017"]
# loadedKMod : ["vendor_kmod1", "vendor_kmod2"]
Loading