-
Notifications
You must be signed in to change notification settings - Fork 500
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Habana v2 - updated habana role for OCP 4.17 and RHOAI 2.16 and above (#9018)
* Updated NFD as per the changes required to deploy NFD * Updated for NFD * Updated for NFD * Updated for starting csv to nil * Updated with pauses * Updated NFD to operator install role * Updated habana deviceconfig to clusterpolicy as per the new requirements --------- Co-authored-by: Ritesh <[email protected]>
- Loading branch information
1 parent
fb22b83
commit ac4d637
Showing
6 changed files
with
254 additions
and
51 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
47 changes: 20 additions & 27 deletions
47
ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,35 +1,28 @@ | ||
--- | ||
# Ensures the Namespace named by nfd_operator_namespace exists (state: present) | ||
# via the kubernetes.core.k8s module. | ||
# NOTE(review): `delay` is set without `retries`/`until`, unlike the other k8s | ||
# tasks in this hunk - presumably it has no effect on its own; confirm. | ||
- name: "Ensure nfd namespace exists" | ||
kubernetes.core.k8s: | ||
state: present | ||
api_version: v1 | ||
kind: Namespace | ||
name: "{{ nfd_operator_namespace }}" | ||
delay: 5 | ||
|
||
# Applies the object rendered from nfd/nodefeature_discovery_operatorgroup.yaml.j2 | ||
# (template output parsed with from_yaml) with state: present. | ||
# The result is registered as operatorgroup_result and the task is retried | ||
# (retries: 25, delay: 5) until that result is not failed. | ||
- name: Create NodeFeatureDiscovery operatorgroup | ||
kubernetes.core.k8s: | ||
state: present | ||
definition: "{{ lookup('template', 'nfd/nodefeature_discovery_operatorgroup.yaml.j2') | from_yaml }}" | ||
register: operatorgroup_result | ||
until: operatorgroup_result is not failed | ||
retries: 25 | ||
delay: 5 | ||
|
||
# Applies the object rendered from nfd/nodefeature_discovery_sub.yaml.j2 | ||
# (template output parsed with from_yaml) with state: present. | ||
# The result is registered as subscription_result and the task is retried | ||
# (retries: 25, delay: 5) until that result is not failed. | ||
- name: Create NodeFeaturEDiscovery subscription | ||
kubernetes.core.k8s: | ||
state: present | ||
definition: "{{ lookup('template', 'nfd/nodefeature_discovery_sub.yaml.j2') | from_yaml }}" | ||
register: subscription_result | ||
until: subscription_result is not failed | ||
retries: 25 | ||
delay: 5 | ||
# Installs the `nfd` operator by including the install_operator role with | ||
# install_operator_action: install, from the redhat-operators catalog, into | ||
# the namespace given by nfd_operator_namespace and the channel given by | ||
# nfd_operator_channel. | ||
# Optional inputs and the fallbacks written below: | ||
#   nfd_operator_automatic_install_plan_approval -> 'true' | ||
#   nfd_operator_starting_csv                    -> '' | ||
#   nfd_operator_use_catalog_snapshot            -> false | ||
#   nfd_operator_catalog_snapshot_image          -> '' | ||
#   nfd_operator_catalog_snapshot_image_tag      -> '' | ||
# NOTE(review): the install-plan-approval fallback is the quoted string 'true' | ||
# while the catalogsource_setup fallback is the bare boolean false - presumably | ||
# the role normalises both with a bool filter; confirm, or align the two. | ||
# NOTE(review): nfd_operator_catalogsource_name has no fallback here, so it | ||
# presumably must be defined in the role defaults; confirm. | ||
- name: Install NFD operator | ||
ansible.builtin.include_role: | ||
name: install_operator | ||
vars: | ||
install_operator_action: install | ||
install_operator_name: nfd | ||
install_operator_namespace: "{{ nfd_operator_namespace }}" | ||
install_operator_channel: "{{ nfd_operator_channel }}" | ||
install_operator_catalog: redhat-operators | ||
install_operator_automatic_install_plan_approval: "{{ nfd_operator_automatic_install_plan_approval | default('true') }}" | ||
install_operator_starting_csv: "{{ nfd_operator_starting_csv | default('') }}" | ||
install_operator_catalogsource_setup: "{{ nfd_operator_use_catalog_snapshot | default(false) }}" | ||
install_operator_catalogsource_name: "{{ nfd_operator_catalogsource_name }}" | ||
install_operator_catalogsource_namespace: "{{ nfd_operator_namespace }}" | ||
install_operator_catalogsource_image: "{{ nfd_operator_catalog_snapshot_image | default('') }}" | ||
install_operator_catalogsource_image_tag: "{{ nfd_operator_catalog_snapshot_image_tag | default('') }}" | ||
install_operator_manage_namespaces: | ||
- "{{ nfd_operator_namespace }}" | ||
|
||
# Applies the object rendered from nfd/nodefeature_discovery_cr.yaml.j2 | ||
# (template output parsed with from_yaml) with state: present, registering the | ||
# outcome as `result` and retrying until it is not failed. | ||
# NOTE(review): two retries/delay pairs appear below (25/5, then 30/20). This | ||
# view interleaves old and new diff rows without +/- markers, so presumably | ||
# 25/5 is the removed pair and 30/20 the added pair; confirm against the raw | ||
# diff, since duplicate keys in one task would not be valid. | ||
- name: Create NodeFeatureDiscovery Custom Resource | ||
kubernetes.core.k8s: | ||
state: present | ||
definition: "{{ lookup('template', 'nfd/nodefeature_discovery_cr.yaml.j2') | from_yaml }}" | ||
register: result | ||
until: result is not failed | ||
retries: 25 | ||
delay: 5 | ||
retries: 30 | ||
delay: 20 |
109 changes: 109 additions & 0 deletions
109
...4_workload_habana_gaudi_gpu_setup/templates/habana-gaudi/habana_gpu_clusterpolicy.yaml.j2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
# ClusterPolicy `habana-ai` (apiVersion habanalabs.habana.ai/v1). | ||
# Every image `tag` below is rendered from the single template variable | ||
# habana_gaudi_image_tag, and every image `repository` is under vault.habana.ai. | ||
# Spec keys, in order of appearance: bmc_monitoring, device_plugin, driver | ||
# (driver_loader with per-OS images, then driver_runner), feature_discovery | ||
# (nfd_plugin: false), image_registry, metric_exporter (interval 20, | ||
# port 41611), runtime (container_engine: crio). Each component carries its | ||
# own resources limits/requests. | ||
# NOTE(review): nesting is not visible in this flattened view; the grouping | ||
# above is inferred from key order - confirm against the raw template. | ||
# NOTE(review): mlnx_ofed_version names an rhel8.4 tarball while the driver | ||
# images listed are for rhel 8.6 / 9.2 / 9.4, tencentos 3.1 and ubuntu 22.04; | ||
# confirm this is intended. | ||
kind: ClusterPolicy | ||
apiVersion: habanalabs.habana.ai/v1 | ||
metadata: | ||
name: habana-ai | ||
spec: | ||
bmc_monitoring: | ||
image: | ||
repository: vault.habana.ai/habana-bmc-exporter/bmc-exporter | ||
tag: "{{ habana_gaudi_image_tag}}" | ||
resources: | ||
limits: | ||
cpu: 250m | ||
memory: 250Mi | ||
requests: | ||
cpu: 150m | ||
memory: 100Mi | ||
device_plugin: | ||
image: | ||
repository: vault.habana.ai/docker-k8s-device-plugin/docker-k8s-device-plugin | ||
tag: "{{ habana_gaudi_image_tag}}" | ||
resources: | ||
limits: | ||
cpu: 20m | ||
memory: 64Mi | ||
requests: | ||
cpu: 10m | ||
memory: 32Mi | ||
driver: | ||
driver_loader: | ||
images: | ||
rhel_8.6: | ||
repository: vault.habana.ai/habana-ai-operator/driver/rhel8.6/driver-installer | ||
tag: "{{ habana_gaudi_image_tag}}" | ||
rhel_9.2: | ||
repository: vault.habana.ai/habana-ai-operator/driver/rhel9.2/driver-installer | ||
tag: "{{ habana_gaudi_image_tag}}" | ||
rhel_9.4: | ||
repository: vault.habana.ai/habana-ai-operator/driver/rhel9.4/driver-installer | ||
tag: "{{ habana_gaudi_image_tag}}" | ||
tencentos_3.1: | ||
repository: vault.habana.ai/habana-ai-operator/driver/tencentos3.1/driver-installer | ||
tag: "{{ habana_gaudi_image_tag}}" | ||
ubuntu_22.04: | ||
repository: vault.habana.ai/habana-ai-operator/driver/ubuntu22.04/driver-installer | ||
tag: "{{ habana_gaudi_image_tag}}" | ||
mlnx_ofed_repo_path: artifactory/gaudi-installer/deps | ||
mlnx_ofed_version: mlnx-ofed-5.8-2.0.3.0-rhel8.4-x86_64.tar.gz | ||
repo_path: artifactory/gaudi-installer/repos | ||
repo_server: vault.habana.ai | ||
resources: | ||
limits: | ||
cpu: 4000m | ||
memory: 16Gi | ||
requests: | ||
cpu: 2000m | ||
memory: 8Gi | ||
driver_runner: | ||
image: | ||
repository: vault.habana.ai/habana-ai-operator/driver/ubuntu22.04/driver-installer | ||
tag: "{{ habana_gaudi_image_tag}}" | ||
resources: | ||
limits: | ||
cpu: 20m | ||
memory: 64Mi | ||
requests: | ||
cpu: 10m | ||
memory: 32Mi | ||
feature_discovery: | ||
nfd_plugin: false | ||
runner: | ||
image: | ||
repository: vault.habana.ai/habana-ai-operator/habanalabs-feature-discovery | ||
tag: "{{ habana_gaudi_image_tag}}" | ||
resources: | ||
limits: | ||
cpu: 20m | ||
memory: 64Mi | ||
requests: | ||
cpu: 10m | ||
memory: 32Mi | ||
image_registry: vault.habana.ai | ||
metric_exporter: | ||
interval: 20 | ||
port: 41611 | ||
runner: | ||
image: | ||
repository: vault.habana.ai/gaudi-metric-exporter/metric-exporter | ||
tag: "{{ habana_gaudi_image_tag}}" | ||
resources: | ||
limits: | ||
cpu: 150m | ||
memory: 120Mi | ||
requests: | ||
cpu: 100m | ||
memory: 100Mi | ||
runtime: | ||
configuration: | ||
container_engine: crio | ||
runner: | ||
image: | ||
repository: vault.habana.ai/habana-ai-operator/habana-container-runtime | ||
tag: "{{ habana_gaudi_image_tag}}" | ||
resources: | ||
limits: | ||
cpu: 20m | ||
memory: 64Mi | ||
requests: | ||
cpu: 10m | ||
memory: 32Mi |
119 changes: 103 additions & 16 deletions
119
...loads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,37 +1,124 @@ | ||
# NodeFeatureDiscovery custom resource `nfd-instance` (nfd.openshift.io/v1), | ||
# placed in the namespace given by the nfd_operator_namespace variable. | ||
# Active (uncommented) settings visible below: extraLabelNs contains habana.ai, | ||
# enableTaints is false, the operand image comes from the | ||
# nfd_operator_operand_image variable, and the worker config sets | ||
# sleepInterval: 60s plus a pci source with deviceClassWhitelist | ||
# "0200" / "03" / "12" and deviceLabelFields "vendor". | ||
# The remaining `#` lines are the commented-out sample configuration carried | ||
# inside the configData block scalars. | ||
# NOTE(review): this view interleaves old and new diff rows without +/- | ||
# markers, which is presumably why apiVersion, namespace, image and | ||
# topologyupdater/topologyUpdater each appear twice; confirm against the raw | ||
# template before relying on any one of the duplicated rows. | ||
--- | ||
apiVersion: nfd.openshift.io/v1 | ||
kind: NodeFeatureDiscovery | ||
apiVersion: nfd.openshift.io/v1 | ||
metadata: | ||
name: nfd-instance | ||
namespace: {{ nfd_operator_namespace }} | ||
namespace: "{{ nfd_operator_namespace }}" | ||
spec: | ||
customConfig: | ||
configData: | | ||
# - name: "more.kernel.features" | ||
# matchOn: | ||
# - loadedKMod: ["example_kmod3"] | ||
# - name: "more.features.by.nodename" | ||
# value: customValue | ||
# matchOn: | ||
# - nodename: ["special-.*-node-.*"] | ||
enableTaints: false | ||
extraLabelNs: | ||
- habana.ai | ||
instance: '' | ||
operand: | ||
image: >- | ||
registry.redhat.io/openshift4/ose-node-feature-discovery:v4.12 | ||
image: '{{ nfd_operator_operand_image }}' | ||
imagePullPolicy: IfNotPresent | ||
servicePort: 12000 | ||
topologyupdater: false | ||
prunerOnDelete: false | ||
topologyUpdater: false | ||
workerConfig: | ||
configData: | | ||
core: | ||
# labelWhiteList: | ||
# noPublish: false | ||
sleepInterval: 60s | ||
# sources: [all] | ||
# klog: | ||
# addDirHeader: false | ||
# alsologtostderr: false | ||
# logBacktraceAt: | ||
# logtostderr: true | ||
# skipHeaders: false | ||
# stderrthreshold: 2 | ||
# v: 0 | ||
# vmodule: | ||
## NOTE: the following options are not dynamically run-time | ||
## configurable and require a nfd-worker restart to take effect | ||
## after being changed | ||
# logDir: | ||
# logFile: | ||
# logFileMaxSize: 1800 | ||
# skipLogHeaders: false | ||
sources: | ||
# cpu: | ||
# cpuid: | ||
## NOTE: whitelist has priority over blacklist | ||
# attributeBlacklist: | ||
# - "BMI1" | ||
# - "BMI2" | ||
# - "CLMUL" | ||
# - "CMOV" | ||
# - "CX16" | ||
# - "ERMS" | ||
# - "F16C" | ||
# - "HTT" | ||
# - "LZCNT" | ||
# - "MMX" | ||
# - "MMXEXT" | ||
# - "NX" | ||
# - "POPCNT" | ||
# - "RDRAND" | ||
# - "RDSEED" | ||
# - "RDTSCP" | ||
# - "SGX" | ||
# - "SSE" | ||
# - "SSE2" | ||
# - "SSE3" | ||
# - "SSE4.1" | ||
# - "SSE4.2" | ||
# - "SSSE3" | ||
# attributeWhitelist: | ||
# kernel: | ||
# kconfigFile: "/path/to/kconfig" | ||
# configOpts: | ||
# - "NO_HZ" | ||
# - "X86" | ||
# - "DMI" | ||
pci: | ||
deviceClassWhitelist: | ||
- "0200" | ||
- "03" | ||
- "12" | ||
deviceLabelFields: | ||
# - "class" | ||
- "vendor" | ||
|
||
# - "device" | ||
# - "subsystem_vendor" | ||
# - "subsystem_device" | ||
# usb: | ||
# deviceClassWhitelist: | ||
# - "0e" | ||
# - "ef" | ||
# - "fe" | ||
# - "ff" | ||
# deviceLabelFields: | ||
# - "class" | ||
# - "vendor" | ||
# - "device" | ||
# custom: | ||
# - name: "my.kernel.feature" | ||
# matchOn: | ||
# - loadedKMod: ["example_kmod1", "example_kmod2"] | ||
# - name: "my.pci.feature" | ||
# matchOn: | ||
# - pciId: | ||
# class: ["0200"] | ||
# vendor: ["15b3"] | ||
# device: ["1014", "1017"] | ||
# - pciId : | ||
# vendor: ["8086"] | ||
# device: ["1000", "1100"] | ||
# - name: "my.usb.feature" | ||
# matchOn: | ||
# - usbId: | ||
# class: ["ff"] | ||
# vendor: ["03e7"] | ||
# device: ["2485"] | ||
# - usbId: | ||
# class: ["fe"] | ||
# vendor: ["1a6e"] | ||
# device: ["089a"] | ||
# - name: "my.combined.feature" | ||
# matchOn: | ||
# - pciId: | ||
# vendor: ["15b3"] | ||
# device: ["1014", "1017"] | ||
# loadedKMod : ["vendor_kmod1", "vendor_kmod2"] |