From 1ef2c346d0333d6f1fe6c203b350166b4f9c02c1 Mon Sep 17 00:00:00 2001 From: Ritesh Date: Thu, 23 Jan 2025 12:31:04 +0530 Subject: [PATCH 1/7] Updated NFD as per the changes required to deploy NFD --- .../ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml | 1 + .../templates/nfd/nodefeature_discovery_cr.yaml.j2 | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml index eded3852dea..7b314102251 100644 --- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml +++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml @@ -27,6 +27,7 @@ nfd_operator_use_catalog_snapshot: false nfd_operator_catalogsource_name: "" nfd_operator_catalog_snapshot_image: "" nfd_operator_catalog_snapshot_image_tag: "" +nfd_operator_operand_image: "registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9:v4.17" # ------------------------------------------------ # Habana Gaudi Operator diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 index 2f356cc045b..9f29bc92dde 100644 --- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 +++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 @@ -19,7 +19,7 @@ spec: instance: '' operand: image: >- - registry.redhat.io/openshift4/ose-node-feature-discovery:v4.12 + {{ nfd_operator_operand_image}} servicePort: 12000 topologyupdater: false workerConfig: From cb79624dd72b644cb4e29e98a45079cae0e55718 Mon Sep 17 00:00:00 2001 From: Ritesh Date: Thu, 23 Jan 2025 14:35:27 +0530 Subject: [PATCH 2/7] Updated 
for NFD --- .../tasks/nfd_operator.yml | 2 +- .../nfd/nodefeature_discovery_cr.yaml.j2 | 17 +++++------------ 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml index 52765a1501a..38d6e1a1b6b 100644 --- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml +++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml @@ -16,7 +16,7 @@ retries: 25 delay: 5 -- name: Create NodeFeaturEDiscovery subscription +- name: Create NodeFeatureDiscovery subscription kubernetes.core.k8s: state: present definition: "{{ lookup('template', 'nfd/nodefeature_discovery_sub.yaml.j2') | from_yaml }}" diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 index 9f29bc92dde..dc654f3bcbc 100644 --- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 +++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 @@ -5,23 +5,16 @@ metadata: name: nfd-instance namespace: {{ nfd_operator_namespace }} spec: - customConfig: - configData: | - # - name: "more.kernel.features" - # matchOn: - # - loadedKMod: ["example_kmod3"] - # - name: "more.features.by.nodename" - # value: customValue - # matchOn: - # - nodename: ["special-.*-node-.*"] + enableTaints: false extraLabelNs: - habana.ai instance: '' operand: - image: >- - {{ nfd_operator_operand_image}} + image: 'registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9:v4.17' + imagePullPolicy: IfNotPresent servicePort: 12000 - topologyupdater: false + prunerOnDelete: false + topologyUpdater: 
false workerConfig: configData: | core: From 5e9011537f4c74fea4f82da51ba319258e4201a0 Mon Sep 17 00:00:00 2001 From: Ritesh Date: Thu, 23 Jan 2025 14:36:09 +0530 Subject: [PATCH 3/7] Updated for NFD --- .../templates/nfd/nodefeature_discovery_cr.yaml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 index dc654f3bcbc..1a360b1faf5 100644 --- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 +++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 @@ -10,7 +10,7 @@ spec: - habana.ai instance: '' operand: - image: 'registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9:v4.17' + image: '{{ nfd_operator_operand_image }}' imagePullPolicy: IfNotPresent servicePort: 12000 prunerOnDelete: false From 83a46a805c5ff5ad106cbf0901adc6b00bf510aa Mon Sep 17 00:00:00 2001 From: Ritesh Date: Fri, 24 Jan 2025 09:35:04 +0530 Subject: [PATCH 4/7] Updated for starting csv to nil --- .../ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml | 2 +- .../templates/nfd/nodefeature_discovery_cr.yaml.j2 | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml index 7b314102251..38604633d66 100644 --- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml +++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml @@ -21,7 +21,7 @@ kmm_ignition_version: "3.2.0" nfd_operator_namespace: "openshift-nfd" nfd_operator_channel: "stable" nfd_operator_automatic_install_plan_approval: true 
-nfd_operator_starting_csv: "nfd.4.12.0-202307182142" +nfd_operator_starting_csv: "" nfd_operator_wait_for_deploy: true nfd_operator_use_catalog_snapshot: false nfd_operator_catalogsource_name: "" diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 index 1a360b1faf5..6e5fef47ad8 100644 --- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 +++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 @@ -26,5 +26,4 @@ spec: - "03" - "12" deviceLabelFields: - - "vendor" - + - "vendor" \ No newline at end of file From 534ae9f135cd6593a624a4347491d3b352fbe3e9 Mon Sep 17 00:00:00 2001 From: Ritesh Date: Fri, 24 Jan 2025 10:20:09 +0530 Subject: [PATCH 5/7] Updated with pauses --- .../tasks/habana_gaudi_operator.yml | 10 +++++++++- .../tasks/kmm_operator.yml | 6 +++++- .../tasks/nfd_operator.yml | 14 ++++++++++---- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/habana_gaudi_operator.yml b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/habana_gaudi_operator.yml index 9e5a73a3b15..877b88c5535 100644 --- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/habana_gaudi_operator.yml +++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/habana_gaudi_operator.yml @@ -10,6 +10,10 @@ retries: 10 delay: 5 +- name: 60 second pause for Habana Gaudi GPU namespace check + pause: + seconds: 60 + - name: Create Habana Gaudi GPU operatorgroup kubernetes.core.k8s: state: present @@ -19,6 +23,10 @@ retries: 25 delay: 5 +- name: 60 second pause for Habana Gaudi GPU operator setup + pause: + seconds: 60 + - name: Create Habana Gaudi GPU 
subscription kubernetes.core.k8s: state: present @@ -28,7 +36,7 @@ retries: 50 delay: 5 -- name: 120 second pause for Habana Gaudi GPU operator setup +- name: 120 second pause for Habana Gaudi GPU subscription pause: seconds: 120 diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/kmm_operator.yml b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/kmm_operator.yml index 5d28b01bbad..9ff05436d9a 100644 --- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/kmm_operator.yml +++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/kmm_operator.yml @@ -16,6 +16,10 @@ retries: 25 delay: 5 +- name: 60 second pause for KMM operatorgroup + pause: + seconds: 60 + - name: Create KMM subscription kubernetes.core.k8s: state: present @@ -25,7 +29,7 @@ retries: 25 delay: 5 -- name: 60 second pause for Habana Gaudi GPU operator setup +- name: 60 second pause for KMM subscription pause: seconds: 60 diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml index 38d6e1a1b6b..811e99f3ed3 100644 --- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml +++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml @@ -12,24 +12,30 @@ state: present definition: "{{ lookup('template', 'nfd/nodefeature_discovery_operatorgroup.yaml.j2') | from_yaml }}" register: operatorgroup_result - until: operatorgroup_result is not failed retries: 25 delay: 5 +- name: 60 second pause for Create NodeFeatureDiscovery operatorgroup + pause: + seconds: 60 + - name: Create NodeFeatureDiscovery subscription kubernetes.core.k8s: state: present definition: "{{ lookup('template', 'nfd/nodefeature_discovery_sub.yaml.j2') | from_yaml }}" register: subscription_result until: subscription_result is not failed - retries: 25 
+ retries: 100 delay: 5 +- name: 60 second pause for Create NodeFeatureDiscovery subscription + pause: + seconds: 60 + - name: Create NodeFeatureDiscovery Custom Resource kubernetes.core.k8s: state: present definition: "{{ lookup('template', 'nfd/nodefeature_discovery_cr.yaml.j2') | from_yaml }}" register: result - until: result is not failed - retries: 25 + retries: 50 delay: 5 From 392a12c273e4d467ad48aefa7dbb1fda84c44b4f Mon Sep 17 00:00:00 2001 From: Ritesh Date: Fri, 24 Jan 2025 15:47:50 +0530 Subject: [PATCH 6/7] Updated NFD to operator install role --- .../tasks/nfd_operator.yml | 55 ++++------ .../nfd/nodefeature_discovery_cr.yaml.j2 | 103 +++++++++++++++++- 2 files changed, 120 insertions(+), 38 deletions(-) diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml index 811e99f3ed3..e62721580ec 100644 --- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml +++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml @@ -1,41 +1,28 @@ --- -- name: "Ensure nfd namespace exists" - kubernetes.core.k8s: - state: present - api_version: v1 - kind: Namespace - name: "{{ nfd_operator_namespace }}" - delay: 5 - -- name: Create NodeFeatureDiscovery operatorgroup - kubernetes.core.k8s: - state: present - definition: "{{ lookup('template', 'nfd/nodefeature_discovery_operatorgroup.yaml.j2') | from_yaml }}" - register: operatorgroup_result - retries: 25 - delay: 5 - -- name: 60 second pause for Create NodeFeatureDiscovery operatorgroup - pause: - seconds: 60 - -- name: Create NodeFeatureDiscovery subscription - kubernetes.core.k8s: - state: present - definition: "{{ lookup('template', 'nfd/nodefeature_discovery_sub.yaml.j2') | from_yaml }}" - register: subscription_result - until: subscription_result is not failed - retries: 100 - delay: 5 - -- name: 60 second pause for 
Create NodeFeatureDiscovery subscription - pause: - seconds: 60 +- name: Install NFD operator + ansible.builtin.include_role: + name: install_operator + vars: + install_operator_action: install + install_operator_name: nfd + install_operator_namespace: "{{ nfd_operator_namespace }}" + install_operator_channel: "{{ nfd_operator_channel }}" + install_operator_catalog: redhat-operators + install_operator_automatic_install_plan_approval: "{{ nfd_operator_automatic_install_plan_approval | default('true') }}" + install_operator_starting_csv: "{{ nfd_operator_starting_csv | default('') }}" + install_operator_catalogsource_setup: "{{ nfd_operator_use_catalog_snapshot | default(false) }}" + install_operator_catalogsource_name: "{{ nfd_operator_catalogsource_name }}" + install_operator_catalogsource_namespace: "{{ nfd_operator_namespace }}" + install_operator_catalogsource_image: "{{ nfd_operator_catalog_snapshot_image | default('') }}" + install_operator_catalogsource_image_tag: "{{ nfd_operator_catalog_snapshot_image_tag | default('') }}" + install_operator_manage_namespaces: + - "{{ nfd_operator_namespace }}" - name: Create NodeFeatureDiscovery Custom Resource kubernetes.core.k8s: state: present definition: "{{ lookup('template', 'nfd/nodefeature_discovery_cr.yaml.j2') | from_yaml }}" register: result - retries: 50 - delay: 5 + until: result is not failed + retries: 30 + delay: 20 diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 index 6e5fef47ad8..0e0c581d602 100644 --- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 +++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 @@ -1,9 +1,8 @@ ---- -apiVersion: nfd.openshift.io/v1 kind: NodeFeatureDiscovery 
+apiVersion: nfd.openshift.io/v1 metadata: name: nfd-instance - namespace: {{ nfd_operator_namespace }} + namespace: "{{ nfd_operator_namespace }}" spec: enableTaints: false extraLabelNs: @@ -18,12 +17,108 @@ spec: workerConfig: configData: | core: + # labelWhiteList: + # noPublish: false sleepInterval: 60s + # sources: [all] + # klog: + # addDirHeader: false + # alsologtostderr: false + # logBacktraceAt: + # logtostderr: true + # skipHeaders: false + # stderrthreshold: 2 + # v: 0 + # vmodule: + ## NOTE: the following options are not dynamically run-time + ## configurable and require a nfd-worker restart to take effect + ## after being changed + # logDir: + # logFile: + # logFileMaxSize: 1800 + # skipLogHeaders: false sources: + # cpu: + # cpuid: + ## NOTE: whitelist has priority over blacklist + # attributeBlacklist: + # - "BMI1" + # - "BMI2" + # - "CLMUL" + # - "CMOV" + # - "CX16" + # - "ERMS" + # - "F16C" + # - "HTT" + # - "LZCNT" + # - "MMX" + # - "MMXEXT" + # - "NX" + # - "POPCNT" + # - "RDRAND" + # - "RDSEED" + # - "RDTSCP" + # - "SGX" + # - "SSE" + # - "SSE2" + # - "SSE3" + # - "SSE4.1" + # - "SSE4.2" + # - "SSSE3" + # attributeWhitelist: + # kernel: + # kconfigFile: "/path/to/kconfig" + # configOpts: + # - "NO_HZ" + # - "X86" + # - "DMI" pci: deviceClassWhitelist: - "0200" - "03" - "12" deviceLabelFields: - - "vendor" \ No newline at end of file + # - "class" + - "vendor" + # - "device" + # - "subsystem_vendor" + # - "subsystem_device" + # usb: + # deviceClassWhitelist: + # - "0e" + # - "ef" + # - "fe" + # - "ff" + # deviceLabelFields: + # - "class" + # - "vendor" + # - "device" + # custom: + # - name: "my.kernel.feature" + # matchOn: + # - loadedKMod: ["example_kmod1", "example_kmod2"] + # - name: "my.pci.feature" + # matchOn: + # - pciId: + # class: ["0200"] + # vendor: ["15b3"] + # device: ["1014", "1017"] + # - pciId : + # vendor: ["8086"] + # device: ["1000", "1100"] + # - name: "my.usb.feature" + # matchOn: + # - usbId: + # class: ["ff"] + # vendor: 
["03e7"] + # device: ["2485"] + # - usbId: + # class: ["fe"] + # vendor: ["1a6e"] + # device: ["089a"] + # - name: "my.combined.feature" + # matchOn: + # - pciId: + # vendor: ["15b3"] + # device: ["1014", "1017"] + # loadedKMod : ["vendor_kmod1", "vendor_kmod2"] \ No newline at end of file From bdb948eeea3b4014b72f6f99b4a09fe251997387 Mon Sep 17 00:00:00 2001 From: Ritesh Date: Fri, 24 Jan 2025 16:12:55 +0530 Subject: [PATCH 7/7] Updated habana deviceconfig to clusterpolicy as per the new requirements --- .../defaults/main.yml | 1 + .../tasks/habana_gaudi_operator.yml | 10 +- .../habana_gpu_clusterpolicy.yaml.j2 | 109 ++++++++++++++++++ 3 files changed, 115 insertions(+), 5 deletions(-) create mode 100644 ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/habana-gaudi/habana_gpu_clusterpolicy.yaml.j2 diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml index 38604633d66..c1e658df40e 100644 --- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml +++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml @@ -41,6 +41,7 @@ habana_gaudi_operator_use_catalog_snapshot: false habana_gaudi_operator_catalogsource_name: "" habana_gaudi_operator_catalog_snapshot_image: "" habana_gaudi_operator_catalog_snapshot_image_tag: "" +habana_gaudi_image_tag: "1.19.1-26" habana_gaudi_image_version: "1.10.0" habana_gaudi_deviceplugin_version: "1.10.0" habana_gaudi_driver_version: "1.10.0-494" diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/habana_gaudi_operator.yml b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/habana_gaudi_operator.yml index 877b88c5535..a30ba2dfab6 100644 --- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/habana_gaudi_operator.yml +++ 
b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/habana_gaudi_operator.yml @@ -40,11 +40,11 @@ pause: seconds: 120 -- name: Setup Habana Gaudi Device Config +- name: Setup Habana Gaudi Cluster policy setup kubernetes.core.k8s: state: present - definition: "{{ lookup('template', 'habana-gaudi/habana_gpu_deviceconfig.yaml.j2') | from_yaml }}" - register: devconfig_result - until: devconfig_result is successful - retries: 30 + definition: "{{ lookup('template', 'habana-gaudi/habana_gpu_clusterpolicy.yaml.j2') | from_yaml }}" + register: devconfig_clusterpolicy_result + until: devconfig_clusterpolicy_result is successful + retries: 300 delay: 5 diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/habana-gaudi/habana_gpu_clusterpolicy.yaml.j2 b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/habana-gaudi/habana_gpu_clusterpolicy.yaml.j2 new file mode 100644 index 00000000000..1cb11bd1e36 --- /dev/null +++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/habana-gaudi/habana_gpu_clusterpolicy.yaml.j2 @@ -0,0 +1,109 @@ +kind: ClusterPolicy +apiVersion: habanalabs.habana.ai/v1 +metadata: + name: habana-ai +spec: + bmc_monitoring: + image: + repository: vault.habana.ai/habana-bmc-exporter/bmc-exporter + tag: "{{ habana_gaudi_image_tag}}" + resources: + limits: + cpu: 250m + memory: 250Mi + requests: + cpu: 150m + memory: 100Mi + device_plugin: + image: + repository: vault.habana.ai/docker-k8s-device-plugin/docker-k8s-device-plugin + tag: "{{ habana_gaudi_image_tag}}" + resources: + limits: + cpu: 20m + memory: 64Mi + requests: + cpu: 10m + memory: 32Mi + driver: + driver_loader: + images: + rhel_8.6: + repository: vault.habana.ai/habana-ai-operator/driver/rhel8.6/driver-installer + tag: "{{ habana_gaudi_image_tag}}" + rhel_9.2: + repository: vault.habana.ai/habana-ai-operator/driver/rhel9.2/driver-installer + tag: "{{ habana_gaudi_image_tag}}" + rhel_9.4: + 
repository: vault.habana.ai/habana-ai-operator/driver/rhel9.4/driver-installer + tag: "{{ habana_gaudi_image_tag}}" + tencentos_3.1: + repository: vault.habana.ai/habana-ai-operator/driver/tencentos3.1/driver-installer + tag: "{{ habana_gaudi_image_tag}}" + ubuntu_22.04: + repository: vault.habana.ai/habana-ai-operator/driver/ubuntu22.04/driver-installer + tag: "{{ habana_gaudi_image_tag}}" + mlnx_ofed_repo_path: artifactory/gaudi-installer/deps + mlnx_ofed_version: mlnx-ofed-5.8-2.0.3.0-rhel8.4-x86_64.tar.gz + repo_path: artifactory/gaudi-installer/repos + repo_server: vault.habana.ai + resources: + limits: + cpu: 4000m + memory: 16Gi + requests: + cpu: 2000m + memory: 8Gi + driver_runner: + image: + repository: vault.habana.ai/habana-ai-operator/driver/ubuntu22.04/driver-installer + tag: "{{ habana_gaudi_image_tag}}" + resources: + limits: + cpu: 20m + memory: 64Mi + requests: + cpu: 10m + memory: 32Mi + feature_discovery: + nfd_plugin: false + runner: + image: + repository: vault.habana.ai/habana-ai-operator/habanalabs-feature-discovery + tag: "{{ habana_gaudi_image_tag}}" + resources: + limits: + cpu: 20m + memory: 64Mi + requests: + cpu: 10m + memory: 32Mi + image_registry: vault.habana.ai + metric_exporter: + interval: 20 + port: 41611 + runner: + image: + repository: vault.habana.ai/gaudi-metric-exporter/metric-exporter + tag: "{{ habana_gaudi_image_tag}}" + resources: + limits: + cpu: 150m + memory: 120Mi + requests: + cpu: 100m + memory: 100Mi + runtime: + configuration: + container_engine: crio + runner: + image: + repository: vault.habana.ai/habana-ai-operator/habana-container-runtime + tag: "{{ habana_gaudi_image_tag}}" + resources: + limits: + cpu: 20m + memory: 64Mi + requests: + cpu: 10m + memory: 32Mi \ No newline at end of file