From ed4fc2cb6a2e212a498791cd55e5586eef809b90 Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Wed, 6 Dec 2023 13:25:10 +0000 Subject: [PATCH 01/18] #593 Produce list of models differently --- .../templates/patch-watsonx_ai-models.j2 | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 index 53dcf8b8d..678994cff 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 @@ -1,9 +1,4 @@ oc patch Watsonxaiifm watsonxaiifm-cr \ -n {{ current_cp4d_cluster.project }} \ --type=merge \ - --patch '{"spec": {"install_model_list": [{% for m in _configured_watsonxai_instances | default([]) -%} -{%- if ((m.state | default('installed')) == 'installed') -%} -{%- if not loop.first -%},{% endif -%} -"{{ m.model_id }}" -{%- endif -%} -{%- endfor -%}] } }' \ No newline at end of file + --patch '{"spec": {"install_model_list": [ {{ _configured_watsonxai_instances | default([]) | selectattr('state','match','installed' ) | map(attribute='model_id') | join(',') }} ] } }' \ No newline at end of file From c465b65b163fb8f0096db08e132d6bb68197189d Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Wed, 6 Dec 2023 14:54:36 +0000 Subject: [PATCH 02/18] #593 Add quotes --- .../templates/patch-watsonx_ai-models.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 index 678994cff..a3d9a3d25 100644 --- 
a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 @@ -1,4 +1,4 @@ oc patch Watsonxaiifm watsonxaiifm-cr \ -n {{ current_cp4d_cluster.project }} \ --type=merge \ - --patch '{"spec": {"install_model_list": [ {{ _configured_watsonxai_instances | default([]) | selectattr('state','match','installed' ) | map(attribute='model_id') | join(',') }} ] } }' \ No newline at end of file + --patch '{"spec": {"install_model_list": [ {{ '\"' + _configured_watsonxai_instances | default([]) | selectattr('state','match','installed' ) | map(attribute='model_id') | join('\",\"') + '\"' }} ] } }' \ No newline at end of file From 061c95ed3801c55b1bd003fe755f46d8e1586185 Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Mon, 11 Dec 2023 19:51:15 +0000 Subject: [PATCH 03/18] #594 Distinguish between specific and generic oc-login --- .../existing-ocp-login-oc-login-generic.yml | 9 ++++++++ .../existing-ocp-login-oc-login-specific.yml | 9 ++++++++ .../tasks/existing-ocp-login-oc-login.yml | 21 +++---------------- 3 files changed, 21 insertions(+), 18 deletions(-) create mode 100644 automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login-generic.yml create mode 100644 automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login-specific.yml diff --git a/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login-generic.yml b/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login-generic.yml new file mode 100644 index 000000000..2e2847ebc --- /dev/null +++ b/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login-generic.yml @@ -0,0 +1,9 @@ +--- +- name: Login to OpenShift cluster if oc login command was found in secret oc-login + shell: | + {{ _oc_login_generic }} + 
register: _oc_login_result + failed_when: False + retries: "{{ _ocp_login_retries }}" + delay: "{{ _ocp_login_delay }}" + until: _oc_login_result.rc==0 \ No newline at end of file diff --git a/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login-specific.yml b/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login-specific.yml new file mode 100644 index 000000000..3837bb17e --- /dev/null +++ b/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login-specific.yml @@ -0,0 +1,9 @@ +--- +- name: Login to OpenShift cluster if oc login command was found in secret {{ _p_openshift_cluster_name }}-oc-login + shell: | + {{ _oc_login_cluster }} + register: _oc_login_result + failed_when: False + retries: "{{ _ocp_login_retries }}" + delay: "{{ _ocp_login_delay }}" + until: _oc_login_result.rc==0 \ No newline at end of file diff --git a/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login.yml b/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login.yml index f53392ce0..d12a6c38f 100644 --- a/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login.yml +++ b/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login.yml @@ -1,30 +1,15 @@ --- -# Log in using oc login command -- name: Login to OpenShift cluster if oc login command was found in secret {{ _p_openshift_cluster_name }}-oc-login - shell: | - {{ _oc_login_cluster }} - register: _oc_login_result - failed_when: False - retries: "{{ _ocp_login_retries }}" - delay: "{{ _ocp_login_delay }}" - until: _oc_login_result.rc==0 +- include_tasks: existing-ocp-login-oc-login-specific.yml when: _oc_login_cluster != '' -- name: Login to OpenShift cluster if oc login command was found in secret oc-login - shell: | - {{ _oc_login_generic }} - register: _oc_login_result - failed_when: False - retries: "{{ 
_ocp_login_retries }}" - delay: "{{ _ocp_login_delay }}" - until: _oc_login_result.rc==0 +- include_tasks: existing-ocp-login-oc-login-generic.yml when: - _oc_login_generic != '' - _oc_login_cluster == '' - name: Show OpenShift login result debug: - msg: "{{_oc_login_result }}" + var: _oc_login_result - fail: msg: "OpenShift login to cluster {{ _p_openshift_cluster_name }} failed, details: {{ _oc_login_result }}" From 3aa7ffa5333be16a4862e565a6f93245fe85b3a2 Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Mon, 11 Dec 2023 19:52:17 +0000 Subject: [PATCH 04/18] #595 Fix lists in dev docs --- docs/src/80-development/deployer-development-setup.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/src/80-development/deployer-development-setup.md b/docs/src/80-development/deployer-development-setup.md index ef86d11c4..32ee9f302 100644 --- a/docs/src/80-development/deployer-development-setup.md +++ b/docs/src/80-development/deployer-development-setup.md @@ -80,6 +80,7 @@ gpg --default-new-key-algo rsa4096 --gen-key ``` You will be prompted to specify your user information: + * Real name: Enter your full name * Email address: Your e-mail address that will be used to sign the commits @@ -132,6 +133,7 @@ git config --global user.signingkey A83C67A6D7F71756 ``` Next, add your GPG key to your Git user. + * Go to https://github.com/IBM/cloud-pak-deployer.git * Log in using your public GitHub user * Click on your user at the top right of the pages @@ -153,6 +155,7 @@ git clone https://github.com/IBM/cloud-pak-deployer.git ``` ### Connect VSCode to the development server + * Install the **Remote - SSH** extension in VSCode * Click on the green icon in the lower left of VSCode * Open SSH Config file, choose the one in your home directory @@ -164,6 +167,7 @@ Host nickname_of_your_server ``` Once you have set up this server in the SSH config file, you can connect to it and start remote development. 
+ * Open * Select the `cloud-pak-deployer` directory (this is the cloned repository) * As the directory is a cloned Git repo, VSCode will automatically open the default branch From d21637c3b7072c9e5d8847018943c0643608be43 Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Tue, 12 Dec 2023 12:36:00 +0000 Subject: [PATCH 05/18] #593 Handle scenario of no FMs --- .../tasks/configure-watsonx_ai-instances.yml | 3 ++- .../cp4d/cp4d-instance-watsonx_ai/tasks/main.yml | 2 +- .../tasks/wait-watsonx_ai-instances.yml | 7 ++++++- .../templates/patch-watsonx_ai-models.j2 | 9 ++++++++- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/configure-watsonx_ai-instances.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/configure-watsonx_ai-instances.yml index 696ff4697..9ae069fd2 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/configure-watsonx_ai-instances.yml +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/configure-watsonx_ai-instances.yml @@ -7,4 +7,5 @@ - name: Run script to configure running watsonx.ai models, output is in {{ status_dir }}/log/{{ current_cp4d_cluster.project }}-patch-watsonxaiifm.log shell: | - {{ status_dir }}/cp4d/{{ current_cp4d_cluster.project }}-patch-watsonxaiifm.sh > {{ status_dir }}/log/{{ current_cp4d_cluster.project }}-patch-watsonxaiifm.log \ No newline at end of file + {{ status_dir }}/cp4d/{{ current_cp4d_cluster.project }}-patch-watsonxaiifm.sh > {{ status_dir }}/log/{{ current_cp4d_cluster.project }}-patch-watsonxaiifm.log + register: _patch_watsonxaiifm_result \ No newline at end of file diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml index 90a8383eb..404624995 100644 --- 
a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml @@ -13,7 +13,7 @@ - set_fact: _configured_watsonxai_instances: [] - set_fact: - _configured_watsonxai_instances: "{{ _configured_watsonxai_instances + (_watsonxai_cartridge.models | default([])) }}" + _configured_watsonxai_instances: "{{ _watsonxai_cartridge.models | default([]) }}" - include_tasks: configure-watsonx_ai-instances.yml when: diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/wait-watsonx_ai-instances.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/wait-watsonx_ai-instances.yml index 73ac38b97..8398d68e7 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/wait-watsonx_ai-instances.yml +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/wait-watsonx_ai-instances.yml @@ -1,9 +1,14 @@ --- +- name: Wait for 1 minute to let the operator update the Watsonxaiifm CR if a change was made + pause: + seconds: 60 + when: _patch_watsonxaiifm_result.changed | default(False) + - name: Wait for Watsonxaiifm watsonxaiifm-cr to reach Completed status shell: | oc get Watsonxaiifm watsonxaiifm-cr -n {{ current_cp4d_cluster.project }} --output json | jq -r '.status.watsonxaiifmStatus' | grep -i 'completed' | wc -l register: _deployed_watsonxaiifm_status - retries: 30 + retries: 60 delay: 60 until: _deployed_watsonxaiifm_status.stdout == "1" vars: diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 index a3d9a3d25..059f9adf5 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 +++ 
b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 @@ -1,4 +1,11 @@ +{% if (_configured_watsonxai_instances | default([]) | selectattr('state','match','installed' ) | length) != 0 %} oc patch Watsonxaiifm watsonxaiifm-cr \ -n {{ current_cp4d_cluster.project }} \ --type=merge \ - --patch '{"spec": {"install_model_list": [ {{ '\"' + _configured_watsonxai_instances | default([]) | selectattr('state','match','installed' ) | map(attribute='model_id') | join('\",\"') + '\"' }} ] } }' \ No newline at end of file + --patch '{"spec": {"install_model_list": [ {{ '\"' + _configured_watsonxai_instances | default([]) | selectattr('state','match','installed' ) | map(attribute='model_id') | join('\",\"') + '\"' }} ] } }' +{% else %} +oc patch Watsonxaiifm watsonxaiifm-cr \ + -n {{ current_cp4d_cluster.project }} \ + --type=merge \ + --patch '{"spec": {"install_model_list": [] } }' +{% endif %} \ No newline at end of file From 6f17798e5384407c17b61673ac6229dbf694a8c9 Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Tue, 12 Dec 2023 20:46:06 +0000 Subject: [PATCH 06/18] #593 Delete FM inference server deployments --- .../tasks/delete-watsonx_ai-instance.yml | 8 ++++++++ .../tasks/delete-watsonx_ai-instances.yml | 6 ++++++ .../cp4d/cp4d-instance-watsonx_ai/tasks/main.yml | 2 ++ 3 files changed, 16 insertions(+) create mode 100644 automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml create mode 100644 automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instances.yml diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml new file mode 100644 index 000000000..aca179d0f --- /dev/null +++ 
b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml @@ -0,0 +1,8 @@ +--- +- name: Delete deployment for watsonx.ai instance {{ _watsonx_ai_instance.model_id }} + shell: | + oc delete deployment \ + -n {{ current_cp4d_cluster.project }} \ + {{ _watsonx_ai_instance.model_id }}-inference-server \ + --ignore-not-found + when: (_watsonx_ai_instance.state | default('')) == 'removed' \ No newline at end of file diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instances.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instances.yml new file mode 100644 index 000000000..442b5c8ff --- /dev/null +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instances.yml @@ -0,0 +1,6 @@ +--- +- name: Loop through each watsonx.ai foundation model inference server to check if it must be deleted + include_tasks: delete-watsonx_ai-instance.yml + loop: "{{ _configured_watsonxai_instances }}" + loop_control: + loop_var: _watsonx_ai_instance \ No newline at end of file diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml index 404624995..657a6098b 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml @@ -28,3 +28,5 @@ when: - _watsonxai_cartridge != {} - (_watsonxai_cartridge.state | default('installed')) == 'installed' + +- include_tasks: delete-watsonx_ai-instances.yml \ No newline at end of file From ce6b112eaf06baca4b3efb28c820896645d2477d Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Tue, 12 Dec 2023 21:58:21 +0000 Subject: [PATCH 07/18] #597 Detect Satellite cluster --- 
.../tasks/retrieve-openshift-type-existing-ocp.yml | 2 +- .../40-configure-infra/retrieve-cloud-infra-type/vars/main.yml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/automation-roles/40-configure-infra/retrieve-cloud-infra-type/tasks/retrieve-openshift-type-existing-ocp.yml b/automation-roles/40-configure-infra/retrieve-cloud-infra-type/tasks/retrieve-openshift-type-existing-ocp.yml index 9f0b9249e..e087c5ec6 100644 --- a/automation-roles/40-configure-infra/retrieve-cloud-infra-type/tasks/retrieve-openshift-type-existing-ocp.yml +++ b/automation-roles/40-configure-infra/retrieve-cloud-infra-type/tasks/retrieve-openshift-type-existing-ocp.yml @@ -23,7 +23,7 @@ # Extra handling for ibm cloud - set_fact: _existing_ocp_infra_type: "ibm-roks" - when: _storage_inferred_ocp_infra_type == "ibm-classic" or _storage_inferred_ocp_infra_type == "ibm-vpc-gen2" + when: _storage_inferred_ocp_infra_type == "ibm-classic" or _storage_inferred_ocp_infra_type == "ibm-vpc-gen2" or _storage_inferred_ocp_infra_type == "ibm-satellite" # Extra handling for AWS - name: Distinquish AWS OpenShift between self-managed and ROSA diff --git a/automation-roles/40-configure-infra/retrieve-cloud-infra-type/vars/main.yml b/automation-roles/40-configure-infra/retrieve-cloud-infra-type/vars/main.yml index 8ad35ee55..62e4311fe 100644 --- a/automation-roles/40-configure-infra/retrieve-cloud-infra-type/vars/main.yml +++ b/automation-roles/40-configure-infra/retrieve-cloud-infra-type/vars/main.yml @@ -3,6 +3,8 @@ existing_ocp_cloud_infra: storage_class: "ibmc-file-gold" ibm-vpc-gen2: storage_class: "ibmc-vpc" + ibm-satellite: + storage_class: "sat-ocs-ceph" aws: storage_class: "gp" azure-aro: From e3808be369a1f32238c0dcb4e8f72b0d5c1d40d5 Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Tue, 12 Dec 2023 22:31:48 +0000 Subject: [PATCH 08/18] #596 Install NFD and NVIDIA operators --- .../configure-openshift/tasks/main.yml | 6 ++ .../nfd-operator/tasks/main.yml | 51 ++++++++++ 
.../nfd-operator/templates/nfd-cr.j2 | 81 +++++++++++++++ .../nfd-operator/templates/nfd-operator.j2 | 22 +++++ .../nvidia-operator/tasks/main.yml | 63 ++++++++++++ .../templates/nvidia-cluster-policy-cr.j2 | 98 +++++++++++++++++++ .../templates/nvidia-operator.j2 | 24 +++++ .../openshift-gpu/tasks/main.yml | 16 +++ 8 files changed, 361 insertions(+) create mode 100644 automation-roles/40-configure-infra/nfd-operator/tasks/main.yml create mode 100644 automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2 create mode 100644 automation-roles/40-configure-infra/nfd-operator/templates/nfd-operator.j2 create mode 100644 automation-roles/40-configure-infra/nvidia-operator/tasks/main.yml create mode 100644 automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-cluster-policy-cr.j2 create mode 100644 automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-operator.j2 create mode 100644 automation-roles/40-configure-infra/openshift-gpu/tasks/main.yml diff --git a/automation-roles/40-configure-infra/configure-openshift/tasks/main.yml b/automation-roles/40-configure-infra/configure-openshift/tasks/main.yml index 40c7561dd..989942bf8 100644 --- a/automation-roles/40-configure-infra/configure-openshift/tasks/main.yml +++ b/automation-roles/40-configure-infra/configure-openshift/tasks/main.yml @@ -63,5 +63,11 @@ - name: Configure Multi-Cloud Object Gateway include_role: name: openshift-mcg + vars: + _p_openshift_cluster: "{{ current_openshift_cluster }}" + +- name: Configure GPU for the OpenShift cluster + include_role: + name: openshift-gpu vars: _p_openshift_cluster: "{{ current_openshift_cluster }}" \ No newline at end of file diff --git a/automation-roles/40-configure-infra/nfd-operator/tasks/main.yml b/automation-roles/40-configure-infra/nfd-operator/tasks/main.yml new file mode 100644 index 000000000..aff58418d --- /dev/null +++ b/automation-roles/40-configure-infra/nfd-operator/tasks/main.yml @@ -0,0 +1,51 @@ +--- +- name: Create 
openshift-nfd OpenShift project + shell: | + oc create ns openshift-nfd || true + +- name: Retrieve default channel for Node Feature Discovery manifest + shell: + oc get packagemanifest nfd -o jsonpath='{.status.defaultChannel}' + register: _nfd_packagemanifest + +- set_fact: + _nfd_channel: "{{ _nfd_packagemanifest.stdout }}" + +- name: Generate NFD operator file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml + template: + src: nfd-operator.j2 + dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml" + +- name: Create NFD operator + shell: | + oc apply -f {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml + +- name: Wait until NFD Operator CSV has status Succeeded + shell: | + oc get csv -n openshift-nfd \ + -l operators.coreos.com/nfd.openshift-nfd \ + --no-headers \ + -o custom-columns='name:metadata.name,phase:status.phase' | \ + grep -i succeeded | wc -l + register: _nfd_csv_status + retries: 30 + delay: 30 + until: _nfd_csv_status.stdout == "1" + vars: + ansible_callback_diy_runner_retry_msg: >- + {%- set result = ansible_callback_diy.result.output -%} + {%- set retries_left = result.retries - result.attempts -%} + Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ... 
+ +- name: Get OpenShift version + include_role: + name: openshift-get-version + +- name: Generate NodeFeatureDiscovery CR file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml + template: + src: nfd-cr.j2 + dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml" + +- name: Create NFD CR + shell: | + oc apply -f {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml \ No newline at end of file diff --git a/automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2 b/automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2 new file mode 100644 index 000000000..0ce755a4b --- /dev/null +++ b/automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2 @@ -0,0 +1,81 @@ +--- +apiVersion: nfd.openshift.io/v1 +kind: NodeFeatureDiscovery +metadata: + name: nfd-instance + namespace: openshift-nfd +spec: + instance: "" # instance is empty by default + topologyupdater: false # False by default + operand: + image: registry.redhat.io/openshift4/ose-node-feature-discovery:v{{ _p_openshift_version }} + imagePullPolicy: Always + workerConfig: + configData: | + core: + # labelWhiteList: + # noPublish: false + sleepInterval: 60s + # sources: [all] + # klog: + # addDirHeader: false + # alsologtostderr: false + # logBacktraceAt: + # logtostderr: true + # skipHeaders: false + # stderrthreshold: 2 + # v: 0 + # vmodule: + ## NOTE: the following options are not dynamically run-time configurable + ## and require a nfd-worker restart to take effect after being changed + # logDir: + # logFile: + # logFileMaxSize: 1800 + # skipLogHeaders: false + sources: + cpu: + cpuid: + # NOTE: whitelist has priority over blacklist + attributeBlacklist: + - "BMI1" + - "BMI2" + - "CLMUL" + - "CMOV" + - "CX16" + - "ERMS" + - "F16C" + - "HTT" + - "LZCNT" + - "MMX" + - "MMXEXT" + - "NX" + - "POPCNT" + - "RDRAND" + - "RDSEED" + - "RDTSCP" + - "SGX" + - "SSE" + - "SSE2" + - "SSE3" + - "SSE4.1" + - "SSE4.2" + 
- "SSSE3" + attributeWhitelist: + kernel: + kconfigFile: "/path/to/kconfig" + configOpts: + - "NO_HZ" + - "X86" + - "DMI" + pci: + deviceClassWhitelist: + - "0200" + - "03" + - "12" + deviceLabelFields: + - "class" + customConfig: + configData: | + - name: "more.kernel.features" + matchOn: + - loadedKMod: ["example_kmod3"] \ No newline at end of file diff --git a/automation-roles/40-configure-infra/nfd-operator/templates/nfd-operator.j2 b/automation-roles/40-configure-infra/nfd-operator/templates/nfd-operator.j2 new file mode 100644 index 000000000..63fb805d3 --- /dev/null +++ b/automation-roles/40-configure-infra/nfd-operator/templates/nfd-operator.j2 @@ -0,0 +1,22 @@ +--- +apiVersion: operators.coreos.com/v1 +kind: OperatorGroup +metadata: + generateName: openshift-nfd- + namespace: openshift-nfd +spec: + upgradeStrategy: Default +--- +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + labels: + operators.coreos.com/nfd.openshift-nfd: "" + name: nfd + namespace: openshift-nfd +spec: + channel: {{ _nfd_channel }} + installPlanApproval: Automatic + name: nfd + source: redhat-operators + sourceNamespace: openshift-marketplace \ No newline at end of file diff --git a/automation-roles/40-configure-infra/nvidia-operator/tasks/main.yml b/automation-roles/40-configure-infra/nvidia-operator/tasks/main.yml new file mode 100644 index 000000000..062d51ef3 --- /dev/null +++ b/automation-roles/40-configure-infra/nvidia-operator/tasks/main.yml @@ -0,0 +1,63 @@ +--- +- name: Create nvidia-gpu-operator OpenShift project + shell: | + oc create ns nvidia-gpu-operator || true + +- name: Retrieve default channel for the NVIDIA GPU manifest + shell: + oc get packagemanifest nvidia-network-operator -o jsonpath='{.status.defaultChannel}' + register: _nvidia_packagemanifest + +- set_fact: + _nvidia_channel: "{{ _nvidia_packagemanifest.stdout }}" + +- name: Generate NVIDIA operator file {{ status_dir }}/openshift/{{ current_openshift_cluster.name 
}}-nvidia-operator.yaml + template: + src: nvidia-operator.j2 + dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml" + +- name: Create NVIDIA operator + shell: | + oc apply -f {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml + +- name: Wait until NVIDIA Operator CSV has status Succeeded + shell: | + oc get csv -n nvidia-gpu-operator \ + -l operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator \ + --no-headers \ + -o custom-columns='name:metadata.name,phase:status.phase' | \ + grep -i succeeded | wc -l + register: _nvidia_csv_status + retries: 30 + delay: 30 + until: _nvidia_csv_status.stdout == "1" + vars: + ansible_callback_diy_runner_retry_msg: >- + {%- set result = ansible_callback_diy.result.output -%} + {%- set retries_left = result.retries - result.attempts -%} + Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ... + +- name: Generate NVIDIA ClusterPolicy CR file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml + template: + src: nvidia-cluster-policy-cr.j2 + dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml" + +- name: Create NVIDIA ClusterPolicy CR + shell: | + oc apply -f {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml + +- name: Wait until NVIDIA ClusterPolicy has status Ready + shell: | + oc get clusterpolicies.nvidia.com gpu-cluster-policy \ + --no-headers \ + -o custom-columns='name:metadata.name,phase:status.state' | \ + grep -i ready | wc -l + register: _nvidia_cluster_policy_status + retries: 30 + delay: 30 + until: _nvidia_cluster_policy_status.stdout == "1" + vars: + ansible_callback_diy_runner_retry_msg: >- + {%- set result = ansible_callback_diy.result.output -%} + {%- set retries_left = result.retries - result.attempts -%} + Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left 
}} Retries left) ... \ No newline at end of file diff --git a/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-cluster-policy-cr.j2 b/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-cluster-policy-cr.j2 new file mode 100644 index 000000000..92161c75e --- /dev/null +++ b/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-cluster-policy-cr.j2 @@ -0,0 +1,98 @@ +--- +apiVersion: nvidia.com/v1 +kind: ClusterPolicy +metadata: + name: gpu-cluster-policy +spec: + cdi: + default: false + enabled: false + daemonsets: + rollingUpdate: + maxUnavailable: "1" + updateStrategy: RollingUpdate + dcgm: + enabled: true + dcgmExporter: + config: + name: "" + enabled: true + serviceMonitor: + enabled: true + devicePlugin: + config: + default: "" + name: "" + enabled: true + driver: + certConfig: + name: "" + enabled: true + kernelModuleConfig: + name: "" + licensingConfig: + configMapName: "" + nlsEnabled: true + repoConfig: + configMapName: "" + upgradePolicy: + autoUpgrade: true + drain: + deleteEmptyDir: false + enable: false + force: false + timeoutSeconds: 300 + maxParallelUpgrades: 1 + maxUnavailable: 25% + podDeletion: + deleteEmptyDir: false + force: false + timeoutSeconds: 300 + waitForCompletion: + timeoutSeconds: 0 + useNvidiaDriverCRD: false + useOpenKernelModules: false + virtualTopology: + config: "" + gds: + enabled: false + gfd: + enabled: true + kataManager: + config: + artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses + mig: + strategy: single + migManager: + config: + default: all-disabled + name: default-mig-parted-config + enabled: true + nodeStatusExporter: + enabled: true + operator: + defaultRuntime: crio + initContainer: {} + runtimeClass: nvidia + use_ocp_driver_toolkit: true + sandboxDevicePlugin: + enabled: true + sandboxWorkloads: + defaultWorkload: container + enabled: false + toolkit: + enabled: true + installDir: /usr/local/nvidia + validator: + plugin: + env: + - name: 
WITH_WORKLOAD + value: "false" + vfioManager: + enabled: true + vgpuDeviceManager: + config: + default: default + enabled: true + vgpuManager: + enabled: false \ No newline at end of file diff --git a/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-operator.j2 b/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-operator.j2 new file mode 100644 index 000000000..a9f631228 --- /dev/null +++ b/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-operator.j2 @@ -0,0 +1,24 @@ +--- +apiVersion: operators.coreos.com/v1 +kind: OperatorGroup +metadata: + generateName: nvidia-gpu-operator- + namespace: nvidia-gpu-operator +spec: + targetNamespaces: + - nvidia-gpu-operator + upgradeStrategy: Default +--- +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + labels: + operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator: "" + name: gpu-operator-certified + namespace: nvidia-gpu-operator +spec: + channel: {{ _nvidia_channel }} + installPlanApproval: Automatic + name: gpu-operator-certified + source: certified-operators + sourceNamespace: openshift-marketplace \ No newline at end of file diff --git a/automation-roles/40-configure-infra/openshift-gpu/tasks/main.yml b/automation-roles/40-configure-infra/openshift-gpu/tasks/main.yml new file mode 100644 index 000000000..de1ae7223 --- /dev/null +++ b/automation-roles/40-configure-infra/openshift-gpu/tasks/main.yml @@ -0,0 +1,16 @@ +--- +- name: Validate mandatory variables for OpenShift NFD and GPU operators + assert: + that: + - _p_openshift_cluster is defined + +- block: + - name: Install Node Feature Discovery operator and CR + include_role: + name: nfd-operator + + - name: Install NVIDIA operator and CR + include_role: + name: nvidia-operator + + when: _p_openshift_cluster.gpu.install | default(False) | bool \ No newline at end of file From 6d192e88f5c1be19067274fc29aaf99cdeb0b396 Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Wed, 13 
Dec 2023 21:20:48 +0000 Subject: [PATCH 09/18] #596 Install NFD and NVIDIA operators --- .../40-configure-infra/nfd-operator/templates/nfd-cr.j2 | 2 +- .../nfd-operator/templates/nfd-operator.j2 | 2 +- .../40-configure-infra/nvidia-operator/tasks/main.yml | 2 +- .../nvidia-operator/templates/nvidia-operator.j2 | 2 +- .../tasks/cp4d-delete-cr-instances.yml | 7 ++++--- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2 b/automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2 index 0ce755a4b..c146af451 100644 --- a/automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2 +++ b/automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2 @@ -8,7 +8,7 @@ spec: instance: "" # instance is empty by default topologyupdater: false # False by default operand: - image: registry.redhat.io/openshift4/ose-node-feature-discovery:v{{ _p_openshift_version }} + image: registry.redhat.io/openshift4/ose-node-feature-discovery:v{{ _p_current_ocp_version }} imagePullPolicy: Always workerConfig: configData: | diff --git a/automation-roles/40-configure-infra/nfd-operator/templates/nfd-operator.j2 b/automation-roles/40-configure-infra/nfd-operator/templates/nfd-operator.j2 index 63fb805d3..2ab943566 100644 --- a/automation-roles/40-configure-infra/nfd-operator/templates/nfd-operator.j2 +++ b/automation-roles/40-configure-infra/nfd-operator/templates/nfd-operator.j2 @@ -2,7 +2,7 @@ apiVersion: operators.coreos.com/v1 kind: OperatorGroup metadata: - generateName: openshift-nfd- + name: openshift-nfd-og namespace: openshift-nfd spec: upgradeStrategy: Default diff --git a/automation-roles/40-configure-infra/nvidia-operator/tasks/main.yml b/automation-roles/40-configure-infra/nvidia-operator/tasks/main.yml index 062d51ef3..4aad1e595 100644 --- a/automation-roles/40-configure-infra/nvidia-operator/tasks/main.yml +++ b/automation-roles/40-configure-infra/nvidia-operator/tasks/main.yml 
@@ -5,7 +5,7 @@ - name: Retrieve default channel for the NVIDIA GPU manifest shell: - oc get packagemanifest nvidia-network-operator -o jsonpath='{.status.defaultChannel}' + oc get packagemanifest gpu-operator-certified -o jsonpath='{.status.defaultChannel}' register: _nvidia_packagemanifest - set_fact: diff --git a/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-operator.j2 b/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-operator.j2 index a9f631228..d856f1dd4 100644 --- a/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-operator.j2 +++ b/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-operator.j2 @@ -2,7 +2,7 @@ apiVersion: operators.coreos.com/v1 kind: OperatorGroup metadata: - generateName: nvidia-gpu-operator- + name: nvidia-gpu-operator-og namespace: nvidia-gpu-operator spec: targetNamespaces: diff --git a/automation-roles/50-install-cloud-pak/cp4d/cp4d-cartridge-remove/tasks/cp4d-delete-cr-instances.yml b/automation-roles/50-install-cloud-pak/cp4d/cp4d-cartridge-remove/tasks/cp4d-delete-cr-instances.yml index 16c8ffde1..351b73238 100644 --- a/automation-roles/50-install-cloud-pak/cp4d/cp4d-cartridge-remove/tasks/cp4d-delete-cr-instances.yml +++ b/automation-roles/50-install-cloud-pak/cp4d/cp4d-cartridge-remove/tasks/cp4d-delete-cr-instances.yml @@ -41,9 +41,10 @@ _p_delete_all_instances: True when: (_current_cartridge_cr.olm_utils_name | default("")) == "dv" -- name: Delete all OpenPages instances +- name: Delete all watsonx.ai instances include_role: - name: cp4d-instance-openpages + name: cp4d-instance-watsonx_ai vars: _p_delete_all_instances: True - when: (_current_cartridge_cr.olm_utils_name | default("")) == "openpages" \ No newline at end of file + when: (_current_cartridge_cr.olm_utils_name | default("")) == "watsonx_ai" + From cd7e2b4004cb6b0ad5aea2519355de6a44ba3466 Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Wed, 13 Dec 2023 21:21:21 +0000 Subject: [PATCH 
10/18] #593 Handle deletion of watsonx.ai --- .../tasks/delete-watsonx_ai-instance.yml | 10 +++++- .../cp4d-instance-watsonx_ai/tasks/main.yml | 34 +++++++++---------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml index aca179d0f..79963206c 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml @@ -5,4 +5,12 @@ -n {{ current_cp4d_cluster.project }} \ {{ _watsonx_ai_instance.model_id }}-inference-server \ --ignore-not-found - when: (_watsonx_ai_instance.state | default('')) == 'removed' \ No newline at end of file + when: (_watsonx_ai_instance.state | default('')) == 'removed' or (_p_delete_all_instances | default(False)) + +- name: Delete Watsonxaiifm CR if watsonx.ai was removed + shell: + oc delete Watsonxaiifm \ + -n {{ current_cp4d_cluster.project }} \ + watsonxaiifm-cr \ + --ignore-not-found + when: (_p_delete_all_instances | default(False)) \ No newline at end of file diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml index 657a6098b..1589c3b77 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml @@ -9,24 +9,24 @@ debug: var: _watsonxai_cartridge -- block: - - set_fact: - _configured_watsonxai_instances: [] - - set_fact: - _configured_watsonxai_instances: "{{ _watsonxai_cartridge.models | default([]) }}" - - - include_tasks: configure-watsonx_ai-instances.yml - when: - - 
(_p_delete_all_instances | default(False)) == False - - (_p_wait_instances | default(False)) == False - - - include_tasks: wait-watsonx_ai-instances.yml - when: - - (_p_delete_all_instances | default(False)) == False - - _p_wait_instances | default(False) - +- set_fact: + _configured_watsonxai_instances: [] +- set_fact: + _configured_watsonxai_instances: "{{ _watsonxai_cartridge.models | default([]) }}" when: - _watsonxai_cartridge != {} - (_watsonxai_cartridge.state | default('installed')) == 'installed' -- include_tasks: delete-watsonx_ai-instances.yml \ No newline at end of file +- include_tasks: configure-watsonx_ai-instances.yml + when: + - (_p_delete_all_instances | default(False)) == False + - (_p_wait_instances | default(False)) == False + +- include_tasks: delete-watsonx_ai-instances.yml + when: + - (_p_wait_instances | default(False)) == False + +- include_tasks: wait-watsonx_ai-instances.yml + when: + - (_p_delete_all_instances | default(False)) == False + - _p_wait_instances | default(False) \ No newline at end of file From c6891ed0355ce5d851e9d4c90baa6e7b052da5e1 Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Thu, 14 Dec 2023 06:55:18 +0000 Subject: [PATCH 11/18] #593 Handle deletion of watsonx.ai --- .../tasks/delete-watsonx_ai-instance.yml | 4 ++-- .../tasks/delete-watsonx_ai-instances.yml | 2 +- .../cp4d/cp4d-instance-watsonx_ai/tasks/main.yml | 15 +++++++++------ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml index 79963206c..0ad27de8c 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml @@ -5,7 +5,7 @@ -n {{ current_cp4d_cluster.project 
}} \ {{ _watsonx_ai_instance.model_id }}-inference-server \ --ignore-not-found - when: (_watsonx_ai_instance.state | default('')) == 'removed' or (_p_delete_all_instances | default(False)) + when: (_watsonx_ai_instance.state | default('')) == 'removed' or _delete_all_watsonx_ai_instances - name: Delete Watsonxaiifm CR if watsonx.ai was removed shell: @@ -13,4 +13,4 @@ -n {{ current_cp4d_cluster.project }} \ watsonxaiifm-cr \ --ignore-not-found - when: (_p_delete_all_instances | default(False)) \ No newline at end of file + when: _delete_all_watsonx_ai_instances \ No newline at end of file diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instances.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instances.yml index 442b5c8ff..ee56b5660 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instances.yml +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instances.yml @@ -1,6 +1,6 @@ --- - name: Loop through each watsonx.ai foundation model inference server to check if it must be deleted include_tasks: delete-watsonx_ai-instance.yml - loop: "{{ _configured_watsonxai_instances }}" + loop: "{{ _configured_watsonx_ai_instances }}" loop_control: loop_var: _watsonx_ai_instance \ No newline at end of file diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml index 1589c3b77..cf60d44ca 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml @@ -10,16 +10,19 @@ var: _watsonxai_cartridge - set_fact: - _configured_watsonxai_instances: [] + _configured_watsonx_ai_instances: [] + _delete_all_watsonx_ai_instances: 
False - set_fact: - _configured_watsonxai_instances: "{{ _watsonxai_cartridge.models | default([]) }}" + _configured_watsonx_ai_instances: "{{ _watsonxai_cartridge.models | default([]) }}" + +- set_fact: + _delete_all_watsonx_ai_instances: True when: - - _watsonxai_cartridge != {} - - (_watsonxai_cartridge.state | default('installed')) == 'installed' + - _watsonxai_cartridge == {} or (_watsonxai_cartridge.state | default('installed')) == 'removed' - include_tasks: configure-watsonx_ai-instances.yml when: - - (_p_delete_all_instances | default(False)) == False + - _delete_all_watsonx_ai_instances == False - (_p_wait_instances | default(False)) == False - include_tasks: delete-watsonx_ai-instances.yml @@ -28,5 +31,5 @@ - include_tasks: wait-watsonx_ai-instances.yml when: - - (_p_delete_all_instances | default(False)) == False + - _delete_all_watsonx_ai_instances == False - _p_wait_instances | default(False) \ No newline at end of file From 76a5ee68990af4539305b2cd0593516b4d9219d3 Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Thu, 14 Dec 2023 11:54:25 +0000 Subject: [PATCH 12/18] #593 Fix typo --- .../templates/patch-watsonx_ai-models.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 index 059f9adf5..5a9251b9e 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 @@ -1,8 +1,8 @@ -{% if (_configured_watsonxai_instances | default([]) | selectattr('state','match','installed' ) | length) != 0 %} +{% if (_configured_watsonx_ai_instances | default([]) | selectattr('state','match','installed' ) | length) != 0 %} oc patch Watsonxaiifm watsonxaiifm-cr \ -n {{ 
current_cp4d_cluster.project }} \ --type=merge \ - --patch '{"spec": {"install_model_list": [ {{ '\"' + _configured_watsonxai_instances | default([]) | selectattr('state','match','installed' ) | map(attribute='model_id') | join('\",\"') + '\"' }} ] } }' + --patch '{"spec": {"install_model_list": [ {{ '\"' + _configured_watsonx_ai_instances | default([]) | selectattr('state','match','installed' ) | map(attribute='model_id') | join('\",\"') + '\"' }} ] } }' {% else %} oc patch Watsonxaiifm watsonxaiifm-cr \ -n {{ current_cp4d_cluster.project }} \ From d9024b3ce810065f4ae2138395c767b19d11bf5d Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Thu, 14 Dec 2023 19:37:50 +0000 Subject: [PATCH 13/18] #596 Use the correct attributes for NFD --- .../nfd-operator/templates/nfd-cr.j2 | 125 ++++++++++++------ 1 file changed, 82 insertions(+), 43 deletions(-) diff --git a/automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2 b/automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2 index c146af451..021cf77fd 100644 --- a/automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2 +++ b/automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2 @@ -9,7 +9,7 @@ spec: topologyupdater: false # False by default operand: image: registry.redhat.io/openshift4/ose-node-feature-discovery:v{{ _p_current_ocp_version }} - imagePullPolicy: Always + servicePort: 12000 workerConfig: configData: | core: @@ -26,56 +26,95 @@ spec: # stderrthreshold: 2 # v: 0 # vmodule: - ## NOTE: the following options are not dynamically run-time configurable - ## and require a nfd-worker restart to take effect after being changed + ## NOTE: the following options are not dynamically run-time + ## configurable and require a nfd-worker restart to take effect + ## after being changed # logDir: # logFile: # logFileMaxSize: 1800 # skipLogHeaders: false sources: - cpu: - cpuid: - # NOTE: whitelist has priority over blacklist - attributeBlacklist: - - "BMI1" - - "BMI2" - 
- "CLMUL" - - "CMOV" - - "CX16" - - "ERMS" - - "F16C" - - "HTT" - - "LZCNT" - - "MMX" - - "MMXEXT" - - "NX" - - "POPCNT" - - "RDRAND" - - "RDSEED" - - "RDTSCP" - - "SGX" - - "SSE" - - "SSE2" - - "SSE3" - - "SSE4.1" - - "SSE4.2" - - "SSSE3" - attributeWhitelist: - kernel: - kconfigFile: "/path/to/kconfig" - configOpts: - - "NO_HZ" - - "X86" - - "DMI" + # cpu: + # cpuid: + ## NOTE: whitelist has priority over blacklist + # attributeBlacklist: + # - "BMI1" + # - "BMI2" + # - "CLMUL" + # - "CMOV" + # - "CX16" + # - "ERMS" + # - "F16C" + # - "HTT" + # - "LZCNT" + # - "MMX" + # - "MMXEXT" + # - "NX" + # - "POPCNT" + # - "RDRAND" + # - "RDSEED" + # - "RDTSCP" + # - "SGX" + # - "SSE" + # - "SSE2" + # - "SSE3" + # - "SSE4.1" + # - "SSE4.2" + # - "SSSE3" + # attributeWhitelist: + # kernel: + # kconfigFile: "/path/to/kconfig" + # configOpts: + # - "NO_HZ" + # - "X86" + # - "DMI" pci: deviceClassWhitelist: - "0200" - "03" - "12" deviceLabelFields: - - "class" - customConfig: - configData: | - - name: "more.kernel.features" - matchOn: - - loadedKMod: ["example_kmod3"] \ No newline at end of file + # - "class" + - "vendor" + # - "device" + # - "subsystem_vendor" + # - "subsystem_device" + # usb: + # deviceClassWhitelist: + # - "0e" + # - "ef" + # - "fe" + # - "ff" + # deviceLabelFields: + # - "class" + # - "vendor" + # - "device" + # custom: + # - name: "my.kernel.feature" + # matchOn: + # - loadedKMod: ["example_kmod1", "example_kmod2"] + # - name: "my.pci.feature" + # matchOn: + # - pciId: + # class: ["0200"] + # vendor: ["15b3"] + # device: ["1014", "1017"] + # - pciId : + # vendor: ["8086"] + # device: ["1000", "1100"] + # - name: "my.usb.feature" + # matchOn: + # - usbId: + # class: ["ff"] + # vendor: ["03e7"] + # device: ["2485"] + # - usbId: + # class: ["fe"] + # vendor: ["1a6e"] + # device: ["089a"] + # - name: "my.combined.feature" + # matchOn: + # - pciId: + # vendor: ["15b3"] + # device: ["1014", "1017"] + # loadedKMod : ["vendor_kmod1", "vendor_kmod2"] From 
46d3da728b5beed4b47412f6d24259c803170cfd Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Thu, 14 Dec 2023 21:32:27 +0000 Subject: [PATCH 14/18] #596 Document GPU configuration + sample configs --- docs/src/30-reference/configuration/openshift.md | 4 ++++ .../sample-dynamic/config-samples/cp4d-480.yaml | 2 ++ .../sample-dynamic/config-samples/ocp-existing-ocp-auto.yaml | 2 ++ .../config-samples/ocp-existing-ocp-satellite-ocs.yaml | 2 ++ .../sample-dynamic/config-samples/ocp-existing-ocp.yaml | 2 ++ .../config-samples/ocp-existing-roks-classic.yaml | 2 ++ .../sample-dynamic/config-samples/watsonx-480.yaml | 5 ++--- 7 files changed, 16 insertions(+), 3 deletions(-) diff --git a/docs/src/30-reference/configuration/openshift.md b/docs/src/30-reference/configuration/openshift.md index af9e407d4..67e769991 100644 --- a/docs/src/30-reference/configuration/openshift.md +++ b/docs/src/30-reference/configuration/openshift.md @@ -475,6 +475,8 @@ openshift: - example.com dns_servers: - 172.31.2.73:53 + gpu: + install: False mcg: install: True storage_type: storage-class @@ -500,6 +502,8 @@ openshift: | infrastructure.processor_architecture | Architecture of the processor that the OpenShift cluster is deployed on | No | amd64 (default), ppc64le, s390x | | openshift_logging[] | Logging attributes for OpenShift cluster, see [OpenShift logging](logging-auditing.md) | No | | | upstream_dns[] | Upstream DNS servers(s), see [Upstream DNS Servers](./dns.md) | No | | +| gpu | Control Node Feature Discovery and NVIDIA GPU operators | No | | +| gpu.install | Must Node Feature Discovery and NVIDIA GPU operators be installed (Once installed, False does not uninstall) | Yes | True, False | | mcg | Multicloud Object Gateway properties | No | | | mcg.install | Must Multicloud Object Gateway be installed (Once installed, False does not uninstall) | Yes | True, False | | mcg.storage_type | Type of storage supporting the object Noobaa object storage | Yes | storage-class | diff --git 
a/sample-configurations/sample-dynamic/config-samples/cp4d-480.yaml b/sample-configurations/sample-dynamic/config-samples/cp4d-480.yaml index edbaca64e..15faa743c 100644 --- a/sample-configurations/sample-dynamic/config-samples/cp4d-480.yaml +++ b/sample-configurations/sample-dynamic/config-samples/cp4d-480.yaml @@ -222,6 +222,8 @@ cp4d: # noobaa_cert_secret: noobaa-s3-serving-cert state: removed + # Please note that for watsonx.ai foundation models, you need to install the + # Node Feature Discovery and NVIDIA GPU operators. You can do so by setting the openshift.gpu.install property to True - name: watsonx_ai description: watsonx.ai state: removed diff --git a/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp-auto.yaml b/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp-auto.yaml index e37b9a3e5..0ce26f55a 100644 --- a/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp-auto.yaml +++ b/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp-auto.yaml @@ -14,6 +14,8 @@ openshift: install: True storage_type: storage-class storage_class: managed-nfs-storage + gpu: + install: False openshift_storage: - storage_name: auto-storage storage_type: auto diff --git a/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp-satellite-ocs.yaml b/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp-satellite-ocs.yaml index d5b007355..f793a9311 100644 --- a/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp-satellite-ocs.yaml +++ b/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp-satellite-ocs.yaml @@ -18,6 +18,8 @@ openshift: domain_name: example.com infrastructure: type: ibm-roks + gpu: + install: False mcg: install: False storage_type: storage-class diff --git a/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp.yaml b/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp.yaml index fbe5739df..866972e53 100644 
--- a/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp.yaml +++ b/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp.yaml @@ -10,6 +10,8 @@ openshift: ocp_version: 4.8 cluster_name: "{{ env_id }}" domain_name: example.com + gpu: + install: False mcg: install: False storage_type: storage-class diff --git a/sample-configurations/sample-dynamic/config-samples/ocp-existing-roks-classic.yaml b/sample-configurations/sample-dynamic/config-samples/ocp-existing-roks-classic.yaml index 44d515869..80111a2ea 100644 --- a/sample-configurations/sample-dynamic/config-samples/ocp-existing-roks-classic.yaml +++ b/sample-configurations/sample-dynamic/config-samples/ocp-existing-roks-classic.yaml @@ -10,6 +10,8 @@ openshift: ocp_version: 4.8 cluster_name: "{{ env_id }}" domain_name: example.com + gpu: + install: False mcg: install: True storage_type: storage-class diff --git a/sample-configurations/sample-dynamic/config-samples/watsonx-480.yaml b/sample-configurations/sample-dynamic/config-samples/watsonx-480.yaml index bafe1756d..b1c81e273 100644 --- a/sample-configurations/sample-dynamic/config-samples/watsonx-480.yaml +++ b/sample-configurations/sample-dynamic/config-samples/watsonx-480.yaml @@ -25,11 +25,10 @@ cp4d: # All tested cartridges. To install, change the "state" property to "installed". To uninstall, change the state # to "removed" or comment out the entire cartridge. Make sure that the "-" and properties are aligned with the lite # cartridge; the "-" is at position 3 and the property starts at position 5. -# -# If a cartridge has dependencies and you want to install it, you must ensure that the dependent cartridge is also -# installed. # + # Please note that for watsonx.ai foundation models, you need to install the + # Node Feature Discovery and NVIDIA GPU operators. 
You can do so by setting the openshift.gpu.install property to True - name: watsonx_ai description: watsonx.ai state: removed From f69a8682b507d6cbb576d6d9e56d5205de76af41 Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Fri, 15 Dec 2023 11:44:44 +0000 Subject: [PATCH 15/18] #602 WQ instance fails to provision --- .../cp4d-instance-dv/templates/dv_instance_40.json.j2 | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-dv/templates/dv_instance_40.json.j2 b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-dv/templates/dv_instance_40.json.j2 index c5222077c..90dd02c0b 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-dv/templates/dv_instance_40.json.j2 +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-dv/templates/dv_instance_40.json.j2 @@ -12,26 +12,26 @@ "enableHostIPC": "{{ cp4d_dv_instance_enable_host_ipc }}", {% if _storage_type == 'pwx' %} "persistence.storageClass": "portworx-dv-shared-gp3", - "persistence.workerpv.storageClass": "portworx-dv-shared-gp3", + "persistence.auditpv.storageClass": "portworx-dv-shared-gp3", "persistence.cachingpv.storageClass": "portworx-dv-shared-gp3", {% elif _storage_type == 'ocs' %} "persistence.storageClass": "{{ ocp_storage_class_block }}", - "persistence.workerpv.storageClass": "{{ ocp_storage_class_block }}", + "persistence.auditpv.storageClass": "{{ ocp_storage_class_block }}", "persistence.cachingpv.storageClass": "{{ ocp_storage_class_block }}", {% else %} "persistence.storageClass": "{{ ocp_storage_class_file }}", - "persistence.workerpv.storageClass": "{{ ocp_storage_class_file }}", + "persistence.auditpv.storageClass": "{{ ocp_storage_class_file }}", "persistence.cachingpv.storageClass": "{{ ocp_storage_class_file }}", {% endif %} "persistence.size": "{{ cp4d_dv_instance_persistence_storage_size }}", - "persistence.workerpv.size": "{{ cp4d_dv_instance_persistence_compute_storage_size 
}}", + "persistence.auditpv.size": "{{ cp4d_dv_instance_persistence_compute_storage_size }}", "persistence.cachingpv.size": "{{ cp4d_dv_instance_persistence_caching_storage_size }}" }, "resources":{ "cpu":"{{ cp4d_dv_instance_requests_cpu }}", "memory":"{{ cp4d_dv_instance_requests_memory }}" }, - "description":"Data Virtualization", + "description":"Watson Query", "metaData":{ } From aa946220e8701c8854809db6aef313f3da0e2713 Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Fri, 15 Dec 2023 14:11:54 +0000 Subject: [PATCH 16/18] #593 Do not fail if watsonx.ai not installed --- .../tasks/delete-watsonx_ai-instance.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml index 0ad27de8c..37b504bd5 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml @@ -1,11 +1,19 @@ --- +- name: Check if Watsonxaiifm CRD exists + shell: | + oc get crd watsonxaiifm.watsonxaiifm.cpd.ibm.com + register: _watsonxiifm_crd_state + - name: Delete deployment for watsonx.ai instance {{ _watsonx_ai_instance.model_id }} shell: | oc delete deployment \ -n {{ current_cp4d_cluster.project }} \ {{ _watsonx_ai_instance.model_id }}-inference-server \ --ignore-not-found - when: (_watsonx_ai_instance.state | default('')) == 'removed' or _delete_all_watsonx_ai_instances + when: + - _watsonxiifm_crd_state.rc == 0 + - (_watsonx_ai_instance.state | default('')) == 'removed' or _delete_all_watsonx_ai_instances + - name: Delete Watsonxaiifm CR if watsonx.ai was removed shell: @@ -13,4 +21,6 @@ -n {{ current_cp4d_cluster.project }} \ watsonxaiifm-cr \ --ignore-not-found - when: 
_delete_all_watsonx_ai_instances \ No newline at end of file + when: + - _watsonxiifm_crd_state.rc == 0 + - _delete_all_watsonx_ai_instances \ No newline at end of file From b17b20c3a3c0e65989618ef3dd12b312dab0ce29 Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Fri, 15 Dec 2023 14:17:39 +0000 Subject: [PATCH 17/18] #593 Don't fail if CRD doesn't exist --- .../tasks/delete-watsonx_ai-instance.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml index 37b504bd5..f60b98769 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml @@ -2,6 +2,7 @@ - name: Check if Watsonxaiifm CRD exists shell: | oc get crd watsonxaiifm.watsonxaiifm.cpd.ibm.com + failed_when: False register: _watsonxiifm_crd_state - name: Delete deployment for watsonx.ai instance {{ _watsonx_ai_instance.model_id }} From bf2878d4426559c6ab583fd5adfbef6c46e16bb1 Mon Sep 17 00:00:00 2001 From: Frank Ketelaars Date: Fri, 15 Dec 2023 14:25:00 +0000 Subject: [PATCH 18/18] #602 Use file storage class for Audit storage --- .../cp4d/cp4d-instance-dv/templates/dv_instance_40.json.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-dv/templates/dv_instance_40.json.j2 b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-dv/templates/dv_instance_40.json.j2 index 90dd02c0b..1b3b70466 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-dv/templates/dv_instance_40.json.j2 +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-dv/templates/dv_instance_40.json.j2 @@ -16,7 +16,7 @@ "persistence.cachingpv.storageClass": 
"portworx-dv-shared-gp3", {% elif _storage_type == 'ocs' %} "persistence.storageClass": "{{ ocp_storage_class_block }}", - "persistence.auditpv.storageClass": "{{ ocp_storage_class_block }}", + "persistence.auditpv.storageClass": "{{ ocp_storage_class_file }}", "persistence.cachingpv.storageClass": "{{ ocp_storage_class_block }}", {% else %} "persistence.storageClass": "{{ ocp_storage_class_file }}",