diff --git a/automation-roles/40-configure-infra/configure-openshift/tasks/main.yml b/automation-roles/40-configure-infra/configure-openshift/tasks/main.yml index 40c7561dd..989942bf8 100644 --- a/automation-roles/40-configure-infra/configure-openshift/tasks/main.yml +++ b/automation-roles/40-configure-infra/configure-openshift/tasks/main.yml @@ -63,5 +63,11 @@ - name: Configure Multi-Cloud Object Gateway include_role: name: openshift-mcg + vars: + _p_openshift_cluster: "{{ current_openshift_cluster }}" + +- name: Configure GPU for the OpenShift cluster + include_role: + name: openshift-gpu vars: _p_openshift_cluster: "{{ current_openshift_cluster }}" \ No newline at end of file diff --git a/automation-roles/40-configure-infra/nfd-operator/tasks/main.yml b/automation-roles/40-configure-infra/nfd-operator/tasks/main.yml new file mode 100644 index 000000000..aff58418d --- /dev/null +++ b/automation-roles/40-configure-infra/nfd-operator/tasks/main.yml @@ -0,0 +1,51 @@ +--- +- name: Create openshift-nfd OpenShift project + shell: | + oc create ns openshift-nfd || true + +- name: Retrieve default channel for Node Feature Discovery manifest + shell: + oc get packagemanifest nfd -o jsonpath='{.status.defaultChannel}' + register: _nfd_packagemanifest + +- set_fact: + _nfd_channel: "{{ _nfd_packagemanifest.stdout }}" + +- name: Generate NFD operator file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml + template: + src: nfd-operator.j2 + dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml" + +- name: Create NFD operator + shell: | + oc apply -f {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-operator.yaml + +- name: Wait until NFD Operator CSV has status Succeeded + shell: | + oc get csv -n openshift-nfd \ + -l operators.coreos.com/nfd.openshift-nfd \ + --no-headers \ + -o custom-columns='name:metadata.name,phase:status.phase' | \ + grep -i succeeded | wc -l + register: 
_nfd_csv_status + retries: 30 + delay: 30 + until: _nfd_csv_status.stdout == "1" + vars: + ansible_callback_diy_runner_retry_msg: >- + {%- set result = ansible_callback_diy.result.output -%} + {%- set retries_left = result.retries - result.attempts -%} + Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ... + +- name: Get OpenShift version + include_role: + name: openshift-get-version + +- name: Generate NodeFeatureDiscovery CR file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml + template: + src: nfd-cr.j2 + dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml" + +- name: Create NFD CR + shell: | + oc apply -f {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nfd-cr.yaml \ No newline at end of file diff --git a/automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2 b/automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2 new file mode 100644 index 000000000..021cf77fd --- /dev/null +++ b/automation-roles/40-configure-infra/nfd-operator/templates/nfd-cr.j2 @@ -0,0 +1,120 @@ +--- +apiVersion: nfd.openshift.io/v1 +kind: NodeFeatureDiscovery +metadata: + name: nfd-instance + namespace: openshift-nfd +spec: + instance: "" # instance is empty by default + topologyupdater: false # False by default + operand: + image: registry.redhat.io/openshift4/ose-node-feature-discovery:v{{ _p_current_ocp_version }} + servicePort: 12000 + workerConfig: + configData: | + core: + # labelWhiteList: + # noPublish: false + sleepInterval: 60s + # sources: [all] + # klog: + # addDirHeader: false + # alsologtostderr: false + # logBacktraceAt: + # logtostderr: true + # skipHeaders: false + # stderrthreshold: 2 + # v: 0 + # vmodule: + ## NOTE: the following options are not dynamically run-time + ## configurable and require a nfd-worker restart to take effect + ## after being changed + # logDir: + # logFile: + # logFileMaxSize: 1800 + # skipLogHeaders: false + 
sources: + # cpu: + # cpuid: + ## NOTE: whitelist has priority over blacklist + # attributeBlacklist: + # - "BMI1" + # - "BMI2" + # - "CLMUL" + # - "CMOV" + # - "CX16" + # - "ERMS" + # - "F16C" + # - "HTT" + # - "LZCNT" + # - "MMX" + # - "MMXEXT" + # - "NX" + # - "POPCNT" + # - "RDRAND" + # - "RDSEED" + # - "RDTSCP" + # - "SGX" + # - "SSE" + # - "SSE2" + # - "SSE3" + # - "SSE4.1" + # - "SSE4.2" + # - "SSSE3" + # attributeWhitelist: + # kernel: + # kconfigFile: "/path/to/kconfig" + # configOpts: + # - "NO_HZ" + # - "X86" + # - "DMI" + pci: + deviceClassWhitelist: + - "0200" + - "03" + - "12" + deviceLabelFields: + # - "class" + - "vendor" + # - "device" + # - "subsystem_vendor" + # - "subsystem_device" + # usb: + # deviceClassWhitelist: + # - "0e" + # - "ef" + # - "fe" + # - "ff" + # deviceLabelFields: + # - "class" + # - "vendor" + # - "device" + # custom: + # - name: "my.kernel.feature" + # matchOn: + # - loadedKMod: ["example_kmod1", "example_kmod2"] + # - name: "my.pci.feature" + # matchOn: + # - pciId: + # class: ["0200"] + # vendor: ["15b3"] + # device: ["1014", "1017"] + # - pciId : + # vendor: ["8086"] + # device: ["1000", "1100"] + # - name: "my.usb.feature" + # matchOn: + # - usbId: + # class: ["ff"] + # vendor: ["03e7"] + # device: ["2485"] + # - usbId: + # class: ["fe"] + # vendor: ["1a6e"] + # device: ["089a"] + # - name: "my.combined.feature" + # matchOn: + # - pciId: + # vendor: ["15b3"] + # device: ["1014", "1017"] + # loadedKMod : ["vendor_kmod1", "vendor_kmod2"] diff --git a/automation-roles/40-configure-infra/nfd-operator/templates/nfd-operator.j2 b/automation-roles/40-configure-infra/nfd-operator/templates/nfd-operator.j2 new file mode 100644 index 000000000..2ab943566 --- /dev/null +++ b/automation-roles/40-configure-infra/nfd-operator/templates/nfd-operator.j2 @@ -0,0 +1,22 @@ +--- +apiVersion: operators.coreos.com/v1 +kind: OperatorGroup +metadata: + name: openshift-nfd-og + namespace: openshift-nfd +spec: + upgradeStrategy: Default +--- 
+apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + labels: + operators.coreos.com/nfd.openshift-nfd: "" + name: nfd + namespace: openshift-nfd +spec: + channel: {{ _nfd_channel }} + installPlanApproval: Automatic + name: nfd + source: redhat-operators + sourceNamespace: openshift-marketplace \ No newline at end of file diff --git a/automation-roles/40-configure-infra/nvidia-operator/tasks/main.yml b/automation-roles/40-configure-infra/nvidia-operator/tasks/main.yml new file mode 100644 index 000000000..4aad1e595 --- /dev/null +++ b/automation-roles/40-configure-infra/nvidia-operator/tasks/main.yml @@ -0,0 +1,63 @@ +--- +- name: Create nvidia-gpu-operator OpenShift project + shell: | + oc create ns nvidia-gpu-operator || true + +- name: Retrieve default channel for the NVIDIA GPU manifest + shell: + oc get packagemanifest gpu-operator-certified -o jsonpath='{.status.defaultChannel}' + register: _nvidia_packagemanifest + +- set_fact: + _nvidia_channel: "{{ _nvidia_packagemanifest.stdout }}" + +- name: Generate NVIDIA operator file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml + template: + src: nvidia-operator.j2 + dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml" + +- name: Create NVIDIA operator + shell: | + oc apply -f {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-operator.yaml + +- name: Wait until NVIDIA Operator CSV has status Succeeded + shell: | + oc get csv -n nvidia-gpu-operator \ + -l operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator \ + --no-headers \ + -o custom-columns='name:metadata.name,phase:status.phase' | \ + grep -i succeeded | wc -l + register: _nvidia_csv_status + retries: 30 + delay: 30 + until: _nvidia_csv_status.stdout == "1" + vars: + ansible_callback_diy_runner_retry_msg: >- + {%- set result = ansible_callback_diy.result.output -%} + {%- set retries_left = result.retries - result.attempts -%} + 
Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ... + +- name: Generate NVIDIA ClusterPolicy CR file {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml + template: + src: nvidia-cluster-policy-cr.j2 + dest: "{{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml" + +- name: Create NVIDIA ClusterPolicy CR + shell: | + oc apply -f {{ status_dir }}/openshift/{{ current_openshift_cluster.name }}-nvidia-cluster-policy-cr.yaml + +- name: Wait until NVIDIA ClusterPolicy has status Ready + shell: | + oc get clusterpolicies.nvidia.com gpu-cluster-policy \ + --no-headers \ + -o custom-columns='name:metadata.name,phase:status.state' | \ + grep -i ready | wc -l + register: _nvidia_cluster_policy_status + retries: 30 + delay: 30 + until: _nvidia_cluster_policy_status.stdout == "1" + vars: + ansible_callback_diy_runner_retry_msg: >- + {%- set result = ansible_callback_diy.result.output -%} + {%- set retries_left = result.retries - result.attempts -%} + Retrying: {{ ansible_callback_diy.task.name }} ({{ retries_left }} Retries left) ... 
\ No newline at end of file diff --git a/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-cluster-policy-cr.j2 b/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-cluster-policy-cr.j2 new file mode 100644 index 000000000..92161c75e --- /dev/null +++ b/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-cluster-policy-cr.j2 @@ -0,0 +1,98 @@ +--- +apiVersion: nvidia.com/v1 +kind: ClusterPolicy +metadata: + name: gpu-cluster-policy +spec: + cdi: + default: false + enabled: false + daemonsets: + rollingUpdate: + maxUnavailable: "1" + updateStrategy: RollingUpdate + dcgm: + enabled: true + dcgmExporter: + config: + name: "" + enabled: true + serviceMonitor: + enabled: true + devicePlugin: + config: + default: "" + name: "" + enabled: true + driver: + certConfig: + name: "" + enabled: true + kernelModuleConfig: + name: "" + licensingConfig: + configMapName: "" + nlsEnabled: true + repoConfig: + configMapName: "" + upgradePolicy: + autoUpgrade: true + drain: + deleteEmptyDir: false + enable: false + force: false + timeoutSeconds: 300 + maxParallelUpgrades: 1 + maxUnavailable: 25% + podDeletion: + deleteEmptyDir: false + force: false + timeoutSeconds: 300 + waitForCompletion: + timeoutSeconds: 0 + useNvidiaDriverCRD: false + useOpenKernelModules: false + virtualTopology: + config: "" + gds: + enabled: false + gfd: + enabled: true + kataManager: + config: + artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses + mig: + strategy: single + migManager: + config: + default: all-disabled + name: default-mig-parted-config + enabled: true + nodeStatusExporter: + enabled: true + operator: + defaultRuntime: crio + initContainer: {} + runtimeClass: nvidia + use_ocp_driver_toolkit: true + sandboxDevicePlugin: + enabled: true + sandboxWorkloads: + defaultWorkload: container + enabled: false + toolkit: + enabled: true + installDir: /usr/local/nvidia + validator: + plugin: + env: + - name: WITH_WORKLOAD + value: "false" + 
vfioManager: + enabled: true + vgpuDeviceManager: + config: + default: default + enabled: true + vgpuManager: + enabled: false \ No newline at end of file diff --git a/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-operator.j2 b/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-operator.j2 new file mode 100644 index 000000000..d856f1dd4 --- /dev/null +++ b/automation-roles/40-configure-infra/nvidia-operator/templates/nvidia-operator.j2 @@ -0,0 +1,24 @@ +--- +apiVersion: operators.coreos.com/v1 +kind: OperatorGroup +metadata: + name: nvidia-gpu-operator-og + namespace: nvidia-gpu-operator +spec: + targetNamespaces: + - nvidia-gpu-operator + upgradeStrategy: Default +--- +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + labels: + operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator: "" + name: gpu-operator-certified + namespace: nvidia-gpu-operator +spec: + channel: {{ _nvidia_channel }} + installPlanApproval: Automatic + name: gpu-operator-certified + source: certified-operators + sourceNamespace: openshift-marketplace \ No newline at end of file diff --git a/automation-roles/40-configure-infra/openshift-gpu/tasks/main.yml b/automation-roles/40-configure-infra/openshift-gpu/tasks/main.yml new file mode 100644 index 000000000..de1ae7223 --- /dev/null +++ b/automation-roles/40-configure-infra/openshift-gpu/tasks/main.yml @@ -0,0 +1,16 @@ +--- +- name: Validate mandatory variables for OpenShift NFD and GPU operators + assert: + that: + - _p_openshift_cluster is defined + +- block: + - name: Install Node Feature Discovery operator and CR + include_role: + name: nfd-operator + + - name: Install NVIDIA operator and CR + include_role: + name: nvidia-operator + + when: _p_openshift_cluster.gpu.install | default(False) | bool \ No newline at end of file diff --git a/automation-roles/40-configure-infra/retrieve-cloud-infra-type/tasks/retrieve-openshift-type-existing-ocp.yml 
b/automation-roles/40-configure-infra/retrieve-cloud-infra-type/tasks/retrieve-openshift-type-existing-ocp.yml index 9f0b9249e..e087c5ec6 100644 --- a/automation-roles/40-configure-infra/retrieve-cloud-infra-type/tasks/retrieve-openshift-type-existing-ocp.yml +++ b/automation-roles/40-configure-infra/retrieve-cloud-infra-type/tasks/retrieve-openshift-type-existing-ocp.yml @@ -23,7 +23,7 @@ # Extra handling for ibm cloud - set_fact: _existing_ocp_infra_type: "ibm-roks" - when: _storage_inferred_ocp_infra_type == "ibm-classic" or _storage_inferred_ocp_infra_type == "ibm-vpc-gen2" + when: _storage_inferred_ocp_infra_type == "ibm-classic" or _storage_inferred_ocp_infra_type == "ibm-vpc-gen2" or _storage_inferred_ocp_infra_type == "ibm-satellite" # Extra handling for AWS - name: Distinquish AWS OpenShift between self-managed and ROSA diff --git a/automation-roles/40-configure-infra/retrieve-cloud-infra-type/vars/main.yml b/automation-roles/40-configure-infra/retrieve-cloud-infra-type/vars/main.yml index 8ad35ee55..62e4311fe 100644 --- a/automation-roles/40-configure-infra/retrieve-cloud-infra-type/vars/main.yml +++ b/automation-roles/40-configure-infra/retrieve-cloud-infra-type/vars/main.yml @@ -3,6 +3,8 @@ existing_ocp_cloud_infra: storage_class: "ibmc-file-gold" ibm-vpc-gen2: storage_class: "ibmc-vpc" + ibm-satellite: + storage_class: "sat-ocs-ceph" aws: storage_class: "gp" azure-aro: diff --git a/automation-roles/50-install-cloud-pak/cp4d/cp4d-cartridge-remove/tasks/cp4d-delete-cr-instances.yml b/automation-roles/50-install-cloud-pak/cp4d/cp4d-cartridge-remove/tasks/cp4d-delete-cr-instances.yml index 16c8ffde1..351b73238 100644 --- a/automation-roles/50-install-cloud-pak/cp4d/cp4d-cartridge-remove/tasks/cp4d-delete-cr-instances.yml +++ b/automation-roles/50-install-cloud-pak/cp4d/cp4d-cartridge-remove/tasks/cp4d-delete-cr-instances.yml @@ -41,9 +41,10 @@ _p_delete_all_instances: True when: (_current_cartridge_cr.olm_utils_name | default("")) == "dv" -- name: Delete 
all OpenPages instances +- name: Delete all watsonx.ai instances include_role: - name: cp4d-instance-openpages + name: cp4d-instance-watsonx_ai vars: _p_delete_all_instances: True - when: (_current_cartridge_cr.olm_utils_name | default("")) == "openpages" \ No newline at end of file + when: (_current_cartridge_cr.olm_utils_name | default("")) == "watsonx_ai" + diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-dv/templates/dv_instance_40.json.j2 b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-dv/templates/dv_instance_40.json.j2 index c5222077c..1b3b70466 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-dv/templates/dv_instance_40.json.j2 +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-dv/templates/dv_instance_40.json.j2 @@ -12,26 +12,26 @@ "enableHostIPC": "{{ cp4d_dv_instance_enable_host_ipc }}", {% if _storage_type == 'pwx' %} "persistence.storageClass": "portworx-dv-shared-gp3", - "persistence.workerpv.storageClass": "portworx-dv-shared-gp3", + "persistence.auditpv.storageClass": "portworx-dv-shared-gp3", "persistence.cachingpv.storageClass": "portworx-dv-shared-gp3", {% elif _storage_type == 'ocs' %} "persistence.storageClass": "{{ ocp_storage_class_block }}", - "persistence.workerpv.storageClass": "{{ ocp_storage_class_block }}", + "persistence.auditpv.storageClass": "{{ ocp_storage_class_file }}", "persistence.cachingpv.storageClass": "{{ ocp_storage_class_block }}", {% else %} "persistence.storageClass": "{{ ocp_storage_class_file }}", - "persistence.workerpv.storageClass": "{{ ocp_storage_class_file }}", + "persistence.auditpv.storageClass": "{{ ocp_storage_class_file }}", "persistence.cachingpv.storageClass": "{{ ocp_storage_class_file }}", {% endif %} "persistence.size": "{{ cp4d_dv_instance_persistence_storage_size }}", - "persistence.workerpv.size": "{{ cp4d_dv_instance_persistence_compute_storage_size }}", + "persistence.auditpv.size": "{{ 
cp4d_dv_instance_persistence_compute_storage_size }}", "persistence.cachingpv.size": "{{ cp4d_dv_instance_persistence_caching_storage_size }}" }, "resources":{ "cpu":"{{ cp4d_dv_instance_requests_cpu }}", "memory":"{{ cp4d_dv_instance_requests_memory }}" }, - "description":"Data Virtualization", + "description":"Watson Query", "metaData":{ } diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/configure-watsonx_ai-instances.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/configure-watsonx_ai-instances.yml index 696ff4697..9ae069fd2 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/configure-watsonx_ai-instances.yml +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/configure-watsonx_ai-instances.yml @@ -7,4 +7,5 @@ - name: Run script to configure running watsonx.ai models, output is in {{ status_dir }}/log/{{ current_cp4d_cluster.project }}-patch-watsonxaiifm.log shell: | - {{ status_dir }}/cp4d/{{ current_cp4d_cluster.project }}-patch-watsonxaiifm.sh > {{ status_dir }}/log/{{ current_cp4d_cluster.project }}-patch-watsonxaiifm.log \ No newline at end of file + {{ status_dir }}/cp4d/{{ current_cp4d_cluster.project }}-patch-watsonxaiifm.sh > {{ status_dir }}/log/{{ current_cp4d_cluster.project }}-patch-watsonxaiifm.log + register: _patch_watsonxaiifm_result \ No newline at end of file diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml new file mode 100644 index 000000000..f60b98769 --- /dev/null +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instance.yml @@ -0,0 +1,27 @@ +--- +- name: Check if Watsonxaiifm CRD exists + shell: | + oc get crd watsonxaiifm.watsonxaiifm.cpd.ibm.com + failed_when: False + 
register: _watsonxiifm_crd_state + +- name: Delete deployment for watsonx.ai instance {{ _watsonx_ai_instance.model_id }} + shell: | + oc delete deployment \ + -n {{ current_cp4d_cluster.project }} \ + {{ _watsonx_ai_instance.model_id }}-inference-server \ + --ignore-not-found + when: + - _watsonxiifm_crd_state.rc == 0 + - (_watsonx_ai_instance.state | default('')) == 'removed' or _delete_all_watsonx_ai_instances + + +- name: Delete Watsonxaiifm CR if watsonx.ai was removed + shell: + oc delete Watsonxaiifm \ + -n {{ current_cp4d_cluster.project }} \ + watsonxaiifm-cr \ + --ignore-not-found + when: + - _watsonxiifm_crd_state.rc == 0 + - _delete_all_watsonx_ai_instances \ No newline at end of file diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instances.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instances.yml new file mode 100644 index 000000000..ee56b5660 --- /dev/null +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/delete-watsonx_ai-instances.yml @@ -0,0 +1,6 @@ +--- +- name: Loop through each watsonx.ai foundation model inference server to check if it must be deleted + include_tasks: delete-watsonx_ai-instance.yml + loop: "{{ _configured_watsonx_ai_instances }}" + loop_control: + loop_var: _watsonx_ai_instance \ No newline at end of file diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml index 90a8383eb..cf60d44ca 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/main.yml @@ -9,22 +9,27 @@ debug: var: _watsonxai_cartridge -- block: - - set_fact: - _configured_watsonxai_instances: [] - - set_fact: - _configured_watsonxai_instances: "{{ 
_configured_watsonxai_instances + (_watsonxai_cartridge.models | default([])) }}" +- set_fact: + _configured_watsonx_ai_instances: [] + _delete_all_watsonx_ai_instances: False +- set_fact: + _configured_watsonx_ai_instances: "{{ _watsonxai_cartridge.models | default([]) }}" - - include_tasks: configure-watsonx_ai-instances.yml - when: - - (_p_delete_all_instances | default(False)) == False - - (_p_wait_instances | default(False)) == False +- set_fact: + _delete_all_watsonx_ai_instances: True + when: + - _watsonxai_cartridge == {} or (_watsonxai_cartridge.state | default('installed')) == 'removed' - - include_tasks: wait-watsonx_ai-instances.yml - when: - - (_p_delete_all_instances | default(False)) == False - - _p_wait_instances | default(False) +- include_tasks: configure-watsonx_ai-instances.yml + when: + - _delete_all_watsonx_ai_instances == False + - (_p_wait_instances | default(False)) == False + +- include_tasks: delete-watsonx_ai-instances.yml + when: + - (_p_wait_instances | default(False)) == False +- include_tasks: wait-watsonx_ai-instances.yml when: - - _watsonxai_cartridge != {} - - (_watsonxai_cartridge.state | default('installed')) == 'installed' + - _delete_all_watsonx_ai_instances == False + - _p_wait_instances | default(False) \ No newline at end of file diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/wait-watsonx_ai-instances.yml b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/wait-watsonx_ai-instances.yml index 73ac38b97..8398d68e7 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/wait-watsonx_ai-instances.yml +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/tasks/wait-watsonx_ai-instances.yml @@ -1,9 +1,14 @@ --- +- name: Wait for 1 minute to let the operator update the Watsonxaiifm CR if a change was made + pause: + seconds: 60 + when: _patch_watsonxaiifm_result.changed | default(False) + - name: Wait for 
Watsonxaiifm watsonxaiifm-cr to reach Completed status shell: | oc get Watsonxaiifm watsonxaiifm-cr -n {{ current_cp4d_cluster.project }} --output json | jq -r '.status.watsonxaiifmStatus' | grep -i 'completed' | wc -l register: _deployed_watsonxaiifm_status - retries: 30 + retries: 60 delay: 60 until: _deployed_watsonxaiifm_status.stdout == "1" vars: diff --git a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 index 53dcf8b8d..5a9251b9e 100644 --- a/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 +++ b/automation-roles/60-configure-cloud-pak/cp4d/cp4d-instance-watsonx_ai/templates/patch-watsonx_ai-models.j2 @@ -1,9 +1,11 @@ +{% if (_configured_watsonx_ai_instances | default([]) | selectattr('state','match','installed' ) | length) != 0 %} oc patch Watsonxaiifm watsonxaiifm-cr \ -n {{ current_cp4d_cluster.project }} \ --type=merge \ - --patch '{"spec": {"install_model_list": [{% for m in _configured_watsonxai_instances | default([]) -%} -{%- if ((m.state | default('installed')) == 'installed') -%} -{%- if not loop.first -%},{% endif -%} -"{{ m.model_id }}" -{%- endif -%} -{%- endfor -%}] } }' \ No newline at end of file + --patch '{"spec": {"install_model_list": [ {{ '\"' + _configured_watsonx_ai_instances | default([]) | selectattr('state','match','installed' ) | map(attribute='model_id') | join('\",\"') + '\"' }} ] } }' +{% else %} +oc patch Watsonxaiifm watsonxaiifm-cr \ + -n {{ current_cp4d_cluster.project }} \ + --type=merge \ + --patch '{"spec": {"install_model_list": [] } }' +{% endif %} \ No newline at end of file diff --git a/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login-generic.yml b/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login-generic.yml new file 
mode 100644 index 000000000..2e2847ebc --- /dev/null +++ b/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login-generic.yml @@ -0,0 +1,9 @@ +--- +- name: Login to OpenShift cluster if oc login command was found in secret oc-login + shell: | + {{ _oc_login_generic }} + register: _oc_login_result + failed_when: False + retries: "{{ _ocp_login_retries }}" + delay: "{{ _ocp_login_delay }}" + until: _oc_login_result.rc==0 \ No newline at end of file diff --git a/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login-specific.yml b/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login-specific.yml new file mode 100644 index 000000000..3837bb17e --- /dev/null +++ b/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login-specific.yml @@ -0,0 +1,9 @@ +--- +- name: Login to OpenShift cluster if oc login command was found in secret {{ _p_openshift_cluster_name }}-oc-login + shell: | + {{ _oc_login_cluster }} + register: _oc_login_result + failed_when: False + retries: "{{ _ocp_login_retries }}" + delay: "{{ _ocp_login_delay }}" + until: _oc_login_result.rc==0 \ No newline at end of file diff --git a/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login.yml b/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login.yml index f53392ce0..d12a6c38f 100644 --- a/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login.yml +++ b/automation-roles/99-generic/openshift/openshift-login/tasks/existing-ocp-login-oc-login.yml @@ -1,30 +1,15 @@ --- -# Log in using oc login command -- name: Login to OpenShift cluster if oc login command was found in secret {{ _p_openshift_cluster_name }}-oc-login - shell: | - {{ _oc_login_cluster }} - register: _oc_login_result - failed_when: False - retries: "{{ _ocp_login_retries }}" - delay: "{{ _ocp_login_delay }}" - until: 
_oc_login_result.rc==0 +- include_tasks: existing-ocp-login-oc-login-specific.yml when: _oc_login_cluster != '' -- name: Login to OpenShift cluster if oc login command was found in secret oc-login - shell: | - {{ _oc_login_generic }} - register: _oc_login_result - failed_when: False - retries: "{{ _ocp_login_retries }}" - delay: "{{ _ocp_login_delay }}" - until: _oc_login_result.rc==0 +- include_tasks: existing-ocp-login-oc-login-generic.yml when: - _oc_login_generic != '' - _oc_login_cluster == '' - name: Show OpenShift login result debug: - msg: "{{_oc_login_result }}" + var: _oc_login_result - fail: msg: "OpenShift login to cluster {{ _p_openshift_cluster_name }} failed, details: {{ _oc_login_result }}" diff --git a/docs/src/30-reference/configuration/openshift.md b/docs/src/30-reference/configuration/openshift.md index af9e407d4..67e769991 100644 --- a/docs/src/30-reference/configuration/openshift.md +++ b/docs/src/30-reference/configuration/openshift.md @@ -475,6 +475,8 @@ openshift: - example.com dns_servers: - 172.31.2.73:53 + gpu: + install: False mcg: install: True storage_type: storage-class @@ -500,6 +502,8 @@ openshift: | infrastructure.processor_architecture | Architecture of the processor that the OpenShift cluster is deployed on | No | amd64 (default), ppc64le, s390x | | openshift_logging[] | Logging attributes for OpenShift cluster, see [OpenShift logging](logging-auditing.md) | No | | | upstream_dns[] | Upstream DNS servers(s), see [Upstream DNS Servers](./dns.md) | No | | +| gpu | Control Node Feature Discovery and NVIDIA GPU operators | No | | +| gpu.install | Must Node Feature Discovery and NVIDIA GPU operators be installed (Once installed, False does not uninstall) | Yes | True, False | | mcg | Multicloud Object Gateway properties | No | | | mcg.install | Must Multicloud Object Gateway be installed (Once installed, False does not uninstall) | Yes | True, False | | mcg.storage_type | Type of storage supporting the object Noobaa object storage | 
Yes | storage-class | diff --git a/docs/src/80-development/deployer-development-setup.md b/docs/src/80-development/deployer-development-setup.md index ef86d11c4..32ee9f302 100644 --- a/docs/src/80-development/deployer-development-setup.md +++ b/docs/src/80-development/deployer-development-setup.md @@ -80,6 +80,7 @@ gpg --default-new-key-algo rsa4096 --gen-key ``` You will be prompted to specify your user information: + * Real name: Enter your full name * Email address: Your e-mail address that will be used to sign the commits @@ -132,6 +133,7 @@ git config --global user.signingkey A83C67A6D7F71756 ``` Next, add your GPG key to your Git user. + * Go to https://github.com/IBM/cloud-pak-deployer.git * Log in using your public GitHub user * Click on your user at the top right of the pages @@ -153,6 +155,7 @@ git clone https://github.com/IBM/cloud-pak-deployer.git ``` ### Connect VSCode to the development server + * Install the **Remote - SSH** extension in VSCode * Click on the green icon in the lower left of VSCode * Open SSH Config file, choose the one in your home directory @@ -164,6 +167,7 @@ Host nickname_of_your_server ``` Once you have set up this server in the SSH config file, you can connect to it and start remote development. + * Open * Select the `cloud-pak-deployer` directory (this is the cloned repository) * As the directory is a cloned Git repo, VSCode will automatically open the default branch diff --git a/sample-configurations/sample-dynamic/config-samples/cp4d-480.yaml b/sample-configurations/sample-dynamic/config-samples/cp4d-480.yaml index edbaca64e..15faa743c 100644 --- a/sample-configurations/sample-dynamic/config-samples/cp4d-480.yaml +++ b/sample-configurations/sample-dynamic/config-samples/cp4d-480.yaml @@ -222,6 +222,8 @@ cp4d: # noobaa_cert_secret: noobaa-s3-serving-cert state: removed + # Please note that for watsonx.ai foundation models, you need to install the + # Node Feature Discovery and NVIDIA GPU operators. 
You can do so by setting the openshift.gpu.install property to True - name: watsonx_ai description: watsonx.ai state: removed diff --git a/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp-auto.yaml b/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp-auto.yaml index e37b9a3e5..0ce26f55a 100644 --- a/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp-auto.yaml +++ b/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp-auto.yaml @@ -14,6 +14,8 @@ openshift: install: True storage_type: storage-class storage_class: managed-nfs-storage + gpu: + install: False openshift_storage: - storage_name: auto-storage storage_type: auto diff --git a/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp-satellite-ocs.yaml b/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp-satellite-ocs.yaml index d5b007355..f793a9311 100644 --- a/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp-satellite-ocs.yaml +++ b/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp-satellite-ocs.yaml @@ -18,6 +18,8 @@ openshift: domain_name: example.com infrastructure: type: ibm-roks + gpu: + install: False mcg: install: False storage_type: storage-class diff --git a/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp.yaml b/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp.yaml index fbe5739df..866972e53 100644 --- a/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp.yaml +++ b/sample-configurations/sample-dynamic/config-samples/ocp-existing-ocp.yaml @@ -10,6 +10,8 @@ openshift: ocp_version: 4.8 cluster_name: "{{ env_id }}" domain_name: example.com + gpu: + install: False mcg: install: False storage_type: storage-class diff --git a/sample-configurations/sample-dynamic/config-samples/ocp-existing-roks-classic.yaml b/sample-configurations/sample-dynamic/config-samples/ocp-existing-roks-classic.yaml index 
44d515869..80111a2ea 100644 --- a/sample-configurations/sample-dynamic/config-samples/ocp-existing-roks-classic.yaml +++ b/sample-configurations/sample-dynamic/config-samples/ocp-existing-roks-classic.yaml @@ -10,6 +10,8 @@ openshift: ocp_version: 4.8 cluster_name: "{{ env_id }}" domain_name: example.com + gpu: + install: False mcg: install: True storage_type: storage-class diff --git a/sample-configurations/sample-dynamic/config-samples/watsonx-480.yaml b/sample-configurations/sample-dynamic/config-samples/watsonx-480.yaml index bafe1756d..b1c81e273 100644 --- a/sample-configurations/sample-dynamic/config-samples/watsonx-480.yaml +++ b/sample-configurations/sample-dynamic/config-samples/watsonx-480.yaml @@ -25,11 +25,10 @@ cp4d: # All tested cartridges. To install, change the "state" property to "installed". To uninstall, change the state # to "removed" or comment out the entire cartridge. Make sure that the "-" and properties are aligned with the lite # cartridge; the "-" is at position 3 and the property starts at position 5. -# -# If a cartridge has dependencies and you want to install it, you must ensure that the dependent cartridge is also -# installed. # + # Please note that for watsonx.ai foundation models, you need to install the + # Node Feature Discovery and NVIDIA GPU operators. You can do so by setting the openshift.gpu.install property to True - name: watsonx_ai description: watsonx.ai state: removed