From 087df7d2c862b0c5beb35cbe3ee43b0c1ee99a7f Mon Sep 17 00:00:00 2001
From: Itzhak Kave
Date: Wed, 29 May 2024 14:29:54 +0300
Subject: [PATCH] Create new test file for the nodes maintenance test scenarios when using a Provider mode (#9528)

* Create new test file for the nodes maintenance test scenarios when using a Provider mode

Signed-off-by: Itzhak Kave
---
 ocs_ci/ocs/cluster.py                         |  14 +-
 ocs_ci/ocs/constants.py                       |   5 +
 ocs_ci/ocs/scale_lib.py                       |   8 +-
 .../test_nodes_maintenance_provider_mode.py   | 169 ++++++++++++++++++
 4 files changed, 192 insertions(+), 4 deletions(-)
 create mode 100644 tests/functional/z_cluster/nodes/test_nodes_maintenance_provider_mode.py

diff --git a/ocs_ci/ocs/cluster.py b/ocs_ci/ocs/cluster.py
index 3078a3700a6..52f6d34ad74 100644
--- a/ocs_ci/ocs/cluster.py
+++ b/ocs_ci/ocs/cluster.py
@@ -57,7 +57,10 @@
 from ocs_ci.ocs.resources.pvc import PVC
 from ocs_ci.utility.connection import Connection
 from ocs_ci.utility.lvmo_utils import get_lvm_cluster_name
-from ocs_ci.ocs.resources.pod import get_mds_pods, wait_for_pods_to_be_running
+from ocs_ci.ocs.resources.pod import (
+    get_mds_pods,
+    wait_for_pods_to_be_in_statuses,
+)
 from ocs_ci.utility.decorators import switch_to_orig_index_at_last
 
 logger = logging.getLogger(__name__)
@@ -3235,7 +3238,14 @@ def client_cluster_health_check():
     )
 
     logger.info("Wait for the pods to be running")
-    res = wait_for_pods_to_be_running(timeout=300, sleep=20)
+    expected_statuses = [constants.STATUS_RUNNING, constants.STATUS_COMPLETED]
+    exclude_pod_name_prefixes = ["rook-ceph-tools"]
+    res = wait_for_pods_to_be_in_statuses(
+        expected_statuses=expected_statuses,
+        exclude_pod_name_prefixes=exclude_pod_name_prefixes,
+        timeout=300,
+        sleep=20,
+    )
     if not res:
         raise ResourceWrongStatusException("Not all the pods in running state")
 
diff --git a/ocs_ci/ocs/constants.py b/ocs_ci/ocs/constants.py
index 98b3c8f173f..631579e2043 100644
--- a/ocs_ci/ocs/constants.py
+++ b/ocs_ci/ocs/constants.py
@@ -323,6 +323,7 @@
 
 DEFAULT_CLUSTERNAME = DEFAULT_STORAGE_CLUSTER = "ocs-storagecluster"
 DEFAULT_CLUSTERNAME_EXTERNAL_MODE = "ocs-external-storagecluster"
+DEFAULT_CLUSTERNAME_CLIENT = "storage-client"
 DEFAULT_BLOCKPOOL = f"{DEFAULT_CLUSTERNAME}-cephblockpool"
 METADATA_POOL = f"{DEFAULT_CLUSTERNAME}-cephfilesystem-metadata"
 DATA_POOL = f"{DEFAULT_CLUSTERNAME}-cephfilesystem-data0"
@@ -363,6 +364,10 @@
     f"{DEFAULT_CLUSTERNAME_EXTERNAL_MODE}-ceph-rbd-thick"
 )
 
+# Default StorageClass for Provider-mode
+DEFAULT_STORAGECLASS_CLIENT_CEPHFS = f"{DEFAULT_CLUSTERNAME_CLIENT}-cephfs"
+DEFAULT_STORAGECLASS_CLIENT_RBD = f"{DEFAULT_CLUSTERNAME_CLIENT}-ceph-rbd"
+
 # Default VolumeSnapshotClass
 DEFAULT_VOLUMESNAPSHOTCLASS_CEPHFS = f"{DEFAULT_CLUSTERNAME}-cephfsplugin-snapclass"
 DEFAULT_VOLUMESNAPSHOTCLASS_RBD = f"{DEFAULT_CLUSTERNAME}-rbdplugin-snapclass"
diff --git a/ocs_ci/ocs/scale_lib.py b/ocs_ci/ocs/scale_lib.py
index f1ab57d9c87..0d17cedffcc 100644
--- a/ocs_ci/ocs/scale_lib.py
+++ b/ocs_ci/ocs/scale_lib.py
@@ -124,8 +124,12 @@ def create_multi_pvc_pod(
         raise UnexpectedBehaviour("Kube_job batch count should be lesser than 1200")
 
     logger.info(f"Start creating {pvc_count} PVC of 2 types RBD-RWO & FS-RWX")
-    cephfs_sc_obj = constants.DEFAULT_STORAGECLASS_CEPHFS
-    rbd_sc_obj = constants.DEFAULT_STORAGECLASS_RBD
+    if is_hci_cluster():
+        cephfs_sc_obj = constants.DEFAULT_STORAGECLASS_CLIENT_CEPHFS
+        rbd_sc_obj = constants.DEFAULT_STORAGECLASS_CLIENT_RBD
+    else:
+        cephfs_sc_obj = constants.DEFAULT_STORAGECLASS_CEPHFS
+        rbd_sc_obj = constants.DEFAULT_STORAGECLASS_RBD
 
     # Get pvc_dict_list, append all the pvc.yaml dict to pvc_dict_list
     rbd_pvc_dict_list, cephfs_pvc_dict_list = ([], [])
diff --git a/tests/functional/z_cluster/nodes/test_nodes_maintenance_provider_mode.py b/tests/functional/z_cluster/nodes/test_nodes_maintenance_provider_mode.py
new file mode 100644
index 00000000000..83066db9ee9
--- /dev/null
+++ b/tests/functional/z_cluster/nodes/test_nodes_maintenance_provider_mode.py
@@ -0,0 +1,169 @@
+import logging
+import pytest
+import random
+import time
+
+
+from ocs_ci.framework.pytest_customization.marks import brown_squad
+from ocs_ci.framework.testlib import (
+    tier4a,
+    tier4b,
+    ignore_leftovers,
+    ManageTest,
+    provider_client_platform_required,
+)
+from ocs_ci.ocs import constants
+from ocs_ci.ocs.constants import HCI_PROVIDER
+from ocs_ci.ocs.node import (
+    get_node_objs,
+    recover_node_to_ready_state,
+    wait_for_nodes_status,
+    get_nodes,
+    drain_nodes,
+    schedule_nodes,
+)
+from ocs_ci.helpers.sanity_helpers import SanityProviderMode
+from ocs_ci.ocs.cluster import (
+    ceph_health_check,
+)
+from ocs_ci.framework import config
+from ocs_ci.utility.utils import switch_to_correct_cluster_at_setup
+
+logger = logging.getLogger(__name__)
+
+
+def check_drain_and_unschedule_node(ocp_node):
+    """
+    Drain and unschedule a node, then reschedule it and verify that the Ceph health is OK
+
+    Args:
+        ocp_node (OCS): The node object
+
+    Raises:
+        ResourceWrongStatusException: In case the node didn't reach the desired state
+
+    """
+    drain_nodes([ocp_node.name])
+    # Wait for the node to be unscheduled
+    wait_for_nodes_status(
+        node_names=[ocp_node.name],
+        status=constants.NODE_READY_SCHEDULING_DISABLED,
+        timeout=120,
+        sleep=5,
+    )
+
+    wait_time_before_reschedule = 30
+    logger.info(
+        f"Wait {wait_time_before_reschedule} seconds before rescheduling the node"
+    )
+    time.sleep(wait_time_before_reschedule)
+
+    schedule_nodes([ocp_node.name])
+    wait_for_nodes_status(
+        node_names=[ocp_node.name],
+        status=constants.NODE_READY,
+        timeout=120,
+        sleep=5,
+    )
+    logger.info("Checking that the Ceph health is OK")
+    ceph_health_check()
+
+
+@brown_squad
+@ignore_leftovers
+@provider_client_platform_required
+class TestNodesMaintenanceProviderMode(ManageTest):
+    """
+    Test node maintenance scenarios when using a Provider mode
+    """
+
+    @pytest.fixture(autouse=True)
+    def setup(self, request, create_scale_pods_and_pvcs_using_kube_job_on_hci_clients):
+        """
+        1. Save the original index
+        2. Switch to the correct cluster index
+        3. Initialize the Sanity instance
+
+        """
+        self.orig_index = config.cur_index
+        switch_to_correct_cluster_at_setup(request)
+        self.sanity_helpers = SanityProviderMode(
+            create_scale_pods_and_pvcs_using_kube_job_on_hci_clients
+        )
+
+    @pytest.fixture(autouse=True)
+    def teardown(self, request):
+        """
+        1. Make sure all nodes are up again
+        2. Switch to the original cluster index
+        3. Check the Ceph health
+
+        """
+
+        def finalizer():
+            ocp_nodes = get_node_objs()
+            for n in ocp_nodes:
+                recover_node_to_ready_state(n)
+
+            logger.info("Switch to the original cluster index")
+            config.switch_ctx(self.orig_index)
+            ceph_health_check()
+
+        request.addfinalizer(finalizer)
+
+    @tier4a
+    @pytest.mark.parametrize(
+        argnames=["cluster_type", "node_type"],
+        argvalues=[
+            pytest.param(
+                *[HCI_PROVIDER, constants.WORKER_MACHINE],
+                marks=pytest.mark.polarion_id("OCS-5461"),
+            ),
+            pytest.param(
+                *[HCI_PROVIDER, constants.MASTER_MACHINE],
+                marks=pytest.mark.polarion_id("OCS-5462"),
+            ),
+        ],
+    )
+    def test_node_maintenance(self, cluster_type, node_type):
+        """
+        Test node maintenance
+
+        """
+        ocp_nodes = get_nodes(node_type=node_type)
+        ocp_node = random.choice(ocp_nodes)
+        check_drain_and_unschedule_node(ocp_node)
+        logger.info(
+            "Check basic cluster functionality by creating resources, running IO, "
+            "and deleting the resources"
+        )
+        self.sanity_helpers.create_resources_on_clients()
+        self.sanity_helpers.delete_resources()
+        logger.info("Check the cluster health")
+        self.sanity_helpers.health_check_provider_mode()
+
+    @tier4b
+    @pytest.mark.polarion_id("OCS-5466")
+    @pytest.mark.parametrize(
+        argnames=["cluster_type", "node_type"],
+        argvalues=[
+            pytest.param(*[HCI_PROVIDER, constants.WORKER_MACHINE]),
+        ],
+    )
+    def test_rolling_nodes_maintenance(self, cluster_type, node_type):
+        """
+        Perform maintenance on the nodes one after the other and check the health status in between
+
+        """
+        ocp_nodes = get_nodes(node_type=node_type)
+        for ocp_node in ocp_nodes:
+            check_drain_and_unschedule_node(ocp_node)
+
+        logger.info(
+            "Check basic cluster functionality by creating resources, running IO, "
+            "and deleting the resources"
+        )
+        self.sanity_helpers.create_resources_on_clients()
+        self.sanity_helpers.delete_resources()
+        logger.info("Check the cluster health")
+        self.sanity_helpers.health_check_provider_mode()
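
A note on the client_cluster_health_check() change above: replacing wait_for_pods_to_be_running with wait_for_pods_to_be_in_statuses lets the health check accept Completed pods and skip the rook-ceph-tools pod, which would otherwise break the "all pods Running" assumption on provider/client clusters. The selection rule the patch relies on can be sketched as follows; this is an illustrative sketch only, not the ocs-ci implementation, and the pod_statuses mapping and helper name are hypothetical stand-ins for the live pod query.

# Illustrative sketch -- shows the filtering rule, not the real ocs-ci helper.
def pods_in_expected_statuses(pod_statuses, expected_statuses, exclude_pod_name_prefixes):
    """
    pod_statuses: hypothetical dict mapping pod name -> status string.
    Returns True if every non-excluded pod is in one of the expected statuses.
    """
    for name, status in pod_statuses.items():
        # Pods whose names start with an excluded prefix (e.g. "rook-ceph-tools")
        # are ignored entirely.
        if any(name.startswith(prefix) for prefix in exclude_pod_name_prefixes):
            continue
        if status not in expected_statuses:
            return False
    return True


if __name__ == "__main__":
    sample = {
        "rook-ceph-osd-0-6c8b": "Running",
        "rook-ceph-mon-a-7d9f": "Running",
        "rook-ceph-osd-prepare-0": "Completed",  # accepted because Completed is expected
        "rook-ceph-tools-5f6d": "Pending",       # excluded by prefix, so ignored
    }
    print(pods_in_expected_statuses(sample, ["Running", "Completed"], ["rook-ceph-tools"]))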
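Similarly, check_drain_and_unschedule_node() builds on drain_nodes() and schedule_nodes() from ocs_ci.ocs.node, which correspond to the standard OpenShift cordon/drain/uncordon operations. Roughly, and assuming the usual oc CLI flags used for ODF worker nodes, the sequence matches the sketch below; it is not what ocs-ci executes internally, and the function name and wait_between default are illustrative (the 30-second pause mirrors the test).

# Rough CLI-level equivalent of the drain/reschedule flow -- a sketch only.
import subprocess
import time


def drain_and_reschedule(node_name, wait_between=30):
    # Cordon the node and evict its pods; --ignore-daemonsets and
    # --delete-emptydir-data are typically required on storage nodes.
    subprocess.run(
        ["oc", "adm", "drain", node_name, "--force",
         "--ignore-daemonsets", "--delete-emptydir-data"],
        check=True,
    )
    # At this point the node reports Ready,SchedulingDisabled.
    time.sleep(wait_between)
    # Mark the node schedulable again.
    subprocess.run(["oc", "adm", "uncordon", node_name], check=True)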