From 087df7d2c862b0c5beb35cbe3ee43b0c1ee99a7f Mon Sep 17 00:00:00 2001
From: Itzhak Kave
Date: Wed, 29 May 2024 14:29:54 +0300
Subject: [PATCH] Create new test file for the nodes maintenance test scenarios when using a Provider mode (#9528)

* Create new test file for the nodes maintenance test scenarios when using a Provider mode

Signed-off-by: Itzhak Kave
---
 ocs_ci/ocs/cluster.py                         |  14 +-
 ocs_ci/ocs/constants.py                       |   5 +
 ocs_ci/ocs/scale_lib.py                       |   8 +-
 .../test_nodes_maintenance_provider_mode.py   | 169 ++++++++++++++++++
 4 files changed, 192 insertions(+), 4 deletions(-)
 create mode 100644 tests/functional/z_cluster/nodes/test_nodes_maintenance_provider_mode.py

diff --git a/ocs_ci/ocs/cluster.py b/ocs_ci/ocs/cluster.py
index 3078a3700a6..52f6d34ad74 100644
--- a/ocs_ci/ocs/cluster.py
+++ b/ocs_ci/ocs/cluster.py
@@ -57,7 +57,10 @@
 from ocs_ci.ocs.resources.pvc import PVC
 from ocs_ci.utility.connection import Connection
 from ocs_ci.utility.lvmo_utils import get_lvm_cluster_name
-from ocs_ci.ocs.resources.pod import get_mds_pods, wait_for_pods_to_be_running
+from ocs_ci.ocs.resources.pod import (
+    get_mds_pods,
+    wait_for_pods_to_be_in_statuses,
+)
 from ocs_ci.utility.decorators import switch_to_orig_index_at_last
 
 logger = logging.getLogger(__name__)
@@ -3235,7 +3238,14 @@ def client_cluster_health_check():
     )
 
     logger.info("Wait for the pods to be running")
-    res = wait_for_pods_to_be_running(timeout=300, sleep=20)
+    expected_statuses = [constants.STATUS_RUNNING, constants.STATUS_COMPLETED]
+    exclude_pod_name_prefixes = ["rook-ceph-tools"]
+    res = wait_for_pods_to_be_in_statuses(
+        expected_statuses=expected_statuses,
+        exclude_pod_name_prefixes=exclude_pod_name_prefixes,
+        timeout=300,
+        sleep=20,
+    )
     if not res:
         raise ResourceWrongStatusException("Not all the pods in running state")
 
diff --git a/ocs_ci/ocs/constants.py b/ocs_ci/ocs/constants.py
index 98b3c8f173f..631579e2043 100644
--- a/ocs_ci/ocs/constants.py
+++ b/ocs_ci/ocs/constants.py
@@ -323,6 +323,7 @@
 
 DEFAULT_CLUSTERNAME = DEFAULT_STORAGE_CLUSTER = "ocs-storagecluster"
 DEFAULT_CLUSTERNAME_EXTERNAL_MODE = "ocs-external-storagecluster"
+DEFAULT_CLUSTERNAME_CLIENT = "storage-client"
 DEFAULT_BLOCKPOOL = f"{DEFAULT_CLUSTERNAME}-cephblockpool"
 METADATA_POOL = f"{DEFAULT_CLUSTERNAME}-cephfilesystem-metadata"
 DATA_POOL = f"{DEFAULT_CLUSTERNAME}-cephfilesystem-data0"
@@ -363,6 +364,10 @@
     f"{DEFAULT_CLUSTERNAME_EXTERNAL_MODE}-ceph-rbd-thick"
 )
 
+# Default StorageClass for Provider-mode
+DEFAULT_STORAGECLASS_CLIENT_CEPHFS = f"{DEFAULT_CLUSTERNAME_CLIENT}-cephfs"
+DEFAULT_STORAGECLASS_CLIENT_RBD = f"{DEFAULT_CLUSTERNAME_CLIENT}-ceph-rbd"
+
 # Default VolumeSnapshotClass
 DEFAULT_VOLUMESNAPSHOTCLASS_CEPHFS = f"{DEFAULT_CLUSTERNAME}-cephfsplugin-snapclass"
 DEFAULT_VOLUMESNAPSHOTCLASS_RBD = f"{DEFAULT_CLUSTERNAME}-rbdplugin-snapclass"
diff --git a/ocs_ci/ocs/scale_lib.py b/ocs_ci/ocs/scale_lib.py
index f1ab57d9c87..0d17cedffcc 100644
--- a/ocs_ci/ocs/scale_lib.py
+++ b/ocs_ci/ocs/scale_lib.py
@@ -124,8 +124,12 @@ def create_multi_pvc_pod(
         raise UnexpectedBehaviour("Kube_job batch count should be lesser than 1200")
 
     logger.info(f"Start creating {pvc_count} PVC of 2 types RBD-RWO & FS-RWX")
-    cephfs_sc_obj = constants.DEFAULT_STORAGECLASS_CEPHFS
-    rbd_sc_obj = constants.DEFAULT_STORAGECLASS_RBD
+    if is_hci_cluster():
+        cephfs_sc_obj = constants.DEFAULT_STORAGECLASS_CLIENT_CEPHFS
+        rbd_sc_obj = constants.DEFAULT_STORAGECLASS_CLIENT_RBD
+    else:
+        cephfs_sc_obj = constants.DEFAULT_STORAGECLASS_CEPHFS
+        rbd_sc_obj = constants.DEFAULT_STORAGECLASS_RBD
 
     # Get pvc_dict_list, append all the pvc.yaml dict to pvc_dict_list
     rbd_pvc_dict_list, cephfs_pvc_dict_list = ([], [])
diff --git a/tests/functional/z_cluster/nodes/test_nodes_maintenance_provider_mode.py b/tests/functional/z_cluster/nodes/test_nodes_maintenance_provider_mode.py
new file mode 100644
index 00000000000..83066db9ee9
--- /dev/null
+++ b/tests/functional/z_cluster/nodes/test_nodes_maintenance_provider_mode.py
@@ -0,0 +1,169 @@
+import logging
+import pytest
+import random
+import time
+
+
+from ocs_ci.framework.pytest_customization.marks import brown_squad
+from ocs_ci.framework.testlib import (
+    tier4a,
+    tier4b,
+    ignore_leftovers,
+    ManageTest,
+    provider_client_platform_required,
+)
+from ocs_ci.ocs import constants
+from ocs_ci.ocs.constants import HCI_PROVIDER
+from ocs_ci.ocs.node import (
+    get_node_objs,
+    recover_node_to_ready_state,
+    wait_for_nodes_status,
+    get_nodes,
+    drain_nodes,
+    schedule_nodes,
+)
+from ocs_ci.helpers.sanity_helpers import SanityProviderMode
+from ocs_ci.ocs.cluster import (
+    ceph_health_check,
+)
+from ocs_ci.framework import config
+from ocs_ci.utility.utils import switch_to_correct_cluster_at_setup
+
+logger = logging.getLogger(__name__)
+
+
+def check_drain_and_unschedule_node(ocp_node):
+    """
+    Drain and unschedule a node, then reschedule it and verify that the Ceph health is OK
+
+    Args:
+        ocp_node (OCS): The node object
+
+    Raises:
+        ResourceWrongStatusException: In case the node didn't reach the desired state
+
+    """
+    drain_nodes([ocp_node.name])
+    # Wait for the node to be unscheduled
+    wait_for_nodes_status(
+        node_names=[ocp_node.name],
+        status=constants.NODE_READY_SCHEDULING_DISABLED,
+        timeout=120,
+        sleep=5,
+    )
+
+    wait_time_before_reschedule = 30
+    logger.info(
+        f"Wait {wait_time_before_reschedule} seconds before rescheduling the node"
+    )
+    time.sleep(wait_time_before_reschedule)
+
+    schedule_nodes([ocp_node.name])
+    wait_for_nodes_status(
+        node_names=[ocp_node.name],
+        status=constants.NODE_READY,
+        timeout=120,
+        sleep=5,
+    )
+    logger.info("Checking that the Ceph health is OK")
+    ceph_health_check()
+
+
+@brown_squad
+@ignore_leftovers
+@provider_client_platform_required
+class TestNodesMaintenanceProviderMode(ManageTest):
+    """
+    Test node maintenance scenarios when using a Provider mode
+    """
+
+    @pytest.fixture(autouse=True)
+    def setup(self, request, create_scale_pods_and_pvcs_using_kube_job_on_hci_clients):
+        """
+        1. Save the original index
+        2. Switch to the correct cluster index
+        3. Initialize the Sanity instance
+
+        """
+        self.orig_index = config.cur_index
+        switch_to_correct_cluster_at_setup(request)
+        self.sanity_helpers = SanityProviderMode(
+            create_scale_pods_and_pvcs_using_kube_job_on_hci_clients
+        )
+
+    @pytest.fixture(autouse=True)
+    def teardown(self, request):
+        """
+        1. Make sure all nodes are up again
+        2. Switch to the original cluster index
+        3. Check the Ceph health
+
+        """
+
+        def finalizer():
+            ocp_nodes = get_node_objs()
+            for n in ocp_nodes:
+                recover_node_to_ready_state(n)
+
+            logger.info("Switch to the original cluster index")
+            config.switch_ctx(self.orig_index)
+            ceph_health_check()
+
+        request.addfinalizer(finalizer)
+
+    @tier4a
+    @pytest.mark.parametrize(
+        argnames=["cluster_type", "node_type"],
+        argvalues=[
+            pytest.param(
+                *[HCI_PROVIDER, constants.WORKER_MACHINE],
+                marks=pytest.mark.polarion_id("OCS-5461"),
+            ),
+            pytest.param(
+                *[HCI_PROVIDER, constants.MASTER_MACHINE],
+                marks=pytest.mark.polarion_id("OCS-5462"),
+            ),
+        ],
+    )
+    def test_node_maintenance(self, cluster_type, node_type):
+        """
+        Test node maintenance
+
+        """
+        ocp_nodes = get_nodes(node_type=node_type)
+        ocp_node = random.choice(ocp_nodes)
+        check_drain_and_unschedule_node(ocp_node)
+        logger.info(
+            "Check basic cluster functionality by creating resources, running IO, "
+            "and deleting the resources"
+        )
+        self.sanity_helpers.create_resources_on_clients()
+        self.sanity_helpers.delete_resources()
+        logger.info("Check the cluster health")
+        self.sanity_helpers.health_check_provider_mode()
+
+    @tier4b
+    @pytest.mark.polarion_id("OCS-5466")
+    @pytest.mark.parametrize(
+        argnames=["cluster_type", "node_type"],
+        argvalues=[
+            pytest.param(*[HCI_PROVIDER, constants.WORKER_MACHINE]),
+        ],
+    )
+    def test_rolling_nodes_maintenance(self, cluster_type, node_type):
+        """
+        Perform maintenance on the nodes one after the other and check the health status in between
+
+        """
+        ocp_nodes = get_nodes(node_type=node_type)
+        for ocp_node in ocp_nodes:
+            check_drain_and_unschedule_node(ocp_node)
+
+        logger.info(
+            "Check basic cluster functionality by creating resources, running IO, "
+            "and deleting the resources"
+        )
+        self.sanity_helpers.create_resources_on_clients()
+        self.sanity_helpers.delete_resources()
+        logger.info("Check the cluster health")
+        self.sanity_helpers.health_check_provider_mode()
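
A note on the client_cluster_health_check() change above: replacing wait_for_pods_to_be_running with wait_for_pods_to_be_in_statuses lets the health check accept Completed pods and skip the rook-ceph-tools pod, which would otherwise break the "all pods Running" assumption on provider/client clusters. The selection rule the patch relies on can be sketched as follows; this is an illustrative sketch only, not the ocs-ci implementation, and the pod_statuses mapping and helper name are hypothetical stand-ins for the live pod query.

# Illustrative sketch -- shows the filtering rule, not the real ocs-ci helper.
def pods_in_expected_statuses(pod_statuses, expected_statuses, exclude_pod_name_prefixes):
    """
    pod_statuses: hypothetical dict mapping pod name -> status string.
    Returns True if every non-excluded pod is in one of the expected statuses.
    """
    for name, status in pod_statuses.items():
        # Pods whose names start with an excluded prefix (e.g. "rook-ceph-tools")
        # are ignored entirely.
        if any(name.startswith(prefix) for prefix in exclude_pod_name_prefixes):
            continue
        if status not in expected_statuses:
            return False
    return True


if __name__ == "__main__":
    sample = {
        "rook-ceph-osd-0-6c8b": "Running",
        "rook-ceph-mon-a-7d9f": "Running",
        "rook-ceph-osd-prepare-0": "Completed",  # accepted because Completed is expected
        "rook-ceph-tools-5f6d": "Pending",       # excluded by prefix, so ignored
    }
    print(pods_in_expected_statuses(sample, ["Running", "Completed"], ["rook-ceph-tools"]))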
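Similarly, check_drain_and_unschedule_node() builds on drain_nodes() and schedule_nodes() from ocs_ci.ocs.node, which correspond to the standard OpenShift cordon/drain/uncordon operations. Roughly, and assuming the usual oc CLI flags used for ODF worker nodes, the sequence matches the sketch below; it is not what ocs-ci executes internally, and the function name and wait_between default are illustrative (the 30-second pause mirrors the test).

# Rough CLI-level equivalent of the drain/reschedule flow -- a sketch only.
import subprocess
import time


def drain_and_reschedule(node_name, wait_between=30):
    # Cordon the node and evict its pods; --ignore-daemonsets and
    # --delete-emptydir-data are typically required on storage nodes.
    subprocess.run(
        ["oc", "adm", "drain", node_name, "--force",
         "--ignore-daemonsets", "--delete-emptydir-data"],
        check=True,
    )
    # At this point the node reports Ready,SchedulingDisabled.
    time.sleep(wait_between)
    # Mark the node schedulable again.
    subprocess.run(["oc", "adm", "uncordon", node_name], check=True)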