From efd670de376ac37d1a3040ff36729c1670486f38 Mon Sep 17 00:00:00 2001 From: Aleksandr Bykov Date: Wed, 23 Oct 2024 21:43:45 +0700 Subject: [PATCH] feature(nemesis): Nemeses to work with zero nodes Add new nemeses: - Add/remove zero nodes to/from the cluster - Set kill scylla to run on both data and zero nodes - Update terminate and replace to run for data or zero node respectively Add new config yamls and new job to run multi-DC config with zero nodes --- .../zero_nodes_nemesis_set.yaml | 3 ++ ...rotoken_node_i4ilarge_1_num_single_dc.yaml | 3 ++ ...ti-dc-rack-aware-with-znode-dc.jenkinsfile | 11 +++++ sdcm/nemesis.py | 40 +++++++++++++++++++ ...i-dc-rack-aware-with-znode-in-diff_dc.yaml | 25 ++++++++++++ unit_tests/test_nemesis_sisyphus.py | 2 +- 6 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 configurations/zerotoken_nodes/zero_nodes_nemesis_set.yaml create mode 100644 configurations/zerotoken_nodes/zerotoken_node_i4ilarge_1_num_single_dc.yaml create mode 100644 jenkins-pipelines/oss/longevity/longevity-multi-dc-rack-aware-with-znode-dc.jenkinsfile create mode 100644 test-cases/longevity/longevity-multi-dc-rack-aware-with-znode-in-diff_dc.yaml diff --git a/configurations/zerotoken_nodes/zero_nodes_nemesis_set.yaml b/configurations/zerotoken_nodes/zero_nodes_nemesis_set.yaml new file mode 100644 index 0000000000..edc7e4a030 --- /dev/null +++ b/configurations/zerotoken_nodes/zero_nodes_nemesis_set.yaml @@ -0,0 +1,3 @@ +nemesis_class_name: 'ZeroTokenSetMonkey' +use_zero_nodes: true +zero_token_instance_type_db: 'i4i.large' diff --git a/configurations/zerotoken_nodes/zerotoken_node_i4ilarge_1_num_single_dc.yaml b/configurations/zerotoken_nodes/zerotoken_node_i4ilarge_1_num_single_dc.yaml new file mode 100644 index 0000000000..3216181856 --- /dev/null +++ b/configurations/zerotoken_nodes/zerotoken_node_i4ilarge_1_num_single_dc.yaml @@ -0,0 +1,3 @@ +n_db_zero_token_nodes: 1 +zero_token_instance_type_db: 'i4i.large' +use_zero_nodes: true diff --git 
a/jenkins-pipelines/oss/longevity/longevity-multi-dc-rack-aware-with-znode-dc.jenkinsfile b/jenkins-pipelines/oss/longevity/longevity-multi-dc-rack-aware-with-znode-dc.jenkinsfile new file mode 100644 index 0000000000..4ff8c71e1c --- /dev/null +++ b/jenkins-pipelines/oss/longevity/longevity-multi-dc-rack-aware-with-znode-dc.jenkinsfile @@ -0,0 +1,11 @@ +#!groovy + +// trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43 +def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm) + +longevityPipeline( + backend: 'aws', + region: '''["eu-west-1", "eu-west-2", "eu-north-1"]''', + test_name: 'longevity_test.LongevityTest.test_custom_time', + test_config: '''["test-cases/longevity/longevity-multi-dc-rack-aware-with-znode-in-diff_dc.yaml"]''', +) diff --git a/sdcm/nemesis.py b/sdcm/nemesis.py index 3d3bb38516..1094f5f012 100644 --- a/sdcm/nemesis.py +++ b/sdcm/nemesis.py @@ -5233,6 +5233,19 @@ def disrupt_disable_binary_gossip_execute_major_compaction(self): self.target_node.restart_scylla_server() raise + @target_all_nodes + def disrupt_grow_shrink_zero_nodes(self): + """Add/remove znodes in the same dc as the target node. 
The target node could be any node""" + if not self.cluster.params.get('use_zero_nodes'): + raise UnsupportedNemesis("The zero tokens support is not enabled") + + duration_with_znode = 300 + new_znode = self._add_and_init_new_cluster_nodes(count=1, is_zero_node=True)[0] + self.log.debug("Run with zero-token node %s for %ds", new_znode.name, duration_with_znode) + time.sleep(duration_with_znode) + znode = random.choice([node for node in self.cluster.zero_nodes if node.dc_idx == self.target_node.dc_idx]) + self.decommission_nodes(nodes=[znode]) + def disrupt_method_wrapper(method, is_exclusive=False): # pylint: disable=too-many-statements # noqa: PLR0915 """ @@ -5494,6 +5507,7 @@ class StopWaitStartMonkey(Nemesis): disruptive = True kubernetes = True limited = True + zero_node_changes = True def disrupt(self): self.disrupt_stop_wait_start_scylla_server(600) @@ -6127,6 +6141,7 @@ class NodeTerminateAndReplace(Nemesis): # While on kubernetes we put it all on scylla-operator kubernetes = False topology_changes = True + zero_node_changes = True def disrupt(self): self.disrupt_terminate_and_replace_node() @@ -6752,3 +6767,28 @@ class EndOfQuotaNemesis(Nemesis): def disrupt(self): self.disrupt_end_of_quota_nemesis() + + +class GrowShrinkZeroTokenNode(Nemesis): + + disruptive = True + schema_changes = False + free_tier_set = False + zero_node_changes = True + + def disrupt(self): + self.disrupt_grow_shrink_zero_nodes() + + +class ZeroTokenSetMonkey(SisyphusMonkey): + """Nemesis set for testing Scylla with configured zero nodes + + Disruptions that can be caused by random failures and user actions with + zero node configured + """ + + def __init__(self, *args, **kwargs): + super(SisyphusMonkey, self).__init__(*args, **kwargs) # pylint: disable=bad-super-call + self.use_all_nodes_as_target = True + self.build_list_of_disruptions_to_execute(nemesis_selector=['zero_node_changes']) + self.shuffle_list_of_disruptions() diff --git 
a/test-cases/longevity/longevity-multi-dc-rack-aware-with-znode-in-diff_dc.yaml b/test-cases/longevity/longevity-multi-dc-rack-aware-with-znode-in-diff_dc.yaml new file mode 100644 index 0000000000..5e5d726115 --- /dev/null +++ b/test-cases/longevity/longevity-multi-dc-rack-aware-with-znode-in-diff_dc.yaml @@ -0,0 +1,25 @@ +test_duration: 800 + +prepare_write_cmd: ["cassandra-stress write cl=LOCAL_QUORUM n=20971520 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3,eu-northscylla_node_north=0) compaction(strategy=SizeTieredCompactionStrategy)' -port jmx=6868 -mode cql3 native -rate threads=80 -pop seq=1..20971520 -col 'n=FIXED(10) size=FIXED(512)' -log interval=5", + ] + +stress_cmd: ["cassandra-stress write cl=LOCAL_QUORUM duration=720m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3,eu-northscylla_node_north=0) compaction(strategy=SizeTieredCompactionStrategy)' -port jmx=6868 -mode cql3 native -rate threads=40 -pop 'dist=uniform(1..20971520)' -col 'n=FIXED(10) size=FIXED(512)' -log interval=5 -errors retries=50", + "cassandra-stress read cl=LOCAL_QUORUM duration=720m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3,eu-northscylla_node_north=0) compaction(strategy=SizeTieredCompactionStrategy)' -port jmx=6868 -mode cql3 native -rate threads=40 -pop 'dist=uniform(1..20971520)' -col 'n=FIXED(10) size=FIXED(512)' -log interval=5 -errors retries=50", + ] + +n_db_nodes: '3 3 0' +n_loaders: '1 1' +n_monitor_nodes: 1 +n_db_zero_token_nodes: '0 1 1' + +instance_type_db: 'i4i.4xlarge' +zero_token_instance_type_db: 'i4i.large' +use_zero_nodes: true + +nemesis_class_name: 'SisyphusMonkey' +nemesis_interval: 10 +nemesis_filter_seeds: false + +round_robin: false + +user_prefix: 'multi-dc-rackaware-with-znode-dc' diff --git a/unit_tests/test_nemesis_sisyphus.py b/unit_tests/test_nemesis_sisyphus.py index 22db260f30..2ad1726a2b 100644 --- a/unit_tests/test_nemesis_sisyphus.py +++ 
b/unit_tests/test_nemesis_sisyphus.py @@ -80,7 +80,7 @@ def test_list_all_available_nemesis(generate_file=True): disruption_list, disruptions_dict, disruption_classes = sisyphus.get_list_of_disrupt_methods( subclasses_list=subclasses, export_properties=True) - assert len(disruption_list) == 88 + assert len(disruption_list) == 89 if generate_file: with open(sct_abs_path('data_dir/nemesis.yml'), 'w', encoding="utf-8") as outfile1: