Skip to content

Commit

Permalink
feature(nemesis): Nemesises to work with zero nodes
Browse files Browse the repository at this point in the history
Add new nemeses:
 - Add/remove zero-token nodes to/from the cluster
 - Set kill scylla to run on both data and zero-token nodes
 - Update terminate and replace to run for a data node or a zero-token node,
respectively

Add new config yamls and new job to run
multidc config with zero nodes
  • Loading branch information
aleksbykov authored and fruch committed Nov 3, 2024
1 parent a58de1b commit efd670d
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 1 deletion.
3 changes: 3 additions & 0 deletions configurations/zerotoken_nodes/zero_nodes_nemesis_set.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
nemesis_class_name: 'ZeroTokenSetMonkey'
use_zero_nodes: true
zero_token_instance_type_db: 'i4i.large'
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
n_db_zero_token_nodes: 1
zero_token_instance_type_db: 'i4i.large'
use_zero_nodes: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!groovy

// trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
// Loads the 'sct@snapshot' shared library from the same SCM checkout as this job.
def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)

// Longevity job definition: runs LongevityTest.test_custom_time on the AWS
// backend across the three regions listed below, using the single test
// config file given in test_config.
// NOTE(review): longevityPipeline is presumably provided by the library loaded
// above -- confirm against the shared-library sources.
longevityPipeline(
backend: 'aws',
// JSON-style list passed as a string; one entry per region.
region: '''["eu-west-1", "eu-west-2", "eu-north-1"]''',
test_name: 'longevity_test.LongevityTest.test_custom_time',
// JSON-style list passed as a string; paths to the test-case config yamls.
test_config: '''["test-cases/longevity/longevity-multi-dc-rack-aware-with-znode-in-diff_dc.yaml"]''',
)
40 changes: 40 additions & 0 deletions sdcm/nemesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -5233,6 +5233,19 @@ def disrupt_disable_binary_gossip_execute_major_compaction(self):
self.target_node.restart_scylla_server()
raise

@target_all_nodes
def disrupt_grow_shrink_zero_nodes(self):
    """Add a zero-token node, run with it for a while, then decommission a zero-token node.

    Zero-token nodes ("znodes") are added to / removed from the same DC as the
    target node. The target node could be any node.

    Steps:
      1. Add and initialize one new zero-token node.
      2. Keep the cluster running with it for ``duration_with_znode`` seconds.
      3. Decommission a randomly chosen zero-token node located in the
         target node's DC.

    Raises:
        UnsupportedNemesis: if the ``use_zero_nodes`` parameter is not enabled.
    """
    if not self.cluster.params.get('use_zero_nodes'):
        raise UnsupportedNemesis("The zero tokens support is not enabled")

    duration_with_znode = 300
    new_znode = self._add_and_init_new_cluster_nodes(count=1, is_zero_node=True)[0]
    self.log.debug("Run with zero-token node %s for %ds", new_znode.name, duration_with_znode)
    time.sleep(duration_with_znode)

    target_dc_idx = self.target_node.dc_idx
    candidates = [node for node in self.cluster.zero_nodes if node.dc_idx == target_dc_idx]
    # random.choice() raises IndexError on an empty sequence. If no zero-token
    # node is registered in the target DC, remove the node added above so the
    # cluster is still shrunk back instead of failing the nemesis.
    znode = random.choice(candidates) if candidates else new_znode
    self.decommission_nodes(nodes=[znode])


def disrupt_method_wrapper(method, is_exclusive=False): # pylint: disable=too-many-statements # noqa: PLR0915
"""
Expand Down Expand Up @@ -5494,6 +5507,7 @@ class StopWaitStartMonkey(Nemesis):
disruptive = True
kubernetes = True
limited = True
zero_node_changes = True

def disrupt(self):
self.disrupt_stop_wait_start_scylla_server(600)
Expand Down Expand Up @@ -6127,6 +6141,7 @@ class NodeTerminateAndReplace(Nemesis):
# While on kubernetes we put it all on scylla-operator
kubernetes = False
topology_changes = True
zero_node_changes = True

def disrupt(self):
self.disrupt_terminate_and_replace_node()
Expand Down Expand Up @@ -6752,3 +6767,28 @@ class EndOfQuotaNemesis(Nemesis):

def disrupt(self):
self.disrupt_end_of_quota_nemesis()


class GrowShrinkZeroTokenNode(Nemesis):
    """Nemesis that runs the grow/shrink zero-token nodes disruption."""

    # Selection flags for this nemesis.
    zero_node_changes = True
    disruptive = True
    free_tier_set = False
    schema_changes = False

    def disrupt(self):
        """Delegate to ``disrupt_grow_shrink_zero_nodes``."""
        self.disrupt_grow_shrink_zero_nodes()


class ZeroTokenSetMonkey(SisyphusMonkey):
    """Nemesis set for testing Scylla with zero-token nodes configured.

    Covers disruptions that can be caused by random failures and user actions
    while zero-token nodes are part of the cluster. Only disruptions flagged
    with ``zero_node_changes`` are selected.
    """

    def __init__(self, *args, **kwargs):
        # Start the MRO lookup *after* SisyphusMonkey, so its own __init__ is
        # bypassed and the disruption list is built here instead.
        super(SisyphusMonkey, self).__init__(*args, **kwargs)  # pylint: disable=bad-super-call
        self.use_all_nodes_as_target = True

        selector = ['zero_node_changes']
        self.build_list_of_disruptions_to_execute(nemesis_selector=selector)
        self.shuffle_list_of_disruptions()
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
test_duration: 800

prepare_write_cmd: ["cassandra-stress write cl=LOCAL_QUORUM n=20971520 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3,eu-northscylla_node_north=0) compaction(strategy=SizeTieredCompactionStrategy)' -port jmx=6868 -mode cql3 native -rate threads=80 -pop seq=1..20971520 -col 'n=FIXED(10) size=FIXED(512)' -log interval=5",
]

stress_cmd: ["cassandra-stress write cl=LOCAL_QUORUM duration=720m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3,eu-northscylla_node_north=0) compaction(strategy=SizeTieredCompactionStrategy)' -port jmx=6868 -mode cql3 native -rate threads=40 -pop 'dist=uniform(1..20971520)' -col 'n=FIXED(10) size=FIXED(512)' -log interval=5 -errors retries=50",
"cassandra-stress read cl=LOCAL_QUORUM duration=720m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3,eu-northscylla_node_north=0) compaction(strategy=SizeTieredCompactionStrategy)' -port jmx=6868 -mode cql3 native -rate threads=40 -pop 'dist=uniform(1..20971520)' -col 'n=FIXED(10) size=FIXED(512)' -log interval=5 -errors retries=50",
]

n_db_nodes: '3 3 0'
n_loaders: '1 1'
n_monitor_nodes: 1
n_db_zero_token_nodes: '0 1 1'

instance_type_db: 'i4i.4xlarge'
zero_token_instance_type_db: 'i4i.large'
use_zero_nodes: true

nemesis_class_name: 'SisyphusMonkey'
nemesis_interval: 10
nemesis_filter_seeds: false

round_robin: false

user_prefix: 'multi-dc-rackaware-with-znode-dc'
2 changes: 1 addition & 1 deletion unit_tests/test_nemesis_sisyphus.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def test_list_all_available_nemesis(generate_file=True):
disruption_list, disruptions_dict, disruption_classes = sisyphus.get_list_of_disrupt_methods(
subclasses_list=subclasses, export_properties=True)

assert len(disruption_list) == 88
assert len(disruption_list) == 89

if generate_file:
with open(sct_abs_path('data_dir/nemesis.yml'), 'w', encoding="utf-8") as outfile1:
Expand Down

0 comments on commit efd670d

Please sign in to comment.