Skip to content

Commit

Permalink
fix(_get_keyspaces_to_decrease_rf): Address a case where no keyspace …
Browse files Browse the repository at this point in the history
…RF value of DC

In case no replication-factor value is found for the DC in a keyspace's replication settings,
a warning is logged and the keyspace is ignored (skipped).
Fixes: #8694

(cherry picked from commit e57d75b)

# Conflicts:
#	sdcm/utils/replication_strategy_utils.py
  • Loading branch information
yarongilor authored and mergify[bot] committed Sep 15, 2024
1 parent 38e2ebe commit 05f2773
Showing 1 changed file with 122 additions and 0 deletions.
122 changes: 122 additions & 0 deletions sdcm/utils/replication_strategy_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,125 @@ def __call__(self, **keyspaces: ReplicationStrategy) -> None:
for keyspace, strategy in keyspaces.items():
self._preserve_replication_strategy(keyspace)
strategy.apply(self.node, keyspace)
<<<<<<< HEAD
=======


class DataCenterTopologyRfControl:
    """
    Manage the replication factor (RF) of keyspaces in a ScyllaDB data center
    while nodes are removed from, or re-added to, the cluster.

    **Purpose**:
        - In scenarios where a keyspace has an RF equal to the total number of nodes in a data center,
          decommissioning a node is not supported where tablets are used.
        - This class provides functionality to temporarily decrease the RF of such keyspaces before a node
          decommissioning operation and revert them back to their original RF after a new node is added.

    **Usage**:
        1. **`decrease_keyspaces_rf`**: Identifies keyspaces with RF equal to the total number of nodes in the
           data center and decreases their RF by 1. This is necessary so decommissioning a node is allowed
           (with tablets).
        2. **`revert_to_original_keyspaces_rf`**: Reverts the RF of the keyspaces back to their original values
           after a new node is added to the data center.

    Attributes:
        - `target_node`: The node to decommission.
        - `cluster`: The cluster the target node belongs to (`target_node.parent_cluster`).
        - `datacenter`: The data center to which the target node belongs.
        - `decreased_rf_keyspaces`: A list of keyspaces whose RF is currently decreased.
        - `original_nodes_number`: The original number of nodes in the data center (before decommission).
    """

    def __init__(self, target_node: 'BaseNode') -> None:
        """Capture the node's cluster, datacenter and the datacenter's current node count."""
        self.target_node = target_node
        self.cluster = target_node.parent_cluster
        self.datacenter = target_node.datacenter
        self.decreased_rf_keyspaces = []
        self.original_nodes_number = self._get_original_nodes_number(target_node)

    def _get_original_nodes_number(self, node: 'BaseNode') -> int:
        """Return how many cluster nodes share `node`'s datacenter index (`dc_idx`)."""
        return sum(1 for cluster_node in self.cluster.nodes if cluster_node.dc_idx == node.dc_idx)

    def _get_keyspaces_to_decrease_rf(self, session) -> list:
        """
        Return the non-system keyspaces whose RF in this datacenter equals the datacenter's node count.

        Keyspaces using SimpleStrategy are skipped silently. Keyspaces with any strategy other than
        NetworkTopologyStrategy, or with no RF entry for this datacenter, are skipped with a warning.

        Example:
            For a datacenter "dc1" with 3 nodes, the output might be:
            ["keyspace1", "scylla_bench"]
        """
        query = "SELECT keyspace_name, replication FROM system_schema.keyspaces"
        matching_keyspaces = []

        for row in session.execute(query).current_rows:
            keyspace_name = row.keyspace_name
            if is_system_keyspace(keyspace_name):
                continue

            replication = row.replication
            strategy_class = replication['class']

            if 'SimpleStrategy' in strategy_class:
                continue  # SimpleStrategy has no per-datacenter RF to adjust

            if 'NetworkTopologyStrategy' not in strategy_class:
                LOGGER.warning("Unexpected replication strategy found: %s", strategy_class)
                continue

            rf = replication.get(self.datacenter)
            if rf is None:
                # The keyspace is not replicated to this datacenter at all; nothing to decrease.
                LOGGER.warning(
                    "Datacenter %s not found in replication strategy for keyspace %s.",
                    self.datacenter, keyspace_name)
                continue

            if int(rf) == self.original_nodes_number:
                matching_keyspaces.append(keyspace_name)

        return matching_keyspaces

    def _alter_keyspace_rf(self, keyspace: str, replication_factor: int, session):
        """
        Set the RF of `keyspace` in this datacenter to `replication_factor`.

        Any exception raised by `session.execute` is logged and re-raised unchanged.
        """
        # NOTE(review): the replication map sent here lists only this datacenter. For a keyspace
        # replicated to several datacenters this may alter the other datacenters' RF - confirm
        # against the ALTER KEYSPACE semantics of the target ScyllaDB version.
        alter_ks_cmd = f"ALTER KEYSPACE {keyspace} WITH REPLICATION = {{ 'class' : 'NetworkTopologyStrategy', '{self.datacenter}':{replication_factor} }}"
        LOGGER.debug("Altering %s RF with: %s", keyspace, alter_ks_cmd)
        try:
            session.execute(alter_ks_cmd)
        except Exception as error:
            LOGGER.error("Altering %s RF with: %s Failed with: %s", keyspace, alter_ks_cmd, error)
            raise

    def revert_to_original_keyspaces_rf(self, node_to_wait_for_balance: 'BaseNode | None' = None):
        """
        Restore the original RF of every keyspace recorded in `decreased_rf_keyspaces`.

        Each keyspace is removed from `decreased_rf_keyspaces` once it is restored, so calling this
        method again does not re-issue ALTER statements, and a failure midway leaves exactly the
        not-yet-restored keyspaces recorded.

        :param node_to_wait_for_balance: when given (and at least one keyspace was reverted),
            `wait_for_tablets_balanced` is called on it after the revert.
        """
        if not self.decreased_rf_keyspaces:
            return

        LOGGER.debug("Reverting keyspaces replication factor to original value of %s..", self.datacenter)
        with self.cluster.cql_connection_patient(self.cluster.nodes[0]) as session:
            while self.decreased_rf_keyspaces:
                self._alter_keyspace_rf(keyspace=self.decreased_rf_keyspaces[0],
                                        replication_factor=self.original_nodes_number,
                                        session=session)
                # Drop the entry only after the ALTER succeeded.
                self.decreased_rf_keyspaces.pop(0)

        if node_to_wait_for_balance:
            wait_for_tablets_balanced(node_to_wait_for_balance)

    def decrease_keyspaces_rf(self):
        """
        If any keyspace RF equals to number-of-cluster-nodes, where tablets are in use,
        then a decommission is not supported.
        In this case, the user has to decrease the replication-factor of any such keyspace first.
        Later on, after adding a new node, such a keyspace can be reconfigured back to its original
        replication-factor value.

        When the datacenter has fewer than 2 nodes nothing is altered and an error is logged.
        If altering any keyspace fails, the keyspaces already decreased are reverted and the
        original exception is re-raised.
        """
        # The RF cannot be decreased below 1, so check before opening a connection.
        if self.original_nodes_number <= 1:
            LOGGER.error(
                "DC %s has %s nodes. Cannot alter replication factor",
                self.datacenter, self.original_nodes_number)
            return

        with self.cluster.cql_connection_patient(self.target_node) as session:
            keyspaces_to_decrease = self._get_keyspaces_to_decrease_rf(session=session)
            if not keyspaces_to_decrease:
                return

            LOGGER.debug(
                "Found the following keyspaces with replication factor to decrease: %s",
                keyspaces_to_decrease)
            try:
                for keyspace in keyspaces_to_decrease:
                    self._alter_keyspace_rf(keyspace=keyspace,
                                            replication_factor=self.original_nodes_number - 1,
                                            session=session)
                    self.decreased_rf_keyspaces.append(keyspace)
            except Exception as error:
                # Log before rolling back so the root cause is recorded even if the rollback fails.
                LOGGER.error(
                    "Decreasing keyspace replication factor failed with: (%s), aborting operation", error)
                self.revert_to_original_keyspaces_rf()
                raise
>>>>>>> e57d75b34 (fix(_get_keyspaces_to_decrease_rf): Address a case where no keyspace RF value of DC)

0 comments on commit 05f2773

Please sign in to comment.