Skip to content

Commit

Permalink
Add cluster manual rejoin mechanism
Browse files Browse the repository at this point in the history
  • Loading branch information
sinclert-canonical committed Jan 22, 2025
1 parent 68b5819 commit b1e5454
Showing 1 changed file with 32 additions and 1 deletion.
33 changes: 32 additions & 1 deletion src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@ def _handle_non_online_instance_status(self, state) -> None:
}
all_states.add("offline")

if all_states == {"offline"} and self.unit.is_leader():
if self.unit.is_leader() and all_states == {"offline"}:
# All instances are off or it's a single unit cluster
# reboot cluster from outage from the leader unit
logger.info("Attempting reboot from complete outage.")
Expand All @@ -460,6 +460,13 @@ def _handle_non_online_instance_status(self, state) -> None:
logger.error("Failed to reboot cluster from complete outage.")
self.unit.status = BlockedStatus("failed to recover cluster.")

elif self._mysql.is_cluster_auto_rejoin_ongoing():
logger.info("Cluster auto-rejoin attempts are still ongoing.")

elif not self._mysql.is_cluster_auto_rejoin_ongoing():
logger.info("Cluster auto-rejoin attempts are exhausted. Attempting manual rejoin")
self._execute_manual_rejoin()

if state == "unreachable":
try:
if not snap_service_operation(
Expand All @@ -471,6 +478,30 @@ def _handle_non_online_instance_status(self, state) -> None:
except SnapServiceOperationError as e:
self.unit.status = BlockedStatus(e.message)

def _execute_manual_rejoin(self) -> None:
    """Manually rejoin this unit's MySQL instance to the cluster.

    Meant to be invoked once the MySQL 8.0.21+ auto-rejoin attempts have been
    exhausted, on an OFFLINE replica that still belongs to the cluster.

    Nothing is changed (only a warning is logged) when the instance is not
    reported as a cluster member, or when `_get_primary_from_online_peer`
    yields no primary. Otherwise the instance is removed from the cluster
    and then added back using that primary as the source instance.
    """
    if self._mysql.is_instance_in_cluster(self.unit_label):
        primary = self._get_primary_from_online_peer()
        if primary:
            # The manual rejoin is a removal followed by a fresh add,
            # with the add performed from the primary obtained above.
            self._mysql.remove_instance(unit_label=self.unit_label)
            self._mysql.add_instance_to_cluster(
                instance_address=self.unit_address,
                instance_unit_label=self.unit_label,
                from_instance=primary,
            )
        else:
            logger.warning("Instance does not have ONLINE peers. Cannot perform manual rejoin")
    else:
        logger.warning("Instance does not belong to the cluster. Cannot perform manual rejoin")

def _on_update_status(self, _) -> None: # noqa: C901
"""Handle update status.
Expand Down

0 comments on commit b1e5454

Please sign in to comment.