From b1e5454ad7c4b0b8bbae93d509bd1c05d8477c1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sinclert=20P=C3=A9rez?=
Date: Mon, 20 Jan 2025 12:16:05 +0100
Subject: [PATCH] Add cluster manual rejoin mechanism

---
 src/charm.py | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/charm.py b/src/charm.py
index 91c2ef7a3..4e6806bf2 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -449,7 +449,7 @@ def _handle_non_online_instance_status(self, state) -> None:
             }
             all_states.add("offline")
 
-            if all_states == {"offline"} and self.unit.is_leader():
+            if self.unit.is_leader() and all_states == {"offline"}:
                 # All instance are off or its a single unit cluster
                 # reboot cluster from outage from the leader unit
                 logger.info("Attempting reboot from complete outage.")
@@ -460,6 +460,13 @@ def _handle_non_online_instance_status(self, state) -> None:
                     logger.error("Failed to reboot cluster from complete outage.")
                     self.unit.status = BlockedStatus("failed to recover cluster.")
 
+            elif self._mysql.is_cluster_auto_rejoin_ongoing():
+                logger.info("Cluster auto-rejoin attempts are still ongoing.")
+
+            else:
+                logger.info("Cluster auto-rejoin attempts are exhausted. Attempting manual rejoin")
+                self._execute_manual_rejoin()
+
         if state == "unreachable":
             try:
                 if not snap_service_operation(
@@ -471,6 +478,30 @@ def _handle_non_online_instance_status(self, state) -> None:
             except SnapServiceOperationError as e:
                 self.unit.status = BlockedStatus(e.message)
 
+    def _execute_manual_rejoin(self) -> None:
+        """Executes an instance manual rejoin.
+
+        It is supposed to be called when the MySQL 8.0.21+ auto-rejoin attempts have been exhausted,
+        on an OFFLINE replica that still belongs to the cluster.
+        """
+        if not self._mysql.is_instance_in_cluster(self.unit_label):
+            logger.warning("Instance does not belong to the cluster. Cannot perform manual rejoin")
+            return
+
+        cluster_primary = self._get_primary_from_online_peer()
+        if not cluster_primary:
+            logger.warning("Instance does not have ONLINE peers. Cannot perform manual rejoin")
+            return
+
+        self._mysql.remove_instance(
+            unit_label=self.unit_label,
+        )
+        self._mysql.add_instance_to_cluster(
+            instance_address=self.unit_address,
+            instance_unit_label=self.unit_label,
+            from_instance=cluster_primary,
+        )
+
     def _on_update_status(self, _) -> None:  # noqa: C901
         """Handle update status.