Add cluster manual rejoin test

canonical · Jan 23, 2025 · 41173b0 · 41173b0
1 parent 95788bf
commit 41173b0
Showing 1 changed file with 62 additions and 0 deletions.
diff --git a/tests/integration/high_availability/test_self_healing.py b/tests/integration/high_availability/test_self_healing.py
@@ -450,3 +450,65 @@ async def test_sst_test(ops_test: OpsTest, highly_available_cluster, continuous_
     database_name, table_name = "test-forceful-restart", "data"
     await insert_data_into_mysql_and_validate_replication(ops_test, database_name, table_name)
     await clean_up_database_and_table(ops_test, database_name, table_name)
+
+
+@pytest.mark.group(7)
+@pytest.mark.abort_on_fail
+async def test_cluster_manual_rejoin(
+    ops_test: OpsTest, highly_available_cluster, continuous_writes
+):
+    """The cluster manual re-join test.
+
+    A graceful restart is performed in one of the instances (choosing Primary to make it painful).
+    In order to verify that the instance can come back ONLINE, after disabling automatic re-join
+    """
+    # Ensure continuous writes still incrementing for all units
+    await ensure_all_units_continuous_writes_incrementing(ops_test)
+
+    mysql_app_name = get_application_name(ops_test, "mysql")
+    primary_unit = await get_primary_unit_wrapper(ops_test, mysql_app_name)
+
+    config = {
+        "username": CLUSTER_ADMIN_USERNAME,
+        "password": await get_system_user_password(primary_unit, CLUSTER_ADMIN_USERNAME),
+        "host": await get_unit_ip(ops_test, primary_unit.name),
+    }
+
+    queries = [
+        "SET PERSIST group_replication_autorejoin_tries=0",
+    ]
+
+    # Disable automatic re-join procedure
+    await execute_queries_on_unit(
+        unit_address=config["host"],
+        username=config["username"],
+        password=config["password"],
+        queries=queries,
+        commit=True,
+    )
+
+    logger.info(f"Stopping server on unit {primary_unit.name}")
+    await graceful_stop_server(ops_test, primary_unit.name)
+
+    # Verify connection is not possible
+    assert not is_connection_possible(config), "❌ Connection is possible after instance stop"
+
+    logger.info(f"Re starting server on unit {primary_unit.name}")
+    await start_server(ops_test, primary_unit.name)
+
+    # Verify unit comes back active
+    async with ops_test.fast_forward():
+        logger.info("Waiting unit to enter in maintenance.")
+        await ops_test.model.block_until(
+            lambda: primary_unit.workload_status == "maintenance",
+            timeout=WAIT_TIMEOUT,
+        )
+
+        logger.info("Waiting unit to be back online.")
+        await ops_test.model.block_until(
+            lambda: primary_unit.workload_status == "active",
+            timeout=WAIT_TIMEOUT,
+        )
+
+    # Ensure continuous writes still incrementing for all units
+    await ensure_all_units_continuous_writes_incrementing(ops_test)