Skip to content

Commit

Permalink
Add cluster manual rejoin test
Browse files (browse the repository at this point in the history)
  • Loading branch information
sinclert-canonical committed Jan 23, 2025
1 parent 95788bf commit 41173b0
Showing 1 changed file with 62 additions and 0 deletions.
62 changes: 62 additions & 0 deletions tests/integration/high_availability/test_self_healing.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -450,3 +450,65 @@ async def test_sst_test(ops_test: OpsTest, highly_available_cluster, continuous_
database_name, table_name = "test-forceful-restart", "data"
await insert_data_into_mysql_and_validate_replication(ops_test, database_name, table_name)
await clean_up_database_and_table(ops_test, database_name, table_name)


@pytest.mark.group(7)
@pytest.mark.abort_on_fail
async def test_cluster_manual_rejoin(
    ops_test: OpsTest, highly_available_cluster, continuous_writes
):
    """Check that a stopped primary comes back with automatic re-join disabled.

    Scenario:
      1. Confirm continuous writes are incrementing on all units.
      2. Persist ``group_replication_autorejoin_tries=0`` on the primary unit.
      3. Gracefully stop the server on the primary (picked on purpose, as it
         is the most disruptive instance to lose) and assert that it no
         longer accepts connections.
      4. Start the server again and wait for the unit workload status to
         report "maintenance" and, afterwards, "active".
      5. Confirm continuous writes are still incrementing on all units.
    """

    async def _await_workload_status(expected_status):
        # Block until the restarted unit reports the requested workload status.
        await ops_test.model.block_until(
            lambda: primary_unit.workload_status == expected_status,
            timeout=WAIT_TIMEOUT,
        )

    # Baseline: writes must be flowing everywhere before disturbing the cluster
    await ensure_all_units_continuous_writes_incrementing(ops_test)

    app_name = get_application_name(ops_test, "mysql")
    primary_unit = await get_primary_unit_wrapper(ops_test, app_name)
    unit_name = primary_unit.name

    # Resolve the admin credentials once; they are reused for the query
    # execution and for the connectivity probe below.
    admin_password = await get_system_user_password(primary_unit, CLUSTER_ADMIN_USERNAME)
    primary_address = await get_unit_ip(ops_test, unit_name)
    credentials = {
        "username": CLUSTER_ADMIN_USERNAME,
        "password": admin_password,
        "host": primary_address,
    }

    # Turn off the automatic re-join procedure on the primary
    await execute_queries_on_unit(
        unit_address=primary_address,
        username=CLUSTER_ADMIN_USERNAME,
        password=admin_password,
        queries=["SET PERSIST group_replication_autorejoin_tries=0"],
        commit=True,
    )

    logger.info(f"Stopping server on unit {unit_name}")
    await graceful_stop_server(ops_test, unit_name)

    # A stopped instance must refuse connections
    connection_alive = is_connection_possible(credentials)
    assert not connection_alive, "❌ Connection is possible after instance stop"

    logger.info(f"Re starting server on unit {unit_name}")
    await start_server(ops_test, unit_name)

    # The unit is expected to pass through maintenance before turning active
    async with ops_test.fast_forward():
        logger.info("Waiting unit to enter in maintenance.")
        await _await_workload_status("maintenance")

        logger.info("Waiting unit to be back online.")
        await _await_workload_status("active")

    # Writes must keep incrementing on every unit after the recovery
    await ensure_all_units_continuous_writes_incrementing(ops_test)

0 comments on commit 41173b0

Please sign in to comment.