diff --git a/tests/integration/ha_tests/conftest.py b/tests/integration/ha_tests/conftest.py
index 68efb7e802..dd457f550d 100644
--- a/tests/integration/ha_tests/conftest.py
+++ b/tests/integration/ha_tests/conftest.py
@@ -5,11 +5,13 @@
 from pytest_operator.plugin import OpsTest
 
 from tests.integration.ha_tests.helpers import (
+    ORIGINAL_RESTART_DELAY,
     app_name,
     change_master_start_timeout,
     change_wal_settings,
     get_master_start_timeout,
     get_postgresql_parameter,
+    update_restart_delay,
 )
 
 APPLICATION_NAME = "application"
@@ -41,7 +43,16 @@ async def master_start_timeout(ops_test: OpsTest) -> None:
     initial_master_start_timeout = await get_master_start_timeout(ops_test)
     yield
     # Rollback to the initial configuration.
-    await change_master_start_timeout(ops_test, initial_master_start_timeout)
+    await change_master_start_timeout(ops_test, initial_master_start_timeout, use_random_unit=True)
+
+
+@pytest.fixture()
+async def reset_restart_delay(ops_test: OpsTest):
+    """Resets service file delay on all units."""
+    yield
+    app = await app_name(ops_test)
+    for unit in ops_test.model.applications[app].units:
+        await update_restart_delay(ops_test, unit, ORIGINAL_RESTART_DELAY)
 
 
 @pytest.fixture()
diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py
index dd9a69aa72..a5cd056713 100644
--- a/tests/integration/ha_tests/helpers.py
+++ b/tests/integration/ha_tests/helpers.py
@@ -1,5 +1,7 @@
 # Copyright 2022 Canonical Ltd.
 # See LICENSE file for licensing details.
+import random
+import subprocess
 from pathlib import Path
 from typing import Optional, Set
 
@@ -14,6 +16,10 @@
 METADATA = yaml.safe_load(Path("./metadata.yaml").read_text())
 PORT = 5432
 APP_NAME = METADATA["name"]
+PATRONI_SERVICE_DEFAULT_PATH = "/etc/systemd/system/patroni.service"
+TMP_SERVICE_PATH = "tests/integration/ha_tests/tmp.service"
+RESTART_DELAY = 60 * 3
+ORIGINAL_RESTART_DELAY = 30
 
 
 class MemberNotListedOnClusterError(Exception):
@@ -25,7 +31,30 @@ class MemberNotUpdatedOnClusterError(Exception):
 
 
 class ProcessError(Exception):
-    pass
+    """Raised when a process fails."""
+
+
+class ProcessRunningError(Exception):
+    """Raised when a process is running when it is not expected to be."""
+
+
+async def all_db_processes_down(ops_test: OpsTest, process: str) -> bool:
+    """Verifies that all units of the charm do not have the DB process running."""
+    app = await app_name(ops_test)
+
+    try:
+        for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
+            with attempt:
+                for unit in ops_test.model.applications[app].units:
+                    _, raw_pid, _ = await ops_test.juju("ssh", unit.name, "pgrep", "-f", process)
+
+                    # If something was returned, there is a running process.
+                    if len(raw_pid) > 0:
+                        raise ProcessRunningError
+    except RetryError:
+        return False
+
+    return True
 
 
 async def app_name(ops_test: OpsTest, application_name: str = "postgresql") -> Optional[str]:
@@ -44,18 +73,26 @@ async def app_name(ops_test: OpsTest, application_name: str = "postgresql") -> O
     return None
 
 
-async def change_master_start_timeout(ops_test: OpsTest, seconds: Optional[int]) -> None:
+async def change_master_start_timeout(
+    ops_test: OpsTest, seconds: Optional[int], use_random_unit: bool = False
+) -> None:
     """Change master start timeout configuration.
 
     Args:
         ops_test: ops_test instance.
         seconds: number of seconds to set in master_start_timeout configuration.
+        use_random_unit: whether to use a random unit (defaults to False,
+            in which case the primary is used).
     """
     for attempt in Retrying(stop=stop_after_delay(30 * 2), wait=wait_fixed(3)):
         with attempt:
             app = await app_name(ops_test)
-            primary_name = await get_primary(ops_test, app)
-            unit_ip = get_unit_address(ops_test, primary_name)
+            if use_random_unit:
+                unit = get_random_unit(ops_test, app)
+                unit_ip = get_unit_address(ops_test, unit)
+            else:
+                primary_name = await get_primary(ops_test, app)
+                unit_ip = get_unit_address(ops_test, primary_name)
             requests.patch(
                 f"http://{unit_ip}:8008/config",
                 json={"master_start_timeout": seconds},
@@ -183,6 +220,11 @@ async def get_postgresql_parameter(ops_test: OpsTest, parameter_name: str) -> Op
     return parameter_value
 
 
+def get_random_unit(ops_test: OpsTest, app: str) -> str:
+    """Returns a random unit name."""
+    return random.choice(ops_test.model.applications[app].units).name
+
+
 async def get_password(ops_test: OpsTest, app: str, down_unit: str = None) -> str:
     """Use the charm action to retrieve the password from provided application.
 
@@ -358,3 +400,41 @@ async def stop_continuous_writes(ops_test: OpsTest) -> int:
     )
     action = await action.wait()
     return int(action.results["writes"])
+
+
+async def update_restart_delay(ops_test: OpsTest, unit, delay: int):
+    """Updates the restart delay in the DB service file.
+
+    When the DB service fails, it will wait for `delay` seconds before restarting.
+    """
+    # Load the service file from the unit and update it with the new delay.
+    await unit.scp_from(source=PATRONI_SERVICE_DEFAULT_PATH, destination=TMP_SERVICE_PATH)
+    with open(TMP_SERVICE_PATH, "r") as patroni_service_file:
+        patroni_service = patroni_service_file.readlines()
+
+    for index, line in enumerate(patroni_service):
+        if "RestartSec" in line:
+            patroni_service[index] = f"RestartSec={delay}s\n"
+
+    with open(TMP_SERVICE_PATH, "w") as service_file:
+        service_file.writelines(patroni_service)
+
+    # Upload the changed file back to the unit. We cannot scp this file directly to
+    # PATRONI_SERVICE_DEFAULT_PATH because that directory has strict permissions, so we scp it
+    # elsewhere and then move it to PATRONI_SERVICE_DEFAULT_PATH.
+    await unit.scp_to(source=TMP_SERVICE_PATH, destination="patroni.service")
+    mv_cmd = (
+        f"run --unit {unit.name} mv /home/ubuntu/patroni.service {PATRONI_SERVICE_DEFAULT_PATH}"
+    )
+    return_code, _, _ = await ops_test.juju(*mv_cmd.split())
+    if return_code != 0:
+        raise ProcessError(f"Command: {mv_cmd} failed on unit: {unit.name}.")
+
+    # Remove the temporary file from the local machine.
+    subprocess.call(["rm", TMP_SERVICE_PATH])
+
+    # Reload the systemd daemon, otherwise the changes are not applied.
+    reload_cmd = f"run --unit {unit.name} systemctl daemon-reload"
+    return_code, _, _ = await ops_test.juju(*reload_cmd.split())
+    if return_code != 0:
+        raise ProcessError(f"Command: {reload_cmd} failed on unit: {unit.name}.")
diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py
index 3046d18347..0c93007444 100644
--- a/tests/integration/ha_tests/test_self_healing.py
+++ b/tests/integration/ha_tests/test_self_healing.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 # Copyright 2022 Canonical Ltd.
 # See LICENSE file for licensing details.
+import asyncio
 import pytest
 from pytest_operator.plugin import OpsTest
 
@@ -8,6 +9,8 @@
 from tests.integration.ha_tests.helpers import (
     METADATA,
+    RESTART_DELAY,
+    all_db_processes_down,
     app_name,
     change_master_start_timeout,
     change_wal_settings,
@@ -22,6 +25,7 @@
     send_signal_to_process,
     start_continuous_writes,
     stop_continuous_writes,
+    update_restart_delay,
 )
 from tests.integration.helpers import (
     db_connect,
@@ -231,6 +235,64 @@ async def test_restart_db_process(
     ), "secondary not up to date with the cluster after restarting."
 
 
+@pytest.mark.ha_self_healing_tests
+@pytest.mark.parametrize("process", [PATRONI_PROCESS])
+@pytest.mark.parametrize("signal", ["SIGTERM", "SIGKILL"])
+async def test_full_cluster_restart(
+    ops_test: OpsTest, process: str, signal: str, continuous_writes, reset_restart_delay
+) -> None:
+    """This test checks that a cluster recovers from a full cluster restart.
+
+    The test can be considered a full cluster crash when the signal sent to the OS process
+    is SIGKILL.
+    """
+    # Start an application that continuously writes data to the database.
+    app = await app_name(ops_test)
+    await start_continuous_writes(ops_test, app)
+
+    # Update all units to use the new RESTART_DELAY. Modifying the restart delay to 3 minutes
+    # should give enough time for all replicas to be down at the same time.
+    for unit in ops_test.model.applications[app].units:
+        await update_restart_delay(ops_test, unit, RESTART_DELAY)
+
+    # Restart all units "simultaneously".
+    await asyncio.gather(
+        *[
+            send_signal_to_process(ops_test, unit.name, process, kill_code=signal)
+            for unit in ops_test.model.applications[app].units
+        ]
+    )
+
+    # This test verifies that when all replicas are down at the same time, they come back
+    # online and operate as expected. This check confirms that the precondition of all
+    # replicas being down at the same time is met.
+    assert await all_db_processes_down(ops_test, process), "Not all units down at the same time."
+
+    # Verify all units are up and running.
+    for unit in ops_test.model.applications[app].units:
+        assert await postgresql_ready(
+            ops_test, unit.name
+        ), f"unit {unit.name} not restarted after cluster restart."
+
+    writes = await count_writes(ops_test)
+    for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
+        with attempt:
+            more_writes = await count_writes(ops_test)
+            assert more_writes > writes, "writes not continuing to DB"
+
+    # Verify that all units are part of the same cluster.
+    member_ips = await fetch_cluster_members(ops_test)
+    ip_addresses = [unit.public_address for unit in ops_test.model.applications[app].units]
+    assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster."
+
+    # Verify that no writes to the database were missed after stopping the writes.
+    total_expected_writes = await stop_continuous_writes(ops_test)
+    for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
+        with attempt:
+            actual_writes = await count_writes(ops_test)
+            assert total_expected_writes == actual_writes, "writes to the db were missed."
+
+
 @pytest.mark.ha_self_healing_tests
 async def test_forceful_restart_without_data_and_transaction_logs(
     ops_test: OpsTest,
diff --git a/tox.ini b/tox.ini
index d8f5055d15..ee32c70660 100644
--- a/tox.ini
+++ b/tox.ini
@@ -78,7 +78,7 @@ deps =
     -r{toxinidir}/requirements.txt
 commands =
     # Download patroni resource to use in the charm deployment.
-    sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.3.tar.gz" -L -s > patroni.tar.gz'
+    sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.4.tar.gz" -L -s > patroni.tar.gz'
     pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs} -m charm_tests
     # Remove the downloaded resource.
     sh -c 'rm -f patroni.tar.gz'
@@ -97,7 +97,7 @@ deps =
     -r{toxinidir}/requirements.txt
 commands =
     # Download patroni resource to use in the charm deployment.
-    sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.3.tar.gz" -L -s > patroni.tar.gz'
+    sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.4.tar.gz" -L -s > patroni.tar.gz'
     pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs} -m database_relation_tests
     # Remove the downloaded resource.
     sh -c 'rm -f patroni.tar.gz'
@@ -116,7 +116,7 @@ deps =
     -r{toxinidir}/requirements.txt
 commands =
     # Download patroni resource to use in the charm deployment.
-    sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.3.tar.gz" -L -s > patroni.tar.gz'
+    sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.4.tar.gz" -L -s > patroni.tar.gz'
     pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs} -m db_relation_tests
     # Remove the downloaded resource.
     sh -c 'rm -f patroni.tar.gz'
@@ -135,7 +135,7 @@ deps =
     -r{toxinidir}/requirements.txt
 commands =
     # Download patroni resource to use in the charm deployment.
-    sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.3.tar.gz" -L -s > patroni.tar.gz'
+    sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.4.tar.gz" -L -s > patroni.tar.gz'
     pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs} -m db_admin_relation_tests
     # Remove the downloaded resource.
     sh -c 'rm -f patroni.tar.gz'
@@ -154,7 +154,7 @@ deps =
     -r{toxinidir}/requirements.txt
 commands =
     # Download patroni resource to use in the charm deployment.
-    sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.3.tar.gz" -L -s > patroni.tar.gz'
+    sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.4.tar.gz" -L -s > patroni.tar.gz'
    pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs} -m ha_self_healing_tests
     # Remove the downloaded resource.
     sh -c 'rm -f patroni.tar.gz'
@@ -173,7 +173,7 @@ deps =
     -r{toxinidir}/requirements.txt
 commands =
     # Download patroni resource to use in the charm deployment.
-    sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.3.tar.gz" -L -s > patroni.tar.gz'
+    sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.4.tar.gz" -L -s > patroni.tar.gz'
     pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs} -m password_rotation_tests
     # Remove the downloaded resource.
     sh -c 'rm -f patroni.tar.gz'
@@ -192,7 +192,7 @@ deps =
     -r{toxinidir}/requirements.txt
 commands =
     # Download patroni resource to use in the charm deployment.
-    sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.3.tar.gz" -L -s > patroni.tar.gz'
+    sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.4.tar.gz" -L -s > patroni.tar.gz'
     pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs} -m tls_tests
     # Remove the downloaded resource.
     sh -c 'rm -f patroni.tar.gz'
@@ -211,7 +211,7 @@ deps =
     -r{toxinidir}/requirements.txt
 commands =
     # Download patroni resource to use in the charm deployment.
-    sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.3.tar.gz" -L -s > patroni.tar.gz'
+    sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.4.tar.gz" -L -s > patroni.tar.gz'
     pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs}
     # Remove the downloaded resource.
     sh -c 'rm -f patroni.tar.gz'
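
For reference, a minimal standalone sketch of the RestartSec rewrite that update_restart_delay performs on each unit. The set_restart_delay helper name and the unit-file text below are illustrative only and not part of the patch; the real helper copies /etc/systemd/system/patroni.service off the unit over scp, applies the same line substitution, moves the file back, and runs systemctl daemon-reload.

# Illustrative sketch: mirrors the RestartSec substitution done by update_restart_delay,
# but against an in-memory unit-file string instead of the unit's real patroni.service.
def set_restart_delay(service_text: str, delay: int) -> str:
    """Return the unit-file text with every RestartSec line set to `delay` seconds."""
    lines = service_text.splitlines(keepends=True)
    for index, line in enumerate(lines):
        if "RestartSec" in line:
            lines[index] = f"RestartSec={delay}s\n"
    return "".join(lines)


if __name__ == "__main__":
    # Hypothetical, simplified service file; the charm's actual patroni.service differs.
    example_service = (
        "[Service]\n"
        "ExecStart=/usr/local/bin/patroni /etc/patroni.yml\n"
        "Restart=on-failure\n"
        "RestartSec=30s\n"
    )
    # A 3 minute delay (RESTART_DELAY above) keeps a failed Patroni service down long
    # enough for all replicas to be observed down at the same time.
    print(set_restart_delay(example_service, 60 * 3))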