Full cluster restart and crash tests (#45)
* Add restart db process test

* Change restart delay

* Pin Juju agent version on CI

* Update test code

* Change processes that should be tested

* Re enable tests

* Remove unused code

* Remove unused import

* Remove unused code

* Add docstring

* Fix Patroni API calls

* Fix kill db process test error

* Improve comments

* Remove unused code

* Revert change

* Unify tests

* Replace ps aux | grep with pgrep

* Readd changes removed on merge
marceloneppel authored Dec 2, 2022
1 parent d94460d commit 0073d8f
Showing 4 changed files with 166 additions and 13 deletions.
13 changes: 12 additions & 1 deletion tests/integration/ha_tests/conftest.py
@@ -5,11 +5,13 @@
from pytest_operator.plugin import OpsTest

from tests.integration.ha_tests.helpers import (
ORIGINAL_RESTART_DELAY,
app_name,
change_master_start_timeout,
change_wal_settings,
get_master_start_timeout,
get_postgresql_parameter,
update_restart_delay,
)

APPLICATION_NAME = "application"
@@ -41,7 +43,16 @@ async def master_start_timeout(ops_test: OpsTest) -> None:
initial_master_start_timeout = await get_master_start_timeout(ops_test)
yield
# Rollback to the initial configuration.
await change_master_start_timeout(ops_test, initial_master_start_timeout)
await change_master_start_timeout(ops_test, initial_master_start_timeout, use_random_unit=True)


@pytest.fixture()
async def reset_restart_delay(ops_test: OpsTest):
"""Resets service file delay on all units."""
yield
app = await app_name(ops_test)
for unit in ops_test.model.applications[app].units:
await update_restart_delay(ops_test, unit, ORIGINAL_RESTART_DELAY)


@pytest.fixture()
88 changes: 84 additions & 4 deletions tests/integration/ha_tests/helpers.py
@@ -1,5 +1,7 @@
# Copyright 2022 Canonical Ltd.
# See LICENSE file for licensing details.
import random
import subprocess
from pathlib import Path
from typing import Optional, Set

@@ -14,6 +16,10 @@
METADATA = yaml.safe_load(Path("./metadata.yaml").read_text())
PORT = 5432
APP_NAME = METADATA["name"]
PATRONI_SERVICE_DEFAULT_PATH = "/etc/systemd/system/patroni.service"
TMP_SERVICE_PATH = "tests/integration/ha_tests/tmp.service"
RESTART_DELAY = 60 * 3
ORIGINAL_RESTART_DELAY = 30


class MemberNotListedOnClusterError(Exception):
@@ -25,7 +31,30 @@ class MemberNotUpdatedOnClusterError(Exception):


class ProcessError(Exception):
pass
"""Raised when a process fails."""


class ProcessRunningError(Exception):
"""Raised when a process is running when it is not expected to be."""


async def all_db_processes_down(ops_test: OpsTest, process: str) -> bool:
"""Verifies that all units of the charm do not have the DB process running."""
app = await app_name(ops_test)

try:
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
with attempt:
for unit in ops_test.model.applications[app].units:
_, raw_pid, _ = await ops_test.juju("ssh", unit.name, "pgrep", "-f", process)

# If something was returned, there is a running process.
if len(raw_pid) > 0:
raise ProcessRunningError
except RetryError:
return False

return True


async def app_name(ops_test: OpsTest, application_name: str = "postgresql") -> Optional[str]:
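
An aside for context (not part of the diff): the per-unit check inside all_db_processes_down is where this commit replaces `ps aux | grep` with pgrep. A minimal standalone sketch of that check, with a hypothetical helper name and placeholder arguments, could look like this:

from pytest_operator.plugin import OpsTest


async def db_process_running(ops_test: OpsTest, unit_name: str, process: str) -> bool:
    """Hypothetical helper: the pgrep-based check run against a single unit."""
    # pgrep -f matches the full command line and prints the PIDs of matching processes;
    # unlike `ps aux | grep <name>`, it does not match its own invocation.
    _, raw_pid, _ = await ops_test.juju("ssh", unit_name, "pgrep", "-f", process)
    # Non-empty output means at least one matching process is still running.
    return len(raw_pid) > 0
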
@@ -44,18 +73,26 @@ async def app_name(ops_test: OpsTest, application_name: str = "postgresql") -> Optional[str]:
return None


async def change_master_start_timeout(ops_test: OpsTest, seconds: Optional[int]) -> None:
async def change_master_start_timeout(
ops_test: OpsTest, seconds: Optional[int], use_random_unit: bool = False
) -> None:
"""Change master start timeout configuration.
Args:
ops_test: ops_test instance.
seconds: number of seconds to set in master_start_timeout configuration.
use_random_unit: whether to use a random unit (default is False,
so it uses the primary).
"""
for attempt in Retrying(stop=stop_after_delay(30 * 2), wait=wait_fixed(3)):
with attempt:
app = await app_name(ops_test)
primary_name = await get_primary(ops_test, app)
unit_ip = get_unit_address(ops_test, primary_name)
if use_random_unit:
unit = get_random_unit(ops_test, app)
unit_ip = get_unit_address(ops_test, unit)
else:
primary_name = await get_primary(ops_test, app)
unit_ip = get_unit_address(ops_test, primary_name)
requests.patch(
f"http://{unit_ip}:8008/config",
json={"master_start_timeout": seconds},
@@ -183,6 +220,11 @@ async def get_postgresql_parameter(ops_test: OpsTest, parameter_name: str) -> Op
return parameter_value


def get_random_unit(ops_test: OpsTest, app: str) -> str:
"""Returns a random unit name."""
return random.choice(ops_test.model.applications[app].units).name


async def get_password(ops_test: OpsTest, app: str, down_unit: str = None) -> str:
"""Use the charm action to retrieve the password from provided application.
@@ -358,3 +400,41 @@ async def stop_continuous_writes(ops_test: OpsTest) -> int:
)
action = await action.wait()
return int(action.results["writes"])


async def update_restart_delay(ops_test: OpsTest, unit, delay: int):
"""Updates the restart delay in the DB service file.
When the DB service fails, systemd will wait `delay` seconds before restarting it.
"""
# Load the service file from the unit and update it with the new delay.
await unit.scp_from(source=PATRONI_SERVICE_DEFAULT_PATH, destination=TMP_SERVICE_PATH)
with open(TMP_SERVICE_PATH, "r") as patroni_service_file:
patroni_service = patroni_service_file.readlines()

for index, line in enumerate(patroni_service):
if "RestartSec" in line:
patroni_service[index] = f"RestartSec={delay}s\n"

with open(TMP_SERVICE_PATH, "w") as service_file:
service_file.writelines(patroni_service)

# Upload the changed file back to the unit. We cannot scp the file directly to
# PATRONI_SERVICE_DEFAULT_PATH because that directory has strict permissions, so we scp it
# elsewhere on the unit and then move it to PATRONI_SERVICE_DEFAULT_PATH.
await unit.scp_to(source=TMP_SERVICE_PATH, destination="patroni.service")
mv_cmd = (
f"run --unit {unit.name} mv /home/ubuntu/patroni.service {PATRONI_SERVICE_DEFAULT_PATH}"
)
return_code, _, _ = await ops_test.juju(*mv_cmd.split())
if return_code != 0:
raise ProcessError("Command: %s failed on unit: %s.", mv_cmd, unit.name)

# Remove temporary file from machine.
subprocess.call(["rm", TMP_SERVICE_PATH])

# Reload the systemd daemon, otherwise the changes are not applied.
reload_cmd = f"run --unit {unit.name} systemctl daemon-reload"
return_code, _, _ = await ops_test.juju(*reload_cmd.split())
if return_code != 0:
raise ProcessError("Command: %s failed on unit: %s.", reload_cmd, unit.name)
62 changes: 62 additions & 0 deletions tests/integration/ha_tests/test_self_healing.py
@@ -1,13 +1,16 @@
#!/usr/bin/env python3
# Copyright 2022 Canonical Ltd.
# See LICENSE file for licensing details.
import asyncio

import pytest
from pytest_operator.plugin import OpsTest
from tenacity import Retrying, stop_after_delay, wait_fixed

from tests.integration.ha_tests.helpers import (
METADATA,
RESTART_DELAY,
all_db_processes_down,
app_name,
change_master_start_timeout,
change_wal_settings,
@@ -22,6 +25,7 @@
send_signal_to_process,
start_continuous_writes,
stop_continuous_writes,
update_restart_delay,
)
from tests.integration.helpers import (
db_connect,
@@ -231,6 +235,64 @@ async def test_restart_db_process(
), "secondary not up to date with the cluster after restarting."


@pytest.mark.ha_self_healing_tests
@pytest.mark.parametrize("process", [PATRONI_PROCESS])
@pytest.mark.parametrize("signal", ["SIGTERM", "SIGKILL"])
async def test_full_cluster_restart(
ops_test: OpsTest, process: str, signal: str, continuous_writes, reset_restart_delay
) -> None:
"""This tests checks that a cluster recovers from a full cluster restart.
The test can be called a full cluster crash when the signal sent to the OS process
is SIGKILL.
"""
# Start an application that continuously writes data to the database.
app = await app_name(ops_test)
await start_continuous_writes(ops_test, app)

# Update all units to have a new RESTART_DELAY. Setting the restart delay to 3 minutes
# should ensure enough time for all replicas to be down at the same time.
for unit in ops_test.model.applications[app].units:
await update_restart_delay(ops_test, unit, RESTART_DELAY)

# Restart all units "simultaneously".
await asyncio.gather(
*[
send_signal_to_process(ops_test, unit.name, process, kill_code=signal)
for unit in ops_test.model.applications[app].units
]
)

# This test verifies that when all replicas are down at the same time, they come back
# online and operate as expected. This check confirms that we meet the criterion of all
# replicas being down at the same time.
assert await all_db_processes_down(ops_test, process), "Not all units down at the same time."

# Verify all units are up and running.
for unit in ops_test.model.applications[app].units:
assert await postgresql_ready(
ops_test, unit.name
), f"unit {unit.name} not restarted after cluster restart."

writes = await count_writes(ops_test)
for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
with attempt:
more_writes = await count_writes(ops_test)
assert more_writes > writes, "writes not continuing to DB"

# Verify that all units are part of the same cluster.
member_ips = await fetch_cluster_members(ops_test)
ip_addresses = [unit.public_address for unit in ops_test.model.applications[app].units]
assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster."

# Verify that no writes to the database were missed after stopping the writes.
total_expected_writes = await stop_continuous_writes(ops_test)
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
with attempt:
actual_writes = await count_writes(ops_test)
assert total_expected_writes == actual_writes, "writes to the db were missed."


@pytest.mark.ha_self_healing_tests
async def test_forceful_restart_without_data_and_transaction_logs(
ops_test: OpsTest,
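
A note on how the new test forces a simultaneous outage: update_restart_delay rewrites the RestartSec value in patroni.service, so after the processes are killed systemd waits three minutes (RESTART_DELAY) before bringing them back, leaving enough time for the all_db_processes_down assertion; the reset_restart_delay fixture then restores the original 30 seconds. For illustration, the relevant service-file fragment after update_restart_delay(ops_test, unit, RESTART_DELAY) is assumed to look roughly like this:

# patroni.service fragment (assumed layout; only the RestartSec line is rewritten by the helper)
[Service]
Restart=on-failure
RestartSec=180s
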
16 changes: 8 additions & 8 deletions tox.ini
@@ -78,7 +78,7 @@ deps =
-r{toxinidir}/requirements.txt
commands =
# Download patroni resource to use in the charm deployment.
sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.3.tar.gz" -L -s > patroni.tar.gz'
sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.4.tar.gz" -L -s > patroni.tar.gz'
pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs} -m charm_tests
# Remove the downloaded resource.
sh -c 'rm -f patroni.tar.gz'
@@ -97,7 +97,7 @@ deps =
-r{toxinidir}/requirements.txt
commands =
# Download patroni resource to use in the charm deployment.
sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.3.tar.gz" -L -s > patroni.tar.gz'
sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.4.tar.gz" -L -s > patroni.tar.gz'
pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs} -m database_relation_tests
# Remove the downloaded resource.
sh -c 'rm -f patroni.tar.gz'
@@ -116,7 +116,7 @@ deps =
-r{toxinidir}/requirements.txt
commands =
# Download patroni resource to use in the charm deployment.
sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.3.tar.gz" -L -s > patroni.tar.gz'
sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.4.tar.gz" -L -s > patroni.tar.gz'
pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs} -m db_relation_tests
# Remove the downloaded resource.
sh -c 'rm -f patroni.tar.gz'
@@ -135,7 +135,7 @@ deps =
-r{toxinidir}/requirements.txt
commands =
# Download patroni resource to use in the charm deployment.
sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.3.tar.gz" -L -s > patroni.tar.gz'
sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.4.tar.gz" -L -s > patroni.tar.gz'
pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs} -m db_admin_relation_tests
# Remove the downloaded resource.
sh -c 'rm -f patroni.tar.gz'
@@ -154,7 +154,7 @@ deps =
-r{toxinidir}/requirements.txt
commands =
# Download patroni resource to use in the charm deployment.
sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.3.tar.gz" -L -s > patroni.tar.gz'
sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.4.tar.gz" -L -s > patroni.tar.gz'
pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs} -m ha_self_healing_tests
# Remove the downloaded resource.
sh -c 'rm -f patroni.tar.gz'
@@ -173,7 +173,7 @@ deps =
-r{toxinidir}/requirements.txt
commands =
# Download patroni resource to use in the charm deployment.
sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.3.tar.gz" -L -s > patroni.tar.gz'
sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.4.tar.gz" -L -s > patroni.tar.gz'
pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs} -m password_rotation_tests
# Remove the downloaded resource.
sh -c 'rm -f patroni.tar.gz'
@@ -192,7 +192,7 @@ deps =
-r{toxinidir}/requirements.txt
commands =
# Download patroni resource to use in the charm deployment.
sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.3.tar.gz" -L -s > patroni.tar.gz'
sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.4.tar.gz" -L -s > patroni.tar.gz'
pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs} -m tls_tests
# Remove the downloaded resource.
sh -c 'rm -f patroni.tar.gz'
@@ -211,7 +211,7 @@ deps =
-r{toxinidir}/requirements.txt
commands =
# Download patroni resource to use in the charm deployment.
sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.3.tar.gz" -L -s > patroni.tar.gz'
sh -c 'stat patroni.tar.gz > /dev/null 2>&1 || curl "https://github.com/zalando/patroni/archive/refs/tags/v2.1.4.tar.gz" -L -s > patroni.tar.gz'
pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs}
# Remove the downloaded resource.
sh -c 'rm -f patroni.tar.gz'
