Skip to content

Commit

Permalink
Fixes for machines restart (#47)
Browse files Browse the repository at this point in the history
* Fix service restart at boot

* Add fix for TLS files and postgres user

* Add integration test

* Fix unit test

* Fix ubuntu version and update charming actions

* Move test

* Remove previous test

* Revert change on unit test

* Minor fixes

* Fix unit tests

* Modify condition

* Update Juju version

* Setup tmate session

* Add Waiting Status

* Add defer call

* Change reboot command

* Add waiting status

* Remove incorrect defer call

* Add replica related changes

* Simplify test

* Simplify code and test

* Add comment about latest juju 2

* Update charming actions

* Make test optional

* Add unit tests

* Add docstring and type hints

* Some fixes

* Minor fixes

* Remove prints

* Fix bundle deployment

* Add additional unit tests
  • Loading branch information
marceloneppel authored Jan 24, 2023
1 parent acc1dff commit 6d4b41d
Show file tree
Hide file tree
Showing 8 changed files with 280 additions and 22 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ jobs:
with:
fetch-depth: 0
- name: Select charmhub channel
uses: canonical/charming-actions/channel@2.0.0-rc
uses: canonical/charming-actions/channel@2.2.0
id: channel
- name: Upload charm to charmhub
uses: canonical/charming-actions/upload-charm@2.0.0-rc
uses: canonical/charming-actions/upload-charm@2.2.0
with:
credentials: "${{ secrets.CHARMHUB_TOKEN }}"
github-token: "${{ secrets.GITHUB_TOKEN }}"
Expand Down
85 changes: 73 additions & 12 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,13 @@
ActionEvent,
CharmBase,
ConfigChangedEvent,
InstallEvent,
LeaderElectedEvent,
RelationChangedEvent,
RelationDepartedEvent,
StartEvent,
)
from ops.framework import EventBase
from ops.main import main
from ops.model import (
ActiveStatus,
Expand Down Expand Up @@ -514,8 +517,12 @@ def _unit_ip(self) -> str:
"""Current unit ip."""
return str(self.model.get_binding(PEER).network.bind_address)

def _on_install(self, event) -> None:
def _on_install(self, event: InstallEvent) -> None:
"""Install prerequisites for the application."""
if not self._is_storage_attached():
self._reboot_on_detached_storage(event)
return

self.unit.status = MaintenanceStatus("installing PostgreSQL")

# Prevent the default cluster creation.
Expand Down Expand Up @@ -609,12 +616,23 @@ def _get_ips_to_remove(self) -> Set[str]:
current = self._units_ips
return old - current

def _on_start(self, event) -> None:
"""Handle the start event."""
def _can_start(self, event: StartEvent) -> bool:
"""Returns whether the workload can be started on this unit."""
if not self._is_storage_attached():
self._reboot_on_detached_storage(event)
return False

# Doesn't try to bootstrap the cluster if it's in a blocked state
# caused, for example, because a failed installation of packages.
if self._has_blocked_status:
logger.debug("Early exit on_start: Unit blocked")
return False

return True

def _on_start(self, event: StartEvent) -> None:
"""Handle the start event."""
if not self._can_start(event):
return

postgres_password = self._get_password()
Expand All @@ -626,18 +644,17 @@ def _on_start(self, event) -> None:
event.defer()
return

if not self.unit.is_leader() and "cluster_initialised" not in self._peers.data[self.app]:
logger.debug("Deferring on_start: awaiting for cluster to start")
self.unit.status = WaitingStatus("awaiting for cluster to start")
event.defer()
return

# Only the leader can bootstrap the cluster.
# On replicas, only prepare for starting the instance later.
if not self.unit.is_leader():
self._patroni.configure_patroni_on_unit()
event.defer()
self._start_replica(event)
return

# Bootstrap the cluster in the leader unit.
self._start_primary(event)

def _start_primary(self, event: StartEvent) -> None:
"""Bootstrap the cluster."""
# Set some information needed by Patroni to bootstrap the cluster.
if not self._patroni.bootstrap_cluster():
self.unit.status = BlockedStatus("failed to start Patroni")
Expand All @@ -653,7 +670,10 @@ def _on_start(self, event) -> None:
# Create the default postgres database user that is needed for some
# applications (not charms) like Landscape Server.
try:
self.postgresql.create_user("postgres", new_password(), admin=True)
# This event can be run on a replica if the machines are restarted.
# For that case, check whether the postgres user already exits.
if "postgres" not in self.postgresql.list_users():
self.postgresql.create_user("postgres", new_password(), admin=True)
except PostgreSQLCreateUserError as e:
logger.exception(e)
self.unit.status = BlockedStatus("Failed to create postgres user")
Expand All @@ -665,6 +685,23 @@ def _on_start(self, event) -> None:
self._peers.data[self.app]["cluster_initialised"] = "True"
self.unit.status = ActiveStatus()

def _start_replica(self, event) -> None:
"""Configure the replica if the cluster was already initialised."""
if "cluster_initialised" not in self._peers.data[self.app]:
logger.debug("Deferring on_start: awaiting for cluster to start")
self.unit.status = WaitingStatus("awaiting for cluster to start")
event.defer()
return

# Member already started, so we can set an ActiveStatus.
# This can happen after a reboot.
if self._patroni.member_started:
self.unit.status = ActiveStatus()
return

# Configure Patroni in the replica but don't start it yet.
self._patroni.configure_patroni_on_unit()

def _on_get_password(self, event: ActionEvent) -> None:
"""Returns the password for a user as an action response.
Expand Down Expand Up @@ -826,6 +863,14 @@ def _install_pip_packages(self, packages: List[str]) -> None:
logger.error("could not install pip packages")
raise

def _is_storage_attached(self) -> bool:
"""Returns if storage is attached."""
try:
subprocess.check_call(["mountpoint", "-q", self._storage_path])
return True
except subprocess.CalledProcessError:
return False

@property
def _peers(self) -> Relation:
"""Fetch the peer relation.
Expand All @@ -848,6 +893,22 @@ def push_tls_files_to_workload(self) -> None:

self.update_config()

def _reboot_on_detached_storage(self, event: EventBase) -> None:
"""Reboot on detached storage.
Workaround for lxd containers not getting storage attached on startups.
Args:
event: the event that triggered this handler
"""
event.defer()
logger.error("Data directory not attached. Reboot unit.")
self.unit.status = WaitingStatus("Data directory not attached")
try:
subprocess.check_call(["systemctl", "reboot"])
except subprocess.CalledProcessError:
pass

def _restart(self, _) -> None:
"""Restart PostgreSQL."""
try:
Expand Down
3 changes: 3 additions & 0 deletions src/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from charms.operator_libs_linux.v1.systemd import (
daemon_reload,
service_restart,
service_resume,
service_running,
service_start,
)
Expand Down Expand Up @@ -370,7 +371,9 @@ def start_patroni(self) -> bool:
Returns:
Whether the service started successfully.
"""
# Start the service and enable it to start at boot.
service_start(PATRONI_SERVICE)
service_resume(PATRONI_SERVICE)
return service_running(PATRONI_SERVICE)

def switchover(self) -> None:
Expand Down
18 changes: 17 additions & 1 deletion tests/integration/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# See LICENSE file for licensing details.
import itertools
import json
import subprocess
import tempfile
import zipfile
from datetime import datetime
Expand Down Expand Up @@ -318,7 +319,7 @@ async def deploy_and_relate_bundle_with_postgresql(
# Relate application to PostgreSQL.
relation = await ops_test.model.relate(f"{application_name}", f"{DATABASE_APP_NAME}:db-admin")
await ops_test.model.wait_for_idle(
apps=[application_name],
apps=[application_name, DATABASE_APP_NAME],
status="active",
timeout=1000,
)
Expand Down Expand Up @@ -620,6 +621,21 @@ async def primary_changed(ops_test: OpsTest, old_primary: str) -> bool:
return primary != old_primary


async def restart_machine(ops_test: OpsTest, unit_name: str) -> None:
"""Restart the machine where a unit run on.
Args:
ops_test: The ops test framework instance
unit_name: The name of the unit to restart the machine
"""
hostname_command = f"run --unit {unit_name} -- hostname"
return_code, raw_hostname, _ = await ops_test.juju(*hostname_command.split())
if return_code != 0:
raise Exception("Failed to get the unit machine name: %s", return_code)
restart_machine_command = f"lxc restart {raw_hostname.strip()}"
subprocess.check_call(restart_machine_command.split())


async def run_command_on_unit(ops_test: OpsTest, unit_name: str, command: str) -> str:
"""Run a command on a specific unit.
Expand Down
54 changes: 53 additions & 1 deletion tests/integration/test_tls.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
#!/usr/bin/env python3
# Copyright 2022 Canonical Ltd.
# See LICENSE file for licensing details.
import logging
import os

import pytest as pytest
from pytest_operator.plugin import OpsTest
from tenacity import Retrying, stop_after_delay, wait_exponential
from tenacity import Retrying, stop_after_attempt, stop_after_delay, wait_exponential

from tests.helpers import METADATA
from tests.integration.helpers import (
Expand All @@ -18,9 +20,12 @@
get_primary,
get_unit_address,
primary_changed,
restart_machine,
run_command_on_unit,
)

logger = logging.getLogger(__name__)

APP_NAME = METADATA["name"]
TLS_CERTIFICATES_APP_NAME = "tls-certificates-operator"

Expand Down Expand Up @@ -135,3 +140,50 @@ async def test_tls_enabled(ops_test: OpsTest) -> None:
for unit in ops_test.model.applications[DATABASE_APP_NAME].units:
assert await check_tls(ops_test, unit.name, enabled=False)
assert await check_tls_patroni_api(ops_test, unit.name, enabled=False)


@pytest.mark.skipif(
not os.environ.get("RESTART_MACHINE_TEST"),
reason="RESTART_MACHINE_TEST environment variable not set",
)
@pytest.mark.tls_tests
async def test_restart_machine(ops_test: OpsTest) -> None:
async with ops_test.fast_forward():
# Relate it to the PostgreSQL to enable TLS.
await ops_test.model.relate(DATABASE_APP_NAME, TLS_CERTIFICATES_APP_NAME)
await ops_test.model.wait_for_idle(status="active", timeout=1000)

# Wait for all units enabling TLS.
for unit in ops_test.model.applications[DATABASE_APP_NAME].units:
assert await check_tls(ops_test, unit.name, enabled=True)
assert await check_tls_patroni_api(ops_test, unit.name, enabled=True)

unit_name = "postgresql/0"
issue_found = False
for attempt in Retrying(stop=stop_after_attempt(10)):
with attempt:
# Restart the machine of the unit.
logger.info(f"restarting {unit_name}")
await restart_machine(ops_test, unit_name)

# Check whether the issue happened (the storage wasn't mounted).
logger.info(
f"checking whether storage was mounted - attempt {attempt.retry_state.attempt_number}"
)
result = await run_command_on_unit(ops_test, unit_name, "lsblk")
if "/var/lib/postgresql/data" not in result:
issue_found = True

assert (
issue_found
), "Couldn't reproduce the issue from https://bugs.launchpad.net/juju/+bug/1999758"

# Wait for the unit to be ready again. Some errors in the start hook may happen due
# to rebooting in the middle of a hook.
await ops_test.model.wait_for_idle(status="active", timeout=1000, raise_on_error=False)

# Wait for the unit enabling TLS again.
logger.info(f"checking TLS on {unit_name}")
assert await check_tls(ops_test, "postgresql/0", enabled=True)
logger.info(f"checking TLS on Patroni API from {unit_name}")
assert await check_tls_patroni_api(ops_test, "postgresql/0", enabled=True)
Loading

0 comments on commit 6d4b41d

Please sign in to comment.