Skip to content

Commit

Permalink
Enable full cluster crash and restart test and changes to somewhat st…
Browse files Browse the repository at this point in the history
…abilise the test suite (#55)

* Pass prebuilt charm to tests for local execution

* Enable postgres process for full cluster restart test

* Revert "Pass prebuilt charm to tests for local execution"

This reverts commit cc7f199.

* Bump up ops framework

* Add retries when dropping continuous_writes table

* Terminating continuous_writes in test charm

* Version bump for deps

* Revert "Version bump for deps"

This reverts commit 8e70a6f.

* Version bump for libs

* Switch test app charm to peer relation for storing cont writes PId

* Code review tweaks

* Don't count on endpoints change

* Syncmode (#64)

* Syncmode

* Nuke pgconf

* Refactor

* Fake strict sync mode

* Update charm.py

* FFWD cleanup

* Always fail

* Update helpers.py

* Add observer that updates endpoints on relation (#58)

* Pass prebuilt charm to tests for local execution

* Enable postgres process for full cluster restart test

* Revert "Pass prebuilt charm to tests for local execution"

This reverts commit cc7f199.

* Add observer that updates endpoints on relation

* Fixing logic

* Bump up ops framework

* Fix the logic to update the unit databag when it's cleared on another unit

* Fix observer start

* Fixes in the test

* Fix part of the unit tests

* Fix remaining unit tests

* Minor adjustments in the code

* Fix machine start

* Add copyright notice

* Add retries when dropping continuous_writes table

* Terminating continuous_writes in test charm

* Version bump for deps

* Revert "Version bump for deps"

This reverts commit 8e70a6f.

* Version bump for libs

* Switch test app charm to peer relation for storing cont writes PId

* Code review tweaks

* Fix TLS test

* Comment logs

* Fix get_member_ip

* Add a call to relation endpoints update

* Fix and revert changes

* Change sleep time

* Revert changes

* Minor fixes

* Revert lib changes

* Revert requirements changes

* Uncomment charms removal

* Remove log calls

* Add unit tests to the observer

* Fix incorrect user deletion

* Fix unit tests

---------

Co-authored-by: Dragomir Penev <[email protected]>

* Pass down unit for kill and restart tests

* Reset master_start_timeout just once

* Tweaks

* Increase wait for test_forceful_restart_without_data_and_transaction_logs

* Only check writes from sync standby or primary

* Tweak helpers and disable freeze postgresql process

* Wait a bit on first count during restart tests

* Reenable freeze postgresql process

* Don't accept string None in _connection_string

---------

Co-authored-by: Marcelo Henrique Neppel <[email protected]>

* Disable synchronous_mode_strict (#70)

* Disable synchronous_mode_strict

* Custom failover event

* Don't specify switchover candidate

* HA tweaks

* Check for primary before dropping continuous_writes

* Unit tests

* Dropping and creating test table

* Update src/cluster.py

Co-authored-by: Marcelo Henrique Neppel <[email protected]>

---------

Co-authored-by: Marcelo Henrique Neppel <[email protected]>

* increase test_forceful_restart_without_data_and_transaction_logs timeout

* Set synchronous_node_count to cluster minority count (#72)

* Revert autofailover

* Update sync_node_count

* Don't rely on planned_units when downscaling

* Wait for standbys

* Handle error on update_synchronous_node_count during scale down

* Update tests/integration/ha_tests/application-charm/src/charm.py

Co-authored-by: Mykola Marzhan <[email protected]>

* Code review tweaks

---------

Co-authored-by: Marcelo Henrique Neppel <[email protected]>
Co-authored-by: Mykola Marzhan <[email protected]>
  • Loading branch information
3 people authored Mar 6, 2023
1 parent 17e902c commit 48e00c3
Show file tree
Hide file tree
Showing 17 changed files with 403 additions and 259 deletions.
25 changes: 20 additions & 5 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@

logger = logging.getLogger(__name__)

NO_PRIMARY_MESSAGE = "no primary in the cluster"
CREATE_CLUSTER_CONF_PATH = "/etc/postgresql-common/createcluster.d/pgcharm.conf"


Expand Down Expand Up @@ -198,6 +199,15 @@ def _on_get_primary(self, event: ActionEvent) -> None:
except RetryError as e:
logger.error(f"failed to get primary with error {e}")

def _updated_synchronous_node_count(self, num_units: int = None) -> bool:
"""Tries to update synchronous_node_count configuration and reports the result."""
try:
self._patroni.update_synchronous_node_count(num_units)
return True
except RetryError:
logger.debug("Unable to set synchronous_node_count")
return False

def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None:
"""The leader removes the departing units from the list of cluster members."""
# Don't handle this event in the same unit that is departing.
Expand All @@ -221,7 +231,9 @@ def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None:
if not self.unit.is_leader():
return

if "cluster_initialised" not in self._peers.data[self.app]:
if "cluster_initialised" not in self._peers.data[
self.app
] or not self._updated_synchronous_node_count(len(self._units_ips)):
logger.debug("Deferring on_peer_relation_departed: cluster not initialized")
event.defer()
return
Expand All @@ -241,7 +253,7 @@ def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None:
if self.primary_endpoint:
self._update_relation_endpoints()
else:
self.unit.status = BlockedStatus("no primary in the cluster")
self.unit.status = BlockedStatus(NO_PRIMARY_MESSAGE)
return

def _on_pgdata_storage_detaching(self, _) -> None:
Expand Down Expand Up @@ -334,7 +346,7 @@ def _on_peer_relation_changed(self, event: RelationChangedEvent):
self._update_relation_endpoints()
self.unit.status = ActiveStatus()
else:
self.unit.status = BlockedStatus("no primary in the cluster")
self.unit.status = BlockedStatus(NO_PRIMARY_MESSAGE)

def _add_members(self, event):
"""Add new cluster members.
Expand All @@ -357,6 +369,7 @@ def _add_members(self, event):
for member in self._hosts - self._patroni.cluster_members:
logger.debug("Adding %s to cluster", member)
self.add_cluster_member(member)
self._patroni.update_synchronous_node_count()
except NotReadyError:
logger.info("Deferring reconfigure: another member doing sync right now")
event.defer()
Expand Down Expand Up @@ -523,6 +536,9 @@ def _on_cluster_topology_change(self, _):
logger.info("Cluster topology changed")
self._update_relation_endpoints()
self._update_certificate()
if self.is_blocked and self.unit.status.message == NO_PRIMARY_MESSAGE:
if self.primary_endpoint:
self.unit.status = ActiveStatus()

def _on_install(self, event: InstallEvent) -> None:
"""Install prerequisites for the application."""
Expand Down Expand Up @@ -572,7 +588,6 @@ def _inhibit_default_cluster_creation(self) -> None:
os.makedirs(os.path.dirname(CREATE_CLUSTER_CONF_PATH), mode=0o755, exist_ok=True)
with open(CREATE_CLUSTER_CONF_PATH, mode="w") as file:
file.write("create_main_cluster = false\n")
file.write(f"include '{self._storage_path}/conf.d/postgresql-operator.conf'")

def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
"""Handle the leader-elected event."""
Expand Down Expand Up @@ -608,7 +623,7 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
if self.primary_endpoint:
self._update_relation_endpoints()
else:
self.unit.status = BlockedStatus("no primary in the cluster")
self.unit.status = BlockedStatus(NO_PRIMARY_MESSAGE)

def _on_config_changed(self, event: ConfigChangedEvent) -> None:
"""Install additional packages through APT."""
Expand Down
40 changes: 23 additions & 17 deletions src/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ class SwitchoverFailedError(Exception):
"""Raised when a switchover failed for some reason."""


class UpdateSyncNodeCountError(Exception):
    """Raised when a PATCH of synchronous_node_count to the Patroni config API fails (non-200 response)."""


class Patroni:
"""This class handles the bootstrap of a PostgreSQL database through Patroni."""

Expand All @@ -81,8 +85,8 @@ def __init__(
storage_path: path to the storage mounted on this unit
cluster_name: name of the cluster
member_name: name of the member inside the cluster
peers_ips: IP addresses of the peer units
planned_units: number of units planned for the cluster
peers_ips: IP addresses of the peer units
superuser_password: password for the operator user
replication_password: password for the user used in the replication
rewind_password: password for the user used on rewinds
Expand Down Expand Up @@ -123,7 +127,6 @@ def configure_patroni_on_unit(self):
self._render_patroni_service_file()
# Reload systemd services before trying to start Patroni.
daemon_reload()
self.render_postgresql_conf_file()

def _change_owner(self, path: str) -> None:
"""Change the ownership of a file or a directory to the postgres user.
Expand Down Expand Up @@ -358,24 +361,10 @@ def render_patroni_yml_file(self, enable_tls: bool = False, stanza: str = None)
enable_pgbackrest=stanza is not None,
stanza=stanza,
version=self._get_postgresql_version(),
minority_count=self.planned_units // 2,
)
self.render_file(f"{self.storage_path}/patroni.yml", rendered, 0o644)

def render_postgresql_conf_file(self) -> None:
    """Write the PostgreSQL configuration file rendered from its Jinja template."""
    # Load the postgresql.conf Jinja template shipped with the charm.
    with open("templates/postgresql.conf.j2", "r") as template_file:
        conf_template = Template(template_file.read())
    # Synchronous commit is only meaningful when at least one standby exists.
    sync_commit = "on" if self.planned_units > 1 else "off"
    # TODO: add extra configurations here later.
    rendered = conf_template.render(
        listen_addresses="*",
        synchronous_commit=sync_commit,
        synchronous_standby_names="*",
    )
    conf_dir = f"{self.storage_path}/conf.d"
    self._create_directory(conf_dir, mode=0o644)
    self.render_file(f"{conf_dir}/postgresql-operator.conf", rendered, 0o644)

def start_patroni(self) -> bool:
"""Start Patroni service using systemd.
Expand Down Expand Up @@ -463,3 +452,20 @@ def restart_postgresql(self) -> None:
def reinitialize_postgresql(self) -> None:
    """Reinitialize PostgreSQL on this unit via Patroni's REST API.

    Fire-and-forget: the HTTP response is not checked, so a failed
    reinitialize request is silently ignored.
    # NOTE(review): per Patroni's REST API, POST /reinitialize re-syncs this
    # member's data directory from the current leader — confirm against the
    # Patroni version in use.
    """
    requests.post(f"{self._patroni_url}/reinitialize", verify=self.verify)

def update_synchronous_node_count(self, units: int = None) -> None:
    """Update synchronous_node_count to the minority of the planned cluster.

    Args:
        units: cluster size to base the minority calculation on; defaults to
            self.planned_units when not provided.

    Raises:
        RetryError: when Patroni keeps answering with a non-200 status (the
            per-attempt UpdateSyncNodeCountError is wrapped by tenacity) or
            the request keeps failing for the whole 60s retry window.
    """
    if units is None:
        units = self.planned_units
    # Patroni applies synchronous_node_count dynamically via PATCH /config.
    # Retry for up to 60 seconds, pausing 3 seconds between attempts.
    for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
        with attempt:
            r = requests.patch(
                f"{self._patroni_url}/config",
                # Minority count: less than half the units are synchronous.
                json={"synchronous_node_count": units // 2},
                verify=self.verify,
            )

            # Raise inside the retry scope so a transient non-200 response is
            # retried; a persistent failure then surfaces as RetryError, which
            # is what callers (e.g. _updated_synchronous_node_count) catch.
            if r.status_code != 200:
                raise UpdateSyncNodeCountError(f"received {r.status_code}")
4 changes: 4 additions & 0 deletions templates/patroni.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,15 @@ bootstrap:
loop_wait: 10
retry_timeout: 10
maximum_lag_on_failover: 1048576
synchronous_mode: true
synchronous_node_count: {{ minority_count }}
postgresql:
use_pg_rewind: true
remove_data_directory_on_rewind_failure: true
remove_data_directory_on_diverged_timelines: true
parameters:
synchronous_commit: on
synchronous_standby_names: "*"
{%- if enable_pgbackrest %}
archive_command: 'pgbackrest --stanza={{ stanza }} archive-push %p'
{% else %}
Expand Down
8 changes: 0 additions & 8 deletions templates/postgresql.conf.j2

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ bases:
channel: "22.04"
parts:
charm:
charm-binary-python-packages: [psycopg2-binary==2.9.3]
charm-binary-python-packages: [psycopg2-binary==2.9.5]
4 changes: 4 additions & 0 deletions tests/integration/ha_tests/application-charm/metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,7 @@ requires:
database:
interface: postgresql_client
limit: 1

peers:
application-peers:
interface: application-peers
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
ops >= 1.5.0
ops >= 2.0.0
tenacity==8.1.0
Loading

0 comments on commit 48e00c3

Please sign in to comment.