Skip to content

Commit

Permalink
Update cluster_has_replica to account for the quorum_standby state
Browse files Browse the repository at this point in the history
  • Loading branch information
blogh committed Dec 24, 2024
1 parent 632dd44 commit 7f2dc34
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 9 deletions.
26 changes: 21 additions & 5 deletions check_patroni/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,13 @@ def cluster_has_leader(ctx: click.Context) -> None:
type=str,
help="Critical threshold for the number of sync replica.",
)
@click.option(
"--sync-type",
type=click.Choice(["any", "sync", "quorum"], case_sensitive=True),
default="any",
show_default=True,
help="Synchronous replication mode used to filter and count sync standbies.",
)
@click.option("--max-lag", "max_lag", type=str, help="maximum allowed lag")
@click.pass_context
@nagiosplugin.guarded
Expand All @@ -370,9 +377,10 @@ def cluster_has_replica(
critical: str,
sync_warning: str,
sync_critical: str,
sync_type: str,
max_lag: str,
) -> None:
"""Check if the cluster has healthy replicas and/or if some are sync standbies
"""Check if the cluster has healthy replicas and/or if some are sync or quorum standbies
\b
For patroni (and this check):
Expand All @@ -381,7 +389,7 @@ def cluster_has_replica(
\b
A healthy replica:
* has a `replica` or `sync_standby` role
* has a `replica`, `quorum_standby` or `sync_standby` role
* has the same timeline as the leader and
* is in `running` state (patroni < V3.0.4)
* is in `streaming` or `in archive recovery` state (patroni >= V3.0.4)
Expand All @@ -398,16 +406,24 @@ def cluster_has_replica(
switchover or failover and the standbies are in the process of catching up with
the new leader. The alert shouldn't last long.
In PostgreSQL, synchronous replication has two modes, on and quorum, and is
configured with the GUCs `synchronous_standby_names` and `synchronous_commit`. Patroni
uses the parameter `synchronous_mode`, which can be set to `on`, `quorum` or `off`,
and has `synchronous_node_count` to configure the synchronous replication factor.
Please note that, in synchronous replication, the number of servers tagged as "{sync|quorum}_standby"
(what we measure) is not always equal to `synchronous_node_count`.
\b
Check:
* `OK`: if the healthy_replica count and their lag are compatible with the replica count threshold.
and if the sync_replica count is compatible with the sync replica count threshold.
and if the synchronous replica count is compatible with the sync replica count threshold.
* `WARNING` / `CRITICAL`: otherwise
\b
Perfdata:
* healthy_replica & unhealthy_replica count
* the number of sync_replica, they are included in the previous count
* the number of sync_replica (sync or quorum depending on `--sync-type`), they are included
in the previous count
* the lag of each replica labelled with "member name"_lag
* the timeline of each replica labelled with "member name"_timeline
* a boolean to tell if the node is a sync standby labelled with "member name"_sync
Expand All @@ -416,7 +432,7 @@ def cluster_has_replica(
tmax_lag = size_to_byte(max_lag) if max_lag is not None else None
check = nagiosplugin.Check()
check.add(
ClusterHasReplica(ctx.obj.connection_info, tmax_lag),
ClusterHasReplica(ctx.obj.connection_info, tmax_lag, sync_type),
nagiosplugin.ScalarContext(
"healthy_replica",
warning,
Expand Down
21 changes: 17 additions & 4 deletions check_patroni/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,12 @@ def problem(self, results: nagiosplugin.Result) -> str:


class ClusterHasReplica(PatroniResource):
def __init__(self, connection_info: ConnectionInfo, max_lag: Union[int, None]):
def __init__(
    self, connection_info: ConnectionInfo, max_lag: Union[int, None], sync_type: str
):
    """Store the settings used when probing the cluster for replicas.

    :param connection_info: forwarded unchanged to the parent resource.
    :param max_lag: lag threshold kept on the instance; ``None`` is
        accepted and stored as-is.
    :param sync_type: which synchronous standby roles should be counted
        (the command line offers ``any``, ``sync`` and ``quorum``).
    """
    super().__init__(connection_info)
    # The two settings are independent; they are only read later on.
    self.sync_type = sync_type
    self.max_lag = max_lag

def probe(self) -> Iterable[nagiosplugin.Metric]:
def debug_member(member: Any, health: str) -> None:
Expand All @@ -161,7 +164,7 @@ def debug_member(member: Any, health: str) -> None:

# Look for replicas
for member in cluster_item_dict["members"]:
if member["role"] in ["replica", "sync_standby"]:
if member["role"] in ["replica", "sync_standby", "quorum_standby"]:
if member["lag"] == "unknown":
# This could happen if the node is stopped
# nagiosplugin doesn't handle strings in perfstats
Expand All @@ -175,7 +178,11 @@ def debug_member(member: Any, health: str) -> None:
"name": member["name"],
"lag": member["lag"],
"timeline": member["timeline"],
"sync": 1 if member["role"] == "sync_standby" else 0,
"sync": (
1
if member["role"] in ["sync_standby", "quorum_standby"]
else 0
),
}
)

Expand Down Expand Up @@ -214,7 +221,13 @@ def debug_member(member: Any, health: str) -> None:
unhealthy_replica += 1
continue

if member["role"] == "sync_standby":
if (
self.sync_type in ["sync", "any"]
and member["role"] == "sync_standby"
) or (
self.sync_type in ["quorum", "any"]
and member["role"] == "quorum_standby"
):
sync_replica += 1

if self.max_lag is None or self.max_lag >= int(member["lag"]):
Expand Down
33 changes: 33 additions & 0 deletions tests/json/cluster_has_replica_ok_quorum.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"members": [
{
"name": "srv1",
"role": "leader",
"state": "running",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "quorum_standby",
"state": "streaming",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "quorum_standby",
"state": "streaming",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
]
}
53 changes: 53 additions & 0 deletions tests/test_cluster_has_replica.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,59 @@ def test_cluster_has_relica_ok(runner: CliRunner, patroni_api: PatroniAPI) -> No
assert result.exit_code == 0


@pytest.fixture
def cluster_has_replica_ok_sync(
    patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
) -> Iterator[None]:
    """Route the fake Patroni API to the ``cluster_has_replica_ok`` payloads.

    When ``old_replica_state`` is true, the cluster file is first passed
    through ``cluster_api_set_replica_running`` and the 3.0.0 version payload
    is served; otherwise the stock cluster file and the 3.1.0 version payload
    are used. The routes stay active for the duration of the test.
    """
    if old_replica_state:
        # NOTE(review): presumably rewrites replica states to `running` for
        # older Patroni versions -- confirm against the helper.
        cluster_route: Union[str, Path] = cluster_api_set_replica_running(
            datadir / "cluster_has_replica_ok.json", tmp_path
        )
        patroni_route = "cluster_has_replica_patroni_verion_3.0.0.json"
    else:
        cluster_route = "cluster_has_replica_ok.json"
        patroni_route = "cluster_has_replica_patroni_verion_3.1.0.json"

    served = {"cluster": cluster_route, "patroni": patroni_route}
    with patroni_api.routes(served):
        yield


@pytest.mark.usefixtures("cluster_has_replica_ok_sync")
def test_cluster_has_relica_ok_sync(runner: CliRunner, patroni_api: PatroniAPI) -> None:
    """Run ``cluster_has_replica --sync-type sync`` and pin its exact output.

    The expected perfdata reports two healthy replicas, with srv3 as the only
    one flagged as sync (``sync_replica=1``).
    """
    cli_args = [
        "-e",
        patroni_api.endpoint,
        "cluster_has_replica",
        "--sync-type",
        "sync",
    ]
    expected_stdout = "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=0 srv2_timeline=51 srv3_lag=0 srv3_sync=1 srv3_timeline=51 sync_replica=1 unhealthy_replica=0\n"

    outcome = runner.invoke(main, cli_args)

    assert outcome.stdout == expected_stdout
    assert outcome.exit_code == 0


@pytest.fixture
def cluster_has_replica_ok_quorum(
    patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
) -> Iterator[None]:
    """Route the fake Patroni API to the ``cluster_has_replica_ok_quorum`` payloads.

    When ``old_replica_state`` is true, the cluster file is first passed
    through ``cluster_api_set_replica_running`` and the 3.0.0 version payload
    is served; otherwise the stock cluster file and the 3.1.0 version payload
    are used. The routes stay active for the duration of the test.
    """
    if old_replica_state:
        # NOTE(review): presumably rewrites replica states to `running` for
        # older Patroni versions -- confirm against the helper.
        cluster_route: Union[str, Path] = cluster_api_set_replica_running(
            datadir / "cluster_has_replica_ok_quorum.json", tmp_path
        )
        patroni_route = "cluster_has_replica_patroni_verion_3.0.0.json"
    else:
        cluster_route = "cluster_has_replica_ok_quorum.json"
        patroni_route = "cluster_has_replica_patroni_verion_3.1.0.json"

    served = {"cluster": cluster_route, "patroni": patroni_route}
    with patroni_api.routes(served):
        yield


@pytest.mark.usefixtures("cluster_has_replica_ok_quorum")
def test_cluster_has_relica_ok_qorum(
    runner: CliRunner, patroni_api: PatroniAPI
) -> None:
    """Run ``cluster_has_replica --sync-type quorum`` and pin its exact output.

    The expected perfdata reports two healthy replicas, both flagged as sync
    (``sync_replica=2``).
    """
    cli_args = [
        "-e",
        patroni_api.endpoint,
        "cluster_has_replica",
        "--sync-type",
        "quorum",
    ]
    expected_stdout = "CLUSTERHASREPLICA OK - healthy_replica is 2 | healthy_replica=2 srv2_lag=0 srv2_sync=1 srv2_timeline=51 srv3_lag=0 srv3_sync=1 srv3_timeline=51 sync_replica=2 unhealthy_replica=0\n"

    outcome = runner.invoke(main, cli_args)

    assert outcome.stdout == expected_stdout
    assert outcome.exit_code == 0


@pytest.mark.usefixtures("cluster_has_replica_ok")
def test_cluster_has_replica_ok_with_count_thresholds(
runner: CliRunner, patroni_api: PatroniAPI
Expand Down

0 comments on commit 7f2dc34

Please sign in to comment.