From 9f75ab16e4c00be2467b3a4a15719ba2b6f47ea4 Mon Sep 17 00:00:00 2001 From: A Codeweavers Infrastructure Bod <36475663+infraweavers@users.noreply.github.com> Date: Wed, 21 Nov 2018 15:15:22 +0000 Subject: [PATCH 1/5] Adding check_streaming_delta This lets a pginstance be monitored based on receiving and sending WAL, so we can monitor the delay between our cascading replication servers. We've also added the ability to restrict the check to servers in the same network as the cluster being checked, which is useful for checking only local replicas if you have a hub-and-spoke-style replication strategy --- check_postgres.pl | 55 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/check_postgres.pl b/check_postgres.pl index ef587279..6b653520 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -1534,6 +1534,7 @@ package check_postgres; 'replace', ## used by same_schema only 'lsfunc=s', ## used by wal_files and archive_ready 'skipcycled', ## used by sequence only + 'netmasklength=i' ## used by streaming_delta only ); die $USAGE if ! keys %opt and ! @ARGV; @@ -2498,6 +2499,9 @@ sub finishup { ## Check the replication delay in hot standby setup check_hot_standby_delay() if $action eq 'hot_standby_delay'; +# Check the delay between two standby servers (useful for cascading replication) +check_streaming_delta() if $action eq 'streaming_delta'; + ## Check the delay on replication slots.
warning and critical are sizes check_replication_slots() if $action eq 'replication_slots'; @@ -5483,6 +5487,57 @@ sub check_hot_standby_delay { } ## end of check_hot_standby_delay + +sub check_streaming_delta { + my ($critical, $warning) = ($opt{critical}, $opt{warning}); + + ## Check on the delay in PITR replication between the WAL received + ## and the WAL passed on to the cascading replicas + ## if the subnet mask is passed in it will only check against servers + ## that are in the same subnet as the postgres instance based on that + ## subnet mask + + $SQL = q{SELECT application_name, client_addr, pid, + sent_location, write_location, flush_location, replay_location, + CASE pg_is_in_recovery() WHEN true THEN pg_last_xlog_receive_location() ELSE pg_current_xlog_location() END AS master_location + FROM pg_stat_replication }; + if ($opt{netmasklength}) { + my $netmask_length = $opt{netmasklength}; + $SQL .= "WHERE network(set_masklen(client_addr,$netmask_length)) = network(set_masklen(inet_server_addr(),$netmask_length))"; + } + my $info = run_command($SQL); + for $db (@{$info->{db}}) { + for my $row (@{$db->{slurp}}) { + my ($a, $b) = split(/\//, $row->{'master_location'}); + my $master_location = (hex('ff000000') * hex($a)) + hex($b); + + for my $wal_type (qw/sent write flush replay/) + { + ($a, $b) = split(/\//, $row->{'sent_location'}); + my $slave_position = (hex('ff000000') * hex($a)) + hex($b); + + my $slave_lag = $master_location - $slave_position; + + $db->{perf} .= "$row->{'client_addr'}_$wal_type=$slave_lag;$warning;$critical; "; + + if (length $critical and $slave_lag >= $critical) { + add_critical "CRITICAL for : $row->{'client_addr'} - $row->{'application_name'} - $wal_type"; + } + elsif (length $warning and $slave_lag >= $warning) { + add_warning "WARNING for : $row->{'client_addr'} - $row->{'application_name'} - $wal_type"; + } + else { + add_ok "OK for : $row->{'client_addr'} - $row->{'application_name'} - $wal_type"; + } + } + } + } + + return;
+ +} ## end of check_streaming_delta + + sub check_replication_slots { ## Check the delay on one or more replication slots From 1cac62c50615a8774d8b316a1255676a67a3c5a2 Mon Sep 17 00:00:00 2001 From: A Codeweavers Infrastructure Bod <36475663+infraweavers@users.noreply.github.com> Date: Thu, 22 Nov 2018 11:13:36 +0000 Subject: [PATCH 2/5] Fixing perfdata was incorrect (always sent_location) and duplicated (it is output on each add_ok invocation) --- check_postgres.pl | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/check_postgres.pl b/check_postgres.pl index 6b653520..0f2b163a 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -5506,19 +5506,19 @@ sub check_streaming_delta { $SQL .= "WHERE network(set_masklen(client_addr,$netmask_length)) = network(set_masklen(inet_server_addr(),$netmask_length))"; } my $info = run_command($SQL); + my $perfdata = ""; for $db (@{$info->{db}}) { for my $row (@{$db->{slurp}}) { my ($a, $b) = split(/\//, $row->{'master_location'}); my $master_location = (hex('ff000000') * hex($a)) + hex($b); - for my $wal_type (qw/sent write flush replay/) - { - ($a, $b) = split(/\//, $row->{'sent_location'}); + for my $wal_type (qw/sent write flush replay/) { + ($a, $b) = split(/\//, $row->{$wal_type . 
'_location'}); my $slave_position = (hex('ff000000') * hex($a)) + hex($b); my $slave_lag = $master_location - $slave_position; - $db->{perf} .= "$row->{'client_addr'}_$wal_type=$slave_lag;$warning;$critical; "; + $db->{perf} .= "$row->{'client_addr'}_$wal_type=$slave_lag;$warning;$critical "; if (length $critical and $slave_lag >= $critical) { add_critical "CRITICAL for : $row->{'client_addr'} - $row->{'application_name'} - $wal_type"; @@ -5526,10 +5526,8 @@ sub check_streaming_delta { elsif (length $warning and $slave_lag >= $warning) { add_warning "WARNING for : $row->{'client_addr'} - $row->{'application_name'} - $wal_type"; } - else { - add_ok "OK for : $row->{'client_addr'} - $row->{'application_name'} - $wal_type"; - } } + add_ok "OK for : $row->{'client_addr'} - $row->{'application_name'}"; } } From 954dd24e92b6950f680e373d76caa0f23598ae60 Mon Sep 17 00:00:00 2001 From: A Codeweavers Infrastructure Bod <36475663+infraweavers@users.noreply.github.com> Date: Thu, 22 Nov 2018 11:14:37 +0000 Subject: [PATCH 3/5] Tabs -> Spaces --- check_postgres.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_postgres.pl b/check_postgres.pl index 0f2b163a..5fbf45f0 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -5527,7 +5527,7 @@ sub check_streaming_delta { add_warning "WARNING for : $row->{'client_addr'} - $row->{'application_name'} - $wal_type"; } } - add_ok "OK for : $row->{'client_addr'} - $row->{'application_name'}"; + add_ok "OK for : $row->{'client_addr'} - $row->{'application_name'}"; } } From d53d58e7ceae4e6e9d46e10a77dc2725b7388092 Mon Sep 17 00:00:00 2001 From: A Codeweavers Infrastructure Bod <36475663+infraweavers@users.noreply.github.com> Date: Thu, 22 Nov 2018 11:58:33 +0000 Subject: [PATCH 4/5] Fixing duplicate perfdata if there are multiple rows We need to call `add_ok` exactly once for each instance of `$db`: if it is not called, perfdata doesn't work at all, and if it is called more frequently, the perfdata is duplicated --- check_postgres.pl | 2 +- 1 file changed, 1
insertion(+), 1 deletion(-) diff --git a/check_postgres.pl b/check_postgres.pl index 5fbf45f0..e83ff1ac 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -5527,8 +5527,8 @@ sub check_streaming_delta { add_warning "WARNING for : $row->{'client_addr'} - $row->{'application_name'} - $wal_type"; } } - add_ok "OK for : $row->{'client_addr'} - $row->{'application_name'}"; } + add_ok "OK for : $row->{'client_addr'} - $row->{'application_name'}"; } return; From 518a2450fe496e917ebe022cae70352f472d393a Mon Sep 17 00:00:00 2001 From: A Codeweavers Infrastructure Bod <36475663+infraweavers@users.noreply.github.com> Date: Tue, 26 Jan 2021 02:33:16 +0000 Subject: [PATCH 5/5] pg_basebackup adds itself to pg_stat_replication but with blank xlog positions, which causes this to go critical. In reality, this is not for checking pg_basebackup status --- check_postgres.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/check_postgres.pl b/check_postgres.pl index e83ff1ac..a5a4676a 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -5500,10 +5500,10 @@ sub check_streaming_delta { $SQL = q{SELECT application_name, client_addr, pid, sent_location, write_location, flush_location, replay_location, CASE pg_is_in_recovery() WHEN true THEN pg_last_xlog_receive_location() ELSE pg_current_xlog_location() END AS master_location - FROM pg_stat_replication }; + FROM pg_stat_replication WHERE state != 'backup'}; if ($opt{netmasklength}) { my $netmask_length = $opt{netmasklength}; - $SQL .= "WHERE network(set_masklen(client_addr,$netmask_length)) = network(set_masklen(inet_server_addr(),$netmask_length))"; + $SQL .= " AND network(set_masklen(client_addr,$netmask_length)) = network(set_masklen(inet_server_addr(),$netmask_length))"; } my $info = run_command($SQL); my $perfdata = "";