From 51596790e0d422845ecc61218aa9bdaa9c9e2b2a Mon Sep 17 00:00:00 2001
From: Milos Hauser <16776599+hau21um@users.noreply.github.com>
Date: Thu, 29 Sep 2022 22:20:42 +0200
Subject: [PATCH 1/2] Fix PiR if (CLONE_)WALG_DOWNLOAD_CONCURRENCY not set

If for any reason WALG_DOWNLOAD_CONCURRENCY is not set, POOL_SIZE is also
not populated. During a restore/clone this leads to the following errors,
right after a successful restore of the base backup tar files (a short
sketch of the fix follows the log excerpt below):

------------------------------
(stdout):
wal_e.worker.s3.s3_worker INFO MSG: beginning partition download
        DETAIL: The partition being downloaded is part_00000343.tar.lzo.
        HINT: The absolute S3 key is spilo/xxxxxxx/wal/13/basebackups_005/base_00000021000000EE00000009_00000040/tar_partitions/part_00000343.tar.lzo.
        STRUCTURED: time=2022-09-29T17:45:57.018619-00 pid=75
2022-09-29 17:46:01,904 INFO: Lock owner: None; I am xxxxxxxx-cluster-0
2022-09-29 17:46:01,904 INFO: not healthy enough for leader race
2022-09-29 17:46:01,904 INFO: bootstrap in progress
.
.
2022-09-29 17:46:27,465 maybe_pg_upgrade INFO: No PostgreSQL configuration items changed, nothing to reload.
2022-09-29 17:46:27,492 maybe_pg_upgrade INFO: Cluster version: 13, bin version: 13
2022-09-29 17:46:27,492 maybe_pg_upgrade INFO: Trying to perform point-in-time recovery
2022-09-29 17:46:27,493 maybe_pg_upgrade INFO: Running custom bootstrap script: true
2022-09-29 17:46:27,510 maybe_pg_upgrade WARNING: Removing unexpected parameter=slow_query_log value=off from the config
2022-09-29 17:46:27.716 UTC user= db= pid=4046 LOG: Auto detecting pg_stat_kcache.linux_hz parameter...
2022-09-29 17:46:27.717 UTC user= db= pid=4046 LOG: pg_stat_kcache.linux_hz is set to 500000
2022-09-29 17:46:27,721 maybe_pg_upgrade INFO: postmaster pid=4046
/var/run/postgresql:5432 - no response
2022-09-29 17:46:27.889 UTC user= db= pid=4046 LOG: redirecting log output to logging collector process
2022-09-29 17:46:27.889 UTC user= db= pid=4046 HINT: Future log output will appear in directory "../pg_log".
2022-09-29 17:46:28,746 maybe_pg_upgrade ERROR: postmaster is not running
Traceback (most recent call last):
  File "/scripts/maybe_pg_upgrade.py", line 51, in perform_pitr
    raise Exception('Point-in-time recovery failed')
Exception: Point-in-time recovery failed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/scripts/maybe_pg_upgrade.py", line 140, in <module>
    main()
  File "/scripts/maybe_pg_upgrade.py", line 87, in main
    perform_pitr(upgrade, cluster_version, bin_version, config['bootstrap'])
  File "/scripts/maybe_pg_upgrade.py", line 70, in perform_pitr
    raise Exception('Point-in-time recovery failed.\nLOGS:\n--\n' + logs)
Exception: Point-in-time recovery failed.

(postgresql-X.csv):
2022-09-29 17:46:27.889 UTC,,,4046,,6335d9f3.fce,1,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"ending log output to stderr",,"Future log output will go to log destination ""csvlog"".",,,,,,,"","postmaster"
2022-09-29 17:46:27.889 UTC,,,4046,,6335d9f3.fce,2,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"starting PostgreSQL 13.7 (Ubuntu 13.7-1.pgdg18.04+1) on x86_64-pc-linux-gnu, compiled by gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0, 64-bit",,,,,,,,,"","postmaster"
2022-09-29 17:46:27.889 UTC,,,4046,,6335d9f3.fce,3,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"listening on IPv4 address ""0.0.0.0"", port 5432",,,,,,,,,"","postmaster"
2022-09-29 17:46:27.889 UTC,,,4046,,6335d9f3.fce,4,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"listening on IPv6 address ""::"", port 5432",,,,,,,,,"","postmaster"
2022-09-29 17:46:27.894 UTC,,,4046,,6335d9f3.fce,5,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"listening on Unix socket ""/var/run/postgresql/.s.PGSQL.5432""",,,,,,,,,"","postmaster"
2022-09-29 17:46:27.901 UTC,,,4049,,6335d9f3.fd1,1,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"database system was interrupted; last known up at 2022-09-28 13:10:06 UTC",,,,,,,,,"","startup"
2022-09-29 17:46:27.901 UTC,,,4049,,6335d9f3.fd1,2,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"creating missing WAL directory ""pg_wal/archive_status""",,,,,,,,,"","startup"
2022-09-29 17:46:28.192 UTC,,,4049,,6335d9f3.fd1,3,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"starting point-in-time recovery to 2022-09-29 00:00:00.666+00",,,,,,,,,"","startup"
2022-09-29 17:46:28.222 UTC,,,4049,,6335d9f3.fd1,4,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"invalid checkpoint record",,,,,,,,,"","startup"
2022-09-29 17:46:28.222 UTC,,,4049,,6335d9f3.fd1,5,,2022-09-29 17:46:27 UTC,,0,FATAL,XX000,"could not locate required checkpoint record",,"If you are restoring from a backup, touch ""/home/postgres/pgdata/pgroot/data/recovery.signal"" and add required recovery options. If you are not restoring from a backup, try removing the file ""/home/postgres/pgdata/pgroot/data/backup_label"". Be careful: removing ""/home/postgres/pgdata/pgroot/data/backup_label"" will result in a corrupt cluster if restoring from a backup.",,,,,,,"","startup"
2022-09-29 17:46:28.224 UTC,,,4046,,6335d9f3.fce,6,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"startup process (PID 4049) exited with exit code 1",,,,,,,,,"","postmaster"
2022-09-29 17:46:28.224 UTC,,,4046,,6335d9f3.fce,7,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"aborting startup due to startup process failure",,,,,,,,,"","postmaster"
2022-09-29 17:46:28.229 UTC,,,4046,,6335d9f3.fce,8,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"database system is shut down",,,,,,,,,"","postmaster"

(postgresql-X.log):
2022-09-29 17:46:27.889 UTC user= db= pid=4046 LOG: ending log output to stderr
2022-09-29 17:46:27.889 UTC user= db= pid=4046 HINT: Future log output will go to log destination "csvlog".
Thu Sep 29 17:46:28 UTC 2022 Usage: /scripts/wal-e-wal-fetch.sh wal-fetch [--prefetch PREFETCH] WAL_SEGMENT WAL_DESTINATION
Thu Sep 29 17:46:28 UTC 2022 Usage: /scripts/wal-e-wal-fetch.sh wal-fetch [--prefetch PREFETCH] WAL_SEGMENT WAL_DESTINATION
Thu Sep 29 17:46:28 UTC 2022 Usage: /scripts/wal-e-wal-fetch.sh wal-fetch [--prefetch PREFETCH] WAL_SEGMENT WAL_DESTINATION
------------------------------

Without this patch, a workaround is to set the CLONE_WALG_DOWNLOAD_CONCURRENCY
env variable in the PG manifest.
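
A minimal sketch of how the ${POOL_SIZE:+-p $POOL_SIZE} expansion used by the
patch behaves; the pool size and WAL segment name below are made-up example
values:

    # POOL_SIZE unset or empty: the whole "-p $POOL_SIZE" group expands to
    # nothing, so the option parser no longer treats the WAL segment name
    # as the value of a bare -p.
    unset POOL_SIZE
    echo wal-e wal-fetch ${POOL_SIZE:+-p $POOL_SIZE} "00000021000000EE00000009" "/tmp/dest"
    # prints: wal-e wal-fetch 00000021000000EE00000009 /tmp/dest

    # POOL_SIZE set: behaves exactly as before.
    POOL_SIZE=8
    echo wal-e wal-fetch ${POOL_SIZE:+-p $POOL_SIZE} "00000021000000EE00000009" "/tmp/dest"
    # prints: wal-e wal-fetch -p 8 00000021000000EE00000009 /tmp/dest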
---
 postgres-appliance/scripts/restore_command.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/postgres-appliance/scripts/restore_command.sh b/postgres-appliance/scripts/restore_command.sh
index a4bb88939..4bf2f953a 100755
--- a/postgres-appliance/scripts/restore_command.sh
+++ b/postgres-appliance/scripts/restore_command.sh
@@ -48,7 +48,7 @@ if [[ -z $WALE_S3_PREFIX ]]; then # non AWS environment?
     if [[ -f $wale_prefetch_source ]]; then
         exec mv "${wale_prefetch_source}" "${wal_destination}"
     else
-        exec wal-e wal-fetch -p $POOL_SIZE "${wal_filename}" "${wal_destination}"
+        exec wal-e wal-fetch ${POOL_SIZE:+-p $POOL_SIZE} "${wal_filename}" "${wal_destination}"
     fi
 else
     exec bash /scripts/wal-e-wal-fetch.sh wal-fetch -p $POOL_SIZE "${wal_filename}" "${wal_destination}"

From b4960e987f14bd6e7a9ce5e1199fc82d549a1d36 Mon Sep 17 00:00:00 2001
From: Milos Hauser <16776599+hau21um@users.noreply.github.com>
Date: Thu, 29 Sep 2022 23:07:00 +0200
Subject: [PATCH 2/2] One more conditional -p usage

---
 postgres-appliance/scripts/restore_command.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/postgres-appliance/scripts/restore_command.sh b/postgres-appliance/scripts/restore_command.sh
index 4bf2f953a..926876d15 100755
--- a/postgres-appliance/scripts/restore_command.sh
+++ b/postgres-appliance/scripts/restore_command.sh
@@ -51,5 +51,5 @@ if [[ -z $WALE_S3_PREFIX ]]; then # non AWS environment?
         exec wal-e wal-fetch ${POOL_SIZE:+-p $POOL_SIZE} "${wal_filename}" "${wal_destination}"
     fi
 else
-    exec bash /scripts/wal-e-wal-fetch.sh wal-fetch -p $POOL_SIZE "${wal_filename}" "${wal_destination}"
+    exec bash /scripts/wal-e-wal-fetch.sh wal-fetch ${POOL_SIZE:+-p $POOL_SIZE} "${wal_filename}" "${wal_destination}"
 fi