From 51596790e0d422845ecc61218aa9bdaa9c9e2b2a Mon Sep 17 00:00:00 2001
From: Milos Hauser <16776599+hau21um@users.noreply.github.com>
Date: Thu, 29 Sep 2022 22:20:42 +0200
Subject: [PATCH 1/2] Fix PiR if (CLONE_)WALG_DOWNLOAD_CONCURRENCY not set

If for any reason WALG_DOWNLOAD_CONCURRENCY is not set, POOL_SIZE is also
not populated. During a restore/clone this leads to the following errors,
right after a successful restore of the base backup tar files (a short
sketch of the fix follows the log excerpt below):

------------------------------
(stdout):
wal_e.worker.s3.s3_worker INFO MSG: beginning partition download
        DETAIL: The partition being downloaded is part_00000343.tar.lzo.
        HINT: The absolute S3 key is spilo/xxxxxxx/wal/13/basebackups_005/base_00000021000000EE00000009_00000040/tar_partitions/part_00000343.tar.lzo.
        STRUCTURED: time=2022-09-29T17:45:57.018619-00 pid=75
2022-09-29 17:46:01,904 INFO: Lock owner: None; I am xxxxxxxx-cluster-0
2022-09-29 17:46:01,904 INFO: not healthy enough for leader race
2022-09-29 17:46:01,904 INFO: bootstrap in progress
.
.
2022-09-29 17:46:27,465 maybe_pg_upgrade INFO: No PostgreSQL configuration items changed, nothing to reload.
2022-09-29 17:46:27,492 maybe_pg_upgrade INFO: Cluster version: 13, bin version: 13
2022-09-29 17:46:27,492 maybe_pg_upgrade INFO: Trying to perform point-in-time recovery
2022-09-29 17:46:27,493 maybe_pg_upgrade INFO: Running custom bootstrap script: true
2022-09-29 17:46:27,510 maybe_pg_upgrade WARNING: Removing unexpected parameter=slow_query_log value=off from the config
2022-09-29 17:46:27.716 UTC user= db= pid=4046 LOG: Auto detecting pg_stat_kcache.linux_hz parameter...
2022-09-29 17:46:27.717 UTC user= db= pid=4046 LOG: pg_stat_kcache.linux_hz is set to 500000
2022-09-29 17:46:27,721 maybe_pg_upgrade INFO: postmaster pid=4046
/var/run/postgresql:5432 - no response
2022-09-29 17:46:27.889 UTC user= db= pid=4046 LOG: redirecting log output to logging collector process
2022-09-29 17:46:27.889 UTC user= db= pid=4046 HINT: Future log output will appear in directory "../pg_log".
2022-09-29 17:46:28,746 maybe_pg_upgrade ERROR: postmaster is not running
Traceback (most recent call last):
  File "/scripts/maybe_pg_upgrade.py", line 51, in perform_pitr
    raise Exception('Point-in-time recovery failed')
Exception: Point-in-time recovery failed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/scripts/maybe_pg_upgrade.py", line 140, in <module>
    main()
  File "/scripts/maybe_pg_upgrade.py", line 87, in main
    perform_pitr(upgrade, cluster_version, bin_version, config['bootstrap'])
  File "/scripts/maybe_pg_upgrade.py", line 70, in perform_pitr
    raise Exception('Point-in-time recovery failed.\nLOGS:\n--\n' + logs)
Exception: Point-in-time recovery failed.

(postgresql-X.csv):
2022-09-29 17:46:27.889 UTC,,,4046,,6335d9f3.fce,1,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"ending log output to stderr",,"Future log output will go to log destination ""csvlog"".",,,,,,,"","postmaster"
2022-09-29 17:46:27.889 UTC,,,4046,,6335d9f3.fce,2,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"starting PostgreSQL 13.7 (Ubuntu 13.7-1.pgdg18.04+1) on x86_64-pc-linux-gnu, compiled by gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0, 64-bit",,,,,,,,,"","postmaster"
2022-09-29 17:46:27.889 UTC,,,4046,,6335d9f3.fce,3,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"listening on IPv4 address ""0.0.0.0"", port 5432",,,,,,,,,"","postmaster"
2022-09-29 17:46:27.889 UTC,,,4046,,6335d9f3.fce,4,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"listening on IPv6 address ""::"", port 5432",,,,,,,,,"","postmaster"
2022-09-29 17:46:27.894 UTC,,,4046,,6335d9f3.fce,5,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"listening on Unix socket ""/var/run/postgresql/.s.PGSQL.5432""",,,,,,,,,"","postmaster"
2022-09-29 17:46:27.901 UTC,,,4049,,6335d9f3.fd1,1,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"database system was interrupted; last known up at 2022-09-28 13:10:06 UTC",,,,,,,,,"","startup"
2022-09-29 17:46:27.901 UTC,,,4049,,6335d9f3.fd1,2,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"creating missing WAL directory ""pg_wal/archive_status""",,,,,,,,,"","startup"
2022-09-29 17:46:28.192 UTC,,,4049,,6335d9f3.fd1,3,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"starting point-in-time recovery to 2022-09-29 00:00:00.666+00",,,,,,,,,"","startup"
2022-09-29 17:46:28.222 UTC,,,4049,,6335d9f3.fd1,4,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"invalid checkpoint record",,,,,,,,,"","startup"
2022-09-29 17:46:28.222 UTC,,,4049,,6335d9f3.fd1,5,,2022-09-29 17:46:27 UTC,,0,FATAL,XX000,"could not locate required checkpoint record",,"If you are restoring from a backup, touch ""/home/postgres/pgdata/pgroot/data/recovery.signal"" and add required recovery options. If you are not restoring from a backup, try removing the file ""/home/postgres/pgdata/pgroot/data/backup_label"". Be careful: removing ""/home/postgres/pgdata/pgroot/data/backup_label"" will result in a corrupt cluster if restoring from a backup.",,,,,,,"","startup"
2022-09-29 17:46:28.224 UTC,,,4046,,6335d9f3.fce,6,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"startup process (PID 4049) exited with exit code 1",,,,,,,,,"","postmaster"
2022-09-29 17:46:28.224 UTC,,,4046,,6335d9f3.fce,7,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"aborting startup due to startup process failure",,,,,,,,,"","postmaster"
2022-09-29 17:46:28.229 UTC,,,4046,,6335d9f3.fce,8,,2022-09-29 17:46:27 UTC,,0,LOG,00000,"database system is shut down",,,,,,,,,"","postmaster"

(postgresql-X.log):
2022-09-29 17:46:27.889 UTC user= db= pid=4046 LOG: ending log output to stderr
2022-09-29 17:46:27.889 UTC user= db= pid=4046 HINT: Future log output will go to log destination "csvlog".
Thu Sep 29 17:46:28 UTC 2022 Usage: /scripts/wal-e-wal-fetch.sh wal-fetch [--prefetch PREFETCH] WAL_SEGMENT WAL_DESTINATION
Thu Sep 29 17:46:28 UTC 2022 Usage: /scripts/wal-e-wal-fetch.sh wal-fetch [--prefetch PREFETCH] WAL_SEGMENT WAL_DESTINATION
Thu Sep 29 17:46:28 UTC 2022 Usage: /scripts/wal-e-wal-fetch.sh wal-fetch [--prefetch PREFETCH] WAL_SEGMENT WAL_DESTINATION
------------------------------

Without this patch, a workaround is to set the CLONE_WALG_DOWNLOAD_CONCURRENCY
env variable in the PG manifest.
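
A minimal sketch of how the ${POOL_SIZE:+-p $POOL_SIZE} expansion used by the
patch behaves; the pool size and WAL segment name below are made-up example
values:

    # POOL_SIZE unset or empty: the whole "-p $POOL_SIZE" group expands to
    # nothing, so the option parser no longer treats the WAL segment name
    # as the value of a bare -p.
    unset POOL_SIZE
    echo wal-e wal-fetch ${POOL_SIZE:+-p $POOL_SIZE} "00000021000000EE00000009" "/tmp/dest"
    # prints: wal-e wal-fetch 00000021000000EE00000009 /tmp/dest

    # POOL_SIZE set: behaves exactly as before.
    POOL_SIZE=8
    echo wal-e wal-fetch ${POOL_SIZE:+-p $POOL_SIZE} "00000021000000EE00000009" "/tmp/dest"
    # prints: wal-e wal-fetch -p 8 00000021000000EE00000009 /tmp/dest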
---
 postgres-appliance/scripts/restore_command.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/postgres-appliance/scripts/restore_command.sh b/postgres-appliance/scripts/restore_command.sh
index a4bb88939..4bf2f953a 100755
--- a/postgres-appliance/scripts/restore_command.sh
+++ b/postgres-appliance/scripts/restore_command.sh
@@ -48,7 +48,7 @@ if [[ -z $WALE_S3_PREFIX ]]; then # non AWS environment?
     if [[ -f $wale_prefetch_source ]]; then
         exec mv "${wale_prefetch_source}" "${wal_destination}"
     else
-        exec wal-e wal-fetch -p $POOL_SIZE "${wal_filename}" "${wal_destination}"
+        exec wal-e wal-fetch ${POOL_SIZE:+-p $POOL_SIZE} "${wal_filename}" "${wal_destination}"
     fi
 else
     exec bash /scripts/wal-e-wal-fetch.sh wal-fetch -p $POOL_SIZE "${wal_filename}" "${wal_destination}"

From b4960e987f14bd6e7a9ce5e1199fc82d549a1d36 Mon Sep 17 00:00:00 2001
From: Milos Hauser <16776599+hau21um@users.noreply.github.com>
Date: Thu, 29 Sep 2022 23:07:00 +0200
Subject: [PATCH 2/2] One more conditional -p usage

---
 postgres-appliance/scripts/restore_command.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/postgres-appliance/scripts/restore_command.sh b/postgres-appliance/scripts/restore_command.sh
index 4bf2f953a..926876d15 100755
--- a/postgres-appliance/scripts/restore_command.sh
+++ b/postgres-appliance/scripts/restore_command.sh
@@ -51,5 +51,5 @@ if [[ -z $WALE_S3_PREFIX ]]; then # non AWS environment?
         exec wal-e wal-fetch ${POOL_SIZE:+-p $POOL_SIZE} "${wal_filename}" "${wal_destination}"
     fi
 else
-    exec bash /scripts/wal-e-wal-fetch.sh wal-fetch -p $POOL_SIZE "${wal_filename}" "${wal_destination}"
+    exec bash /scripts/wal-e-wal-fetch.sh wal-fetch ${POOL_SIZE:+-p $POOL_SIZE} "${wal_filename}" "${wal_destination}"
 fi