From 0a8fefad1f4060ce9cab1b3ef5902de24d8c71d3 Mon Sep 17 00:00:00 2001 From: Mats Kindahl Date: Thu, 28 Sep 2023 17:22:03 +0200 Subject: [PATCH] Restart scheduler on error If the scheduler receives an error, it will never restart again since `bgw_restart_time` is set to `BGW_NEVER_RESTART`, which will prevent all jobs from executing. This commit fixes the issue by adding a GUC that can be set to the restart time for the scheduler, and set the default to 30 seconds. It also adds some additional variables to be able to shutdown the scheduler with a non-zero exit code, which allows the restart functionality to be tested, as well as tests. Fixes #5091 --- .unreleased/feature_6195 | 1 + src/bgw/scheduler.c | 1 + src/guc.c | 27 +++++++ src/guc.h | 8 ++ src/loader/bgw_launcher.c | 20 ++++- src/loader/bgw_launcher.h | 4 +- src/loader/loader.c | 2 +- tsl/test/expected/bgw_scheduler_control.out | 90 +++++++++++++++++++++ tsl/test/sql/CMakeLists.txt | 2 + tsl/test/sql/bgw_scheduler_control.sql | 41 ++++++++++ 10 files changed, 192 insertions(+), 4 deletions(-) create mode 100644 .unreleased/feature_6195 create mode 100644 tsl/test/expected/bgw_scheduler_control.out create mode 100644 tsl/test/sql/bgw_scheduler_control.sql diff --git a/.unreleased/feature_6195 b/.unreleased/feature_6195 new file mode 100644 index 00000000000..a930c0a7bcc --- /dev/null +++ b/.unreleased/feature_6195 @@ -0,0 +1 @@ +Implements: #6195 Restart scheduler on error diff --git a/src/bgw/scheduler.c b/src/bgw/scheduler.c index 90755f6e43e..e8ee5f2fa14 100644 --- a/src/bgw/scheduler.c +++ b/src/bgw/scheduler.c @@ -807,6 +807,7 @@ ts_bgw_scheduler_process(int32 run_for_interval_ms, wait_for_all_jobs_to_shutdown(); check_for_stopped_and_timed_out_jobs(); + proc_exit(ts_bgw_scheduler_exit_code); } static void diff --git a/src/guc.c b/src/guc.c index 5027c930229..37841b6b2b6 100644 --- a/src/guc.c +++ b/src/guc.c @@ -113,6 +113,20 @@ TSDLLEXPORT int ts_guc_hypertable_replication_factor_default = 1; bool ts_guc_debug_require_batch_sorted_merge = false; +/* + * Exit code for the scheduler. + * + * Normally it exits with a zero which means that it will not restart. If an + * error is raised, it exits with error code 1, which will trigger a + * restart. + * + * This variable exists to be able to trigger a restart for a normal exit, + * which is useful when debugging. + * + * See backend/postmaster/bgworker.c + */ +int ts_bgw_scheduler_exit_code = 0; + #ifdef TS_DEBUG bool ts_shutdown_bgw = false; char *ts_current_timestamp_mock = NULL; @@ -738,6 +752,19 @@ _guc_init(void) /* assign_hook= */ NULL, /* show_hook= */ NULL); + DefineCustomIntVariable(/* name= */ "timescaledb.shutdown_bgw_scheduler_exit_code", + /* short_desc= */ "exit code to use when shutting down the scheduler", + /* long_desc= */ "this is for debugging purposes", + /* valueAddr= */ &ts_bgw_scheduler_exit_code, + /* bootValue= */ 0, + /* minValue= */ 0, + /* maxValue= */ 255, + /* context= */ PGC_SIGHUP, + /* flags= */ 0, + /* check_hook= */ NULL, + /* assign_hook= */ NULL, + /* show_hook= */ NULL); + DefineCustomStringVariable(/* name= */ "timescaledb.current_timestamp_mock", /* short_desc= */ "set the current timestamp", /* long_desc= */ "this is for debugging purposes", diff --git a/src/guc.h b/src/guc.h index 409ba38df7c..f0c40131e55 100644 --- a/src/guc.h +++ b/src/guc.h @@ -98,6 +98,14 @@ extern TSDLLEXPORT DistCopyTransferFormat ts_guc_dist_copy_transfer_format; typedef void (*set_ssl_options_hook_type)(const char *user_name); extern TSDLLEXPORT set_ssl_options_hook_type ts_set_ssl_options_hook; +/* + * Exit code to use when scheduler exits. + * + * Mostly used for debugging, but defined also for non-debug builds since that + * simplifies the code (and also simplifies debugging non-debug builds). + */ +extern TSDLLEXPORT int ts_bgw_scheduler_exit_code; + #ifdef TS_DEBUG extern bool ts_shutdown_bgw; extern char *ts_current_timestamp_mock; diff --git a/src/loader/bgw_launcher.c b/src/loader/bgw_launcher.c index 9e0adb1a979..bb6db58c5da 100644 --- a/src/loader/bgw_launcher.c +++ b/src/loader/bgw_launcher.c @@ -84,6 +84,8 @@ typedef enum SchedulerState static volatile sig_atomic_t got_SIGHUP = false; +int ts_guc_bgw_scheduler_restart_time_sec = 30; + static void launcher_sighup(SIGNAL_ARGS) { @@ -238,10 +240,24 @@ terminate_background_worker(BackgroundWorkerHandle *handle) } extern void -ts_bgw_cluster_launcher_register(void) +ts_bgw_cluster_launcher_init(void) { BackgroundWorker worker; + DefineCustomIntVariable(/* name= */ "timescaledb.bgw_scheduler_restart_time", + /* short_desc= */ "Restart time for scheduler in seconds", + /* long_desc= */ + "The number of seconds until the scheduler restart on failure.", + /* valueAddr= */ &ts_guc_bgw_scheduler_restart_time_sec, + /* bootValue= */ 30, + /* minValue= */ 1, + /* maxValue= */ 3600, + /* context= */ PGC_POSTMASTER, + /* flags= */ GUC_UNIT_S, + /* check_hook= */ NULL, + /* assign_hook= */ NULL, + /* show_hook= */ NULL); + memset(&worker, 0, sizeof(worker)); /* set up worker settings for our main worker */ snprintf(worker.bgw_name, BGW_MAXLEN, "TimescaleDB Background Worker Launcher"); @@ -276,7 +292,7 @@ register_entrypoint_for_db(Oid db_id, VirtualTransactionId vxid, BackgroundWorke memset(&worker, 0, sizeof(worker)); snprintf(worker.bgw_name, BGW_MAXLEN, "TimescaleDB Background Worker Scheduler"); worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; - worker.bgw_restart_time = BGW_NEVER_RESTART; + worker.bgw_restart_time = ts_guc_bgw_scheduler_restart_time_sec, worker.bgw_start_time = BgWorkerStart_RecoveryFinished; snprintf(worker.bgw_library_name, BGW_MAXLEN, EXTENSION_NAME); snprintf(worker.bgw_function_name, BGW_MAXLEN, BGW_ENTRYPOINT_FUNCNAME); diff --git a/src/loader/bgw_launcher.h b/src/loader/bgw_launcher.h index 69d9ca27b1a..0833145cc06 100644 --- a/src/loader/bgw_launcher.h +++ b/src/loader/bgw_launcher.h @@ -10,7 +10,9 @@ #include #include -extern void ts_bgw_cluster_launcher_register(void); +extern int ts_guc_bgw_scheduler_restart_time_sec; + +extern void ts_bgw_cluster_launcher_init(void); /*called by postmaster at launcher bgw startup*/ TSDLLEXPORT extern Datum ts_bgw_cluster_launcher_main(PG_FUNCTION_ARGS); diff --git a/src/loader/loader.c b/src/loader/loader.c index f96a7733b29..714ce24978a 100644 --- a/src/loader/loader.c +++ b/src/loader/loader.c @@ -711,7 +711,7 @@ _PG_init(void) timescaledb_shmem_request_hook(); #endif - ts_bgw_cluster_launcher_register(); + ts_bgw_cluster_launcher_init(); ts_bgw_counter_setup_gucs(); ts_bgw_interface_register_api_version(); ts_seclabel_init(); diff --git a/tsl/test/expected/bgw_scheduler_control.out b/tsl/test/expected/bgw_scheduler_control.out new file mode 100644 index 00000000000..d5cf50c7091 --- /dev/null +++ b/tsl/test/expected/bgw_scheduler_control.out @@ -0,0 +1,90 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. +\c :TEST_DBNAME :ROLE_SUPERUSER +-- There is going to be one scheduler for each database, but we only +-- care if there is zero (all schedulers shut down) or at least one +-- (schedulers should be automatically restarted). +CREATE VIEW tsdb_bgw AS + SELECT DISTINCT application_name FROM pg_stat_activity + WHERE application_name LIKE 'TimescaleDB%' + ORDER BY application_name; +SHOW timescaledb.bgw_scheduler_restart_time; + timescaledb.bgw_scheduler_restart_time +---------------------------------------- + 30s +(1 row) + +SELECT _timescaledb_functions.start_background_workers(); + start_background_workers +-------------------------- + t +(1 row) + +SELECT * FROM tsdb_bgw; + application_name +----------------------------------------- + TimescaleDB Background Worker Launcher + TimescaleDB Background Worker Scheduler +(2 rows) + +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler TO 'on'; +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler_exit_code TO 1; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SELECT pg_sleep(20); -- Wait for scheduler to exit. + pg_sleep +---------- + +(1 row) + +SELECT * FROM tsdb_bgw; + application_name +---------------------------------------- + TimescaleDB Background Worker Launcher +(1 row) + +ALTER SYSTEM RESET timescaledb.shutdown_bgw_scheduler; +ALTER SYSTEM RESET timescaledb.shutdown_bgw_scheduler_exit_code; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SELECT pg_sleep(30); -- Wait for scheduler to restart. + pg_sleep +---------- + +(1 row) + +SELECT * FROM tsdb_bgw; + application_name +----------------------------------------- + TimescaleDB Background Worker Launcher + TimescaleDB Background Worker Scheduler +(2 rows) + +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler TO 'on'; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SELECT _timescaledb_functions.stop_background_workers(); + stop_background_workers +------------------------- + t +(1 row) + +SELECT pg_sleep(20); + pg_sleep +---------- + +(1 row) + diff --git a/tsl/test/sql/CMakeLists.txt b/tsl/test/sql/CMakeLists.txt index 9a623ffee2d..eb5d082b1ee 100644 --- a/tsl/test/sql/CMakeLists.txt +++ b/tsl/test/sql/CMakeLists.txt @@ -43,6 +43,7 @@ if(CMAKE_BUILD_TYPE MATCHES Debug) APPEND TEST_FILES bgw_db_scheduler.sql + bgw_scheduler_control.sql job_errors_permissions.sql troubleshooting_job_errors.sql bgw_db_scheduler_fixed.sql @@ -135,6 +136,7 @@ endif() set(SOLO_TESTS # dist_hypertable needs a lot of memory when the Sanitizer is active dist_hypertable-${PG_VERSION_MAJOR} + bgw_scheduler_control bgw_db_scheduler job_errors_permissions troubleshooting_job_errors diff --git a/tsl/test/sql/bgw_scheduler_control.sql b/tsl/test/sql/bgw_scheduler_control.sql new file mode 100644 index 00000000000..c11d7e319b6 --- /dev/null +++ b/tsl/test/sql/bgw_scheduler_control.sql @@ -0,0 +1,41 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. + +\c :TEST_DBNAME :ROLE_SUPERUSER + +-- There is going to be one scheduler for each database, but we only +-- care if there is zero (all schedulers shut down) or at least one +-- (schedulers should be automatically restarted). +CREATE VIEW tsdb_bgw AS + SELECT DISTINCT application_name FROM pg_stat_activity + WHERE application_name LIKE 'TimescaleDB%' + ORDER BY application_name; + +SHOW timescaledb.bgw_scheduler_restart_time; + +SELECT _timescaledb_functions.start_background_workers(); + +SELECT * FROM tsdb_bgw; + +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler TO 'on'; +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler_exit_code TO 1; +SELECT pg_reload_conf(); + +SELECT pg_sleep(20); -- Wait for scheduler to exit. + +SELECT * FROM tsdb_bgw; + +ALTER SYSTEM RESET timescaledb.shutdown_bgw_scheduler; +ALTER SYSTEM RESET timescaledb.shutdown_bgw_scheduler_exit_code; +SELECT pg_reload_conf(); + +SELECT pg_sleep(30); -- Wait for scheduler to restart. + +SELECT * FROM tsdb_bgw; + +ALTER SYSTEM SET timescaledb.shutdown_bgw_scheduler TO 'on'; +SELECT pg_reload_conf(); + +SELECT _timescaledb_functions.stop_background_workers(); +SELECT pg_sleep(20);