From 848c25e0b0ea6fe509a1467d52a119ddaf239314 Mon Sep 17 00:00:00 2001 From: AlexandrLitkevich Date: Tue, 1 Apr 2025 09:51:47 +0400 Subject: [PATCH 1/2] tcm: add the tt tcm status command @TarantoolBot document Title: add the tt tcm status command. Add `tt tcm status` command that displays whether TCM is running in: - watchdog mode (with auto-recovery) - interactive mode Closes #TNTP-1992 --- CHANGELOG.md | 200 ++++++++++++------------ cli/cmd/tcm.go | 116 ++++++++++++-- cli/process_utils/process_utils.go | 21 +++ cli/tcm/tcm.go | 5 +- lib/watchdog/watchdog.go | 239 +++++++++++++++++++++++++++++ lib/watchdog/watchdog_test.go | 124 +++++++++++++++ test/integration/tcm/test_tcm.py | 119 ++++++++++++-- 7 files changed, 702 insertions(+), 122 deletions(-) create mode 100644 lib/watchdog/watchdog.go create mode 100644 lib/watchdog/watchdog_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 507f17e49..1c81aeb53 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,14 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added +- `tt aeon connect`: added tests for connect file/app. +- `tt pack `: support `.packignore` file to specify files that should not be included + in package (works the same as `.gitignore`). +- `tt tcm start`: add the tcm command. +- `tt tcm start` OR `tt tcm start --path /path/to/tcm`: added the capability to run TCM in interactive mode. +- `tt tcm start --watchdog`: implemented Watchdog mode for automatic restarting of TCM upon unexpected termination. +- `tt tcm status`: added command to check TCM runtime status (modes: `watchdog` or `interactive`). + ### Changed ### Fixed @@ -113,16 +121,16 @@ continues to improve `tt play` command. - `tt aeon connect`: add support to connect Aeon database. 
- `tt play`: support of the SSL parameters by using next flags: - * `sslkeyfile` - path to a private SSL key file, - * `sslcertfile` - path to an SSL certificate file, - * `sslcafile` - path to a trusted certificate authorities (CA) file, - * `sslciphers` - colon-separated list of SSL cipher suites the connection. + - `sslkeyfile` - path to a private SSL key file, + - `sslcertfile` - path to an SSL certificate file, + - `sslcafile` - path to a trusted certificate authorities (CA) file, + - `sslciphers` - colon-separated list of SSL cipher suites the connection. - `tt play`: support connection to a target instance by `application` name or `application:instance` name. - `tt coredump pack`: add options to customize coredump packing: - * `-e (--executable)`: specify Tarantool executable path. - * `-p (--pid)`: specify PID of the dumped process. - * `-t (--time)`: specify time of dump (seconds since the Epoch). + - `-e (--executable)`: specify Tarantool executable path. + - `-p (--pid)`: specify PID of the dumped process. + - `-t (--time)`: specify time of dump (seconds since the Epoch). - `tt.yaml`: allows to specify a list of modules directories. - Environment variable TT_CLI_MODULES_PATH can be used to specify an extra path with modules. @@ -130,8 +138,8 @@ continues to improve `tt play` command. ### Changed - `tt stop/kill/clean/logrotate`: no longer need: - * Instances scripts for multi-instance applications. - * Cluster config for tarantool3-based cluster applications. + - Instances scripts for multi-instance applications. + - Cluster config for tarantool3-based cluster applications. - `tt logrotate`: don't exit at non-running instance, just warn and proceed with the other instances, like `tt stop` and `tt kill` do. - `tt coredump pack`: if `-e` option is omitted first search tarantool @@ -157,16 +165,16 @@ The release introduces `upgrade` and `downgrade` subcommands for - `tt replicaset downgrade`: command to downgrade the schema on a Tarantool cluster. 
- * `-v (--version)`: (required) specify schema version to downgrade to. - * `-r (--replicaset)`: specify the replicaset name(s) to downgrade. - * `-t (--timeout)`: timeout for waiting the LSN synchronization (in seconds) + - `-v (--version)`: (required) specify schema version to downgrade to. + - `-r (--replicaset)`: specify the replicaset name(s) to downgrade. + - `-t (--timeout)`: timeout for waiting the LSN synchronization (in seconds) (default 5). - `tt replicaset upgrade`: command to upgrade the schema on a Tarantool cluster. - * `-r (--replicaset)`: specify the replicaset name(s) to upgrade. - * `-t (--timeout)`: timeout for waiting the LSN synchronization (in seconds) + - `-r (--replicaset)`: specify the replicaset name(s) to upgrade. + - `-t (--timeout)`: timeout for waiting the LSN synchronization (in seconds) (default 5). - * supports upgrading the database schema on remote cluster by upgrading + - supports upgrading the database schema on remote cluster by upgrading each replicaset individually using `tt replicaset upgrade `. - New flag `--timestamp` of `tt cat` and `tt play` commands is added to specify operations ending with the given timestamp. This value can be specified @@ -208,7 +216,7 @@ Additionally, several fixes were implemented to improve stability. ### Added - `tt status`: display `config`, `box`, and `replication upstream` statuses. - * `--details`: display detailed reports of errors and warnings from + - `--details`: display detailed reports of errors and warnings from instances. - `tt stop` confirmation prompt. `-y` option is added to accept stop without prompting. @@ -239,8 +247,8 @@ Additionally, several fixes were implemented to improve stability. ### Added - `tt log`: a module for viewing instances logs. Supported options: - * `--lines` number of lines to print. - * `--follow` print appended data as log files grow. + - `--lines` number of lines to print. + - `--follow` print appended data as log files grow. 
- `tt connect`: support format for Tarantool tuples for Tarantool versions >= 3.2. - `tt enable`: create a symbolic link in 'instances_enabled' directory to @@ -266,7 +274,7 @@ Additionally, several fixes were implemented to improve stability. ### Changed -- Do not create Dockerfile.* in application's directory. +- Do not create Dockerfile.\* in application's directory. ## [2.3.1] - 2024-06-13 @@ -280,11 +288,11 @@ Additionally, several fixes were implemented to improve stability. - `tt status`: displays the mode of the instance. - `tt coredump`: enhances coredump inspection: - * `tt coredump pack`: puts gdb.sh and GDB-extensions into the archive so + - `tt coredump pack`: puts gdb.sh and GDB-extensions into the archive so that it contains everything necessary for convenient coredump inspection. - * `tt coredump inspect`: allows archive path as an argument (archive should + - `tt coredump inspect`: allows archive path as an argument (archive should be created with `tt coredump pack`). - * `tt coredump inspect`: added `-s` option to specify the location of + - `tt coredump inspect`: added `-s` option to specify the location of tarantool sources. - `tt cluster publish`: ability to publish a new instance config. - `tt pack` does not create unnecessary directories and removes files that are @@ -298,7 +306,7 @@ Additionally, several fixes were implemented to improve stability. file. - `tt pack`: systemd unit parameterizing support. - `tt replicaset vshard`: module to manage vshard in the tarantool replicaset. - * `tt replicaset vshard bootstrap`: command to bootstrap vshard. + - `tt replicaset vshard bootstrap`: command to bootstrap vshard. ### Fixed @@ -321,9 +329,9 @@ Additionally, several fixes were implemented to improve stability. replicaset with cluster config (3.0) orchestrator. - `tt cluster replicaset`: module to manage replicaset via 3.0 cluster config storage. 
- * `tt cluster replicaset promote`: command to promote an instance in + - `tt cluster replicaset promote`: command to promote an instance in the replicaset. - * `tt cluster replicaset demote`: command to demote an instance in + - `tt cluster replicaset demote`: command to demote an instance in the replicaset. - `tt connect --binary`: connect to instance using binary port. - `tt kill`: command to stop instance(s) with SIGQUIT and SIGKILL signals. @@ -383,12 +391,12 @@ Additionally, several fixes were implemented to improve stability. ### Added -- Module ``tt replicaset``, to manage replicasets: - - ``tt replicaset status`` to show a cluster status information. +- Module `tt replicaset`, to manage replicasets: + - `tt replicaset status` to show a cluster status information. ### Changed -- Disable ``tt run`` tarantool flag parsing. +- Disable `tt run` tarantool flag parsing. ### Fixed @@ -402,8 +410,8 @@ Additionally, several fixes were implemented to improve stability. - `tt connect` auto-completion shows directories and files when there are no running apps. - `tt rocks --server` now accepts several URL's. -- Disable ``tt run`` tarantool flag parsing. -- Now ``tt run`` starts instance without our wrapper. +- Disable `tt run` tarantool flag parsing. +- Now `tt run` starts instance without our wrapper. ### Added @@ -418,7 +426,7 @@ Additionally, several fixes were implemented to improve stability. - Print log messages to stderr. - Global flags are required to be positioned only before child - commands. Example: ``tt --cfg tt.yaml install tt``. + commands. Example: `tt --cfg tt.yaml install tt`. - tt config format: separate tt environment options from application options. - tt version: additional version information for non-release builds. - Working directory is changed to an application source directory. @@ -439,7 +447,7 @@ Additionally, several fixes were implemented to improve stability. - Log rotation functionality and configuration is removed from `tt`. 
`tt logrotate` command re-opens a log file and sends SIGHUP to the child `tarantool` processes. -- ``tt cat``: all diagnostic messages are printed to stderr. +- `tt cat`: all diagnostic messages are printed to stderr. - Print `tarantool` stdout/stderr and watchdog logs to the same log file - `tt.log`. @@ -447,16 +455,16 @@ Additionally, several fixes were implemented to improve stability. - tt completion: added luarocks completions. - tarantool-ee: search and install development builds. -- ``tt play``: ability to pass username and password via flags and environment +- `tt play`: ability to pass username and password via flags and environment variables. - tt cluster: credentials could be passed via environment variables and command flags. ### Fixed -- ``tt rocks``: broken ``--verbose`` option. -- ``tt binaries``: tarantool-ee binaries not shown. -- ``tt cluster``: recognize app:instance as a etcd URL. +- `tt rocks`: broken `--verbose` option. +- `tt binaries`: tarantool-ee binaries not shown. +- `tt cluster`: recognize app:instance as a etcd URL. ## [1.3.0] - 2023-09-28 @@ -464,15 +472,15 @@ Additionally, several fixes were implemented to improve stability. - Use CLI arg connect string for the prompt line and the title to avoid too long prompt line when using 'app:instance' target format. -- ``tt rocks``: luarocks version has been updated to 3.9.2. +- `tt rocks`: luarocks version has been updated to 3.9.2. ### Added - `tt install tarantool/tt`: ability to install tarantool and tt from an - arbitrary commit. The binary has the name tt/tarantool_ + seven-digit hash. + arbitrary commit. The binary has the name tt/tarantool\_ + seven-digit hash. - New `tt pack` flag `--tarantool-version` is added to specify tarantool version for pack in docker. It is supported only with `--use-docker` enabled. -- Module ``tt cluster``, to show or publish a cluster or an instance +- Module `tt cluster`, to show or publish a cluster or an instance configuration. 
- `tt connect`: added command `\help` to show the help with a list of available commands. @@ -551,9 +559,9 @@ Additionally, several fixes were implemented to improve stability. ### Fixed -- ``tt start`` now does not start an instance if it is already running. -- ``tt rocks`` uses rocks repo path relative to tt environment config location. -- ``tt connect`` now does not crash on `\q` input. +- `tt start` now does not start an instance if it is already running. +- `tt rocks` uses rocks repo path relative to tt environment config location. +- `tt connect` now does not crash on `\q` input. ### Added @@ -562,10 +570,10 @@ Additionally, several fixes were implemented to improve stability. in case of the pattern doesn't contain delimiter `:`, and suitable instances otherwise. - support tt environment directories overriding using environment variables: - * TT_CLI_REPO_ROCKS environment variable value is used as rocks repository + - TT_CLI_REPO_ROCKS environment variable value is used as rocks repository path if it is set and there is no tt.repo.rocks in tt configuration file or tt.repo.rocks directory does not include repository manifest file. - * TT_CLI_TARANTOOL_PREFIX environment variable value is used for as tarantool + - TT_CLI_TARANTOOL_PREFIX environment variable value is used as tarantool installation prefix directory for rocks commands if it is set and tarantool executable is found in PATH. - smart auto-completion for `tt create`. It shows a list of built-in templates @@ -576,39 +584,39 @@ Additionally, several fixes were implemented to improve stability. ### Changed -- ``tt install tarantool`` without version specification now installs the +- `tt install tarantool` without version specification now installs the latest release. -- ``tt install/search tarantool-ee`` now uses credentials from `tarantool.io` +- `tt install/search tarantool-ee` now uses credentials from `tarantool.io` customer zone. Also, installation now requires specifying the version. 
-- ``tt search tarantool-ee`` options changed. A new `--version` flag has been +- `tt search tarantool-ee` options changed. A new `--version` flag has been added to allow search for a specific release. The `--dev` and `--dbg` options have been merged into a single `--debug` option. -- ``tt search`` now uses subcommands for searching tarantool/tarantool-ee/tt +- `tt search` now uses subcommands for searching tarantool/tarantool-ee/tt binaries ### Added -- ``--dynamic`` option for `tt install tarantool` command to build non-static +- `--dynamic` option for `tt install tarantool` command to build non-static tarantool executable. ### Fixed -- ``tt connect`` command does not break a console after executing `os.exit()` +- `tt connect` command does not break a console after executing `os.exit()` command anymore. ## [1.0.2] - 2023-04-21 ### Fixed -- ``tt cartridge`` command takes into account run dir path from the `tt` +- `tt cartridge` command takes into account run dir path from the `tt` environment. So most of the `tt cartridge` sub-commands are able to work without specifying `--run-dir` option. -- ``tt install`` command checks it's write rights to binary and include +- `tt install` command checks its write rights to binary and include directories before installing binaries. ### Changed -- ``tt install/uninstall`` command line interface is updated. Program names +- `tt install/uninstall` command line interface is updated. Program names have become sub-commands with their own options. ## [1.0.1] - 2023-04-04 @@ -616,7 +624,7 @@ Additionally, several fixes were implemented to improve stability. ### Added - A configurable variable `cluster_cookie` for `tt create cartridge` template. -- ``tt build`` accepts application name for building. +- `tt build` accepts application name for building. - Creating wal, vinyl and memtx directories for `tt pack`. 
If these directories are not located in the same directory in the environment for packing, the result package will contain separate snap/vinyl/wal directories for @@ -628,9 +636,9 @@ Additionally, several fixes were implemented to improve stability. ### Changed -- ``tt uninstall`` does not ask version if only one version of a program is +- `tt uninstall` does not ask version if only one version of a program is installed. -- ``tt rocks init`` is disabled. +- `tt rocks init` is disabled. ## [1.0.0] - 2023-03-23 @@ -643,10 +651,10 @@ Additionally, several fixes were implemented to improve stability. log file. This option affects only single instance applications. - An ability to set different directories for WAL, vinyl and snapshots artifacts. -- ``tt instances`` command to print a list of enabled applications. -- SSL options for ``tt connect`` command. +- `tt instances` command to print a list of enabled applications. +- SSL options for `tt connect` command. - An ability to pass arguments to a connect command. -- ``tt binaries`` command. It shows a list of installed binaries and their +- `tt binaries` command. It shows a list of installed binaries and their versions. ### Changed @@ -660,7 +668,7 @@ Additionally, several fixes were implemented to improve stability. ### Fixed -- Output of the ``help`` with all commands. +- Output of the `help` with all commands. - Allow more characters for URI credentials. ## [0.4.0] - 2022-12-31 @@ -668,8 +676,8 @@ Additionally, several fixes were implemented to improve stability. ### Added - Support of rocks repository specified in tt config. -- ``cfg dump`` module. It prints tt environment configuration. -- ``--use-docker`` option for ``tt pack`` for packing environments in docker +- `cfg dump` module. It prints tt environment configuration. +- `--use-docker` option for `tt pack` for packing environments in docker container. - Support of MacOS. @@ -681,10 +689,10 @@ Additionally, several fixes were implemented to improve stability. 
### Added -- ``tt restart`` confirmation prompt. ``-y`` option is added to accept restart +- `tt restart` confirmation prompt. `-y` option is added to accept restart without prompting. -- ``tt pack`` will generate systemd unit for rpm and deb packages. -- ``--use-docker`` option for ``tt install`` to build Tarantool in +- `tt pack` will generate systemd unit for rpm and deb packages. +- `--use-docker` option for `tt install` to build Tarantool in Ubuntu 16.04 container. - Ability to use the `start/stop/restart/status/check` commands without arguments to interact with all instances of the environment simultaneously. @@ -695,16 +703,16 @@ Additionally, several fixes were implemented to improve stability. ### Changed -- ``tt cartridge`` sub-commands ``create``, ``build``, ``pack`` are removed. -- ``remove`` command is renamed to ``uninstall``. -- Updated values in system ``tarantool.yaml`` for ``bin_dir``, ``inc_dir`` - and ``repo: distfiles``. +- `tt cartridge` sub-commands `create`, `build`, `pack` are removed. +- `remove` command is renamed to `uninstall`. +- Updated values in system `tarantool.yaml` for `bin_dir`, `inc_dir` + and `repo: distfiles`. ### Fixed -- Working of the ``help`` module with multi-level commands (commands with +- Working of the `help` module with multi-level commands (commands with several subcommands). -- Using the system ``tarantool.yaml`` when installing from the repository. +- Using the system `tarantool.yaml` when installing from the repository. ## [0.2.1] - 2022-11-24 @@ -717,16 +725,16 @@ Additionally, several fixes were implemented to improve stability. ### Added -- Module ``tt init``, to create tt environment configuration file. -- Module ``tt daemon``, to manage the ``tt`` daemon. +- Module `tt init`, to create tt environment configuration file. +- Module `tt daemon`, to manage the `tt` daemon. - Built-in application templates support. Cartridge application template is added. 
-- Using ``default_cfg`` from ``.tarantoolctl`` for ``tarantool.yaml`` - generation in ``tt init``. +- Using `default_cfg` from `.tarantoolctl` for `tarantool.yaml` + generation in `tt init`. ### Changed -- Modules ``tt start``, ``tt connect`` and ``tt catridge`` now use relative +- Modules `tt start`, `tt connect` and `tt cartridge` now use relative paths for unix sockets. It allows to use socket paths longer than sun_path limit.(108/106 on linux/macOS) e.g foo/bar.sock -> ./bar.sock @@ -734,25 +742,25 @@ Additionally, several fixes were implemented to improve stability. ### Added -- Module ``tt version``, to get information about the version of the CLI. -- Module ``tt completion``, to generate autocompletion for a specified shell. -- Module ``tt help``, to get information about the CLI and its modules. -- Module ``tt start``, responsible for launching the instance according to the +- Module `tt version`, to get information about the version of the CLI. +- Module `tt completion`, to generate autocompletion for a specified shell. +- Module `tt help`, to get information about the CLI and its modules. +- Module `tt start`, responsible for launching the instance according to the application file. -- Module ``tt stop``, responsible for terminating the instance. -- Module ``tt status``, to get information about the state of the instance. 
+- Module `tt restart`, responsible for restarting of the instance. +- Module `tt logrotate`, to rotate instance logs. +- Module `tt check`, to check an application file for syntax errors. +- Module `tt connect`, used to connect to a running instance. +- Module `tt rocks`, LuaRocks package manager. +- Module `tt cat`, to print into stdout the contents of .snap/.xlog files. +- Module `tt play`, to play the contents of .snap/.xlog files to another Tarantool instance. -- Module ``tt coredump``, to pack/unpack/inspect tarantool coredump. -- Module ``tt run``, to start tarantool instance using tt wrapper. -- Module ``tt search``, to show available tt/tarantool versions. -- Module ``tt create``, to create an application from a template. -- Module ``tt build``, to build an application. -- Module ``tt install``, to install tarantool/tt. -- Module ``tt remove``, to remove tarantool/tt. +- Module `tt coredump`, to pack/unpack/inspect tarantool coredump. +- Module `tt run`, to start tarantool instance using tt wrapper. +- Module `tt search`, to show available tt/tarantool versions. +- Module `tt create`, to create an application from a template. +- Module `tt build`, to build an application. +- Module `tt install`, to install tarantool/tt. +- Module `tt remove`, to remove tarantool/tt. 
diff --git a/cli/cmd/tcm.go b/cli/cmd/tcm.go index 9f9f6ec8a..60197063e 100644 --- a/cli/cmd/tcm.go +++ b/cli/cmd/tcm.go @@ -2,19 +2,31 @@ package cmd import ( "errors" + "fmt" + "log" "os" "os/exec" + "path/filepath" "time" + "github.com/jedib0t/go-pretty/v6/table" + "github.com/jedib0t/go-pretty/v6/text" "github.com/spf13/cobra" "github.com/tarantool/tt/cli/cmdcontext" "github.com/tarantool/tt/cli/modules" + "github.com/tarantool/tt/cli/process_utils" tcmCmd "github.com/tarantool/tt/cli/tcm" "github.com/tarantool/tt/cli/util" + libwatchdog "github.com/tarantool/tt/lib/watchdog" ) var tcmCtx = tcmCmd.TcmCtx{} +const ( + tcmPidFile = "tcmPidFile.pid" + watchdogPidFile = "watchdogPidFile.pid" +) + func newTcmStartCmd() *cobra.Command { var tcmCmd = &cobra.Command{ Use: "start", @@ -26,7 +38,6 @@ func newTcmStartCmd() *cobra.Command { cmdCtx.CommandName = cmd.Name() err := modules.RunCmd(&cmdCtx, cmd.CommandPath(), &modulesInfo, internalStartTcm, args) util.HandleCmdErr(cmd, err) - }, } tcmCmd.Flags().StringVar(&tcmCtx.Executable, "path", "", "the path to the tcm binary file") @@ -35,6 +46,35 @@ func newTcmStartCmd() *cobra.Command { return tcmCmd } +func newTcmStatusCmd() *cobra.Command { + var tcmCmd = &cobra.Command{ + Use: "status", + Short: "Status tcm application", + Long: `Status to the tcm. + tt tcm status`, + Run: func(cmd *cobra.Command, args []string) { + cmdCtx.CommandName = cmd.Name() + err := modules.RunCmd(&cmdCtx, cmd.CommandPath(), &modulesInfo, internalTcmStatus, args) + util.HandleCmdErr(cmd, err) + }, + } + return tcmCmd +} + +func newTcmStopCmd() *cobra.Command { + var tcmCmd = &cobra.Command{ + Use: "stop", + Short: "Stop tcm application", + Long: `Stop to the tcm. 
tt tcm stop`, + Run: func(cmd *cobra.Command, args []string) { + cmdCtx.CommandName = cmd.Name() + err := modules.RunCmd(&cmdCtx, cmd.CommandPath(), &modulesInfo, internalTcmStop, args) + util.HandleCmdErr(cmd, err) + }, + } + return tcmCmd +} + func NewTcmCmd() *cobra.Command { var tcmCmd = &cobra.Command{ Use: "tcm", @@ -42,6 +82,8 @@ func NewTcmCmd() *cobra.Command { } tcmCmd.AddCommand( newTcmStartCmd(), + newTcmStatusCmd(), + newTcmStopCmd(), ) return tcmCmd } @@ -49,26 +91,28 @@ func NewTcmCmd() *cobra.Command { func startTcmInteractive() error { tcmApp := exec.Command(tcmCtx.Executable) - tcmApp.Stdout = os.Stdout - tcmApp.Stderr = os.Stderr - - if err := tcmApp.Run(); err != nil { + if err := tcmApp.Start(); err != nil { return err } - return nil -} + if tcmApp == nil || tcmApp.Process == nil { + return errors.New("process is not running") + } -func startTcmUnderWatchDog() error { - wd, err := tcmCmd.NewWatchdog(5 * time.Second) + err := process_utils.CreatePIDFile(tcmPidFile, tcmApp.Process.Pid) if err != nil { return err } + log.Printf("(INFO): Interactive process PID %d written to %s\n", tcmApp.Process.Pid, tcmPidFile) + return nil +} + +func startTcmUnderWatchDog() error { + wd := libwatchdog.NewWatchdog(tcmPidFile, watchdogPidFile, 5*time.Second) if err := wd.Start(tcmCtx.Executable); err != nil { return err } - return nil } @@ -87,11 +131,61 @@ func internalStartTcm(cmdCtx *cmdcontext.CmdCtx, args []string) error { if err := startTcmInteractive(); err != nil { return err } + } else { + if err := startTcmUnderWatchDog(); err != nil { + return err + } } - if err := startTcmUnderWatchDog(); err != nil { + return nil +} + +func internalTcmStatus(cmdCtx *cmdcontext.CmdCtx, args []string) error { + pidAbsPath, err := filepath.Abs(tcmPidFile) + if err != nil { return err } + if _, err := os.Stat(pidAbsPath); err != nil { + return fmt.Errorf("path does not exist: %v", err) + } + + ts := table.NewWriter() + ts.SetOutputMirror(os.Stdout) + + ts.AppendHeader( + 
table.Row{"APPLICATION", "STATUS", "PID"}) + + ts.SetColumnConfigs([]table.ColumnConfig{ + {Number: 1, Align: text.AlignLeft, AlignHeader: text.AlignLeft}, + {Number: 2, Align: text.AlignLeft, AlignHeader: text.AlignLeft}, + {Number: 3, Align: text.AlignLeft, AlignHeader: text.AlignLeft}, + {Number: 4, Align: text.AlignLeft, AlignHeader: text.AlignLeft}, + }) + + status := process_utils.ProcessStatus(pidAbsPath) + + ts.AppendRows([]table.Row{ + {"TCM", status.Status, status.PID}, + }) + ts.Render() + return nil +} + +func internalTcmStop(cmdCtx *cmdcontext.CmdCtx, args []string) error { + if isExists, _ := process_utils.ExistsAndRecord(watchdogPidFile); isExists { + _, err := process_utils.StopProcess(watchdogPidFile) + if err != nil { + return err + } + log.Println("Watchdog and TCM stoped") + } else { + _, err := process_utils.StopProcess(tcmPidFile) + if err != nil { + return err + } + log.Println("TCM stoped") + } + return nil } diff --git a/cli/process_utils/process_utils.go b/cli/process_utils/process_utils.go index c49e274a1..75a0ae7d1 100644 --- a/cli/process_utils/process_utils.go +++ b/cli/process_utils/process_utils.go @@ -104,6 +104,27 @@ func CheckPIDFile(pidFileName string) error { return nil } +// ExistsAndRecord checks if the process with the given pidFileName exists and is alive. +// If it does, returns true, otherwise returns false. +// If something went wrong while trying to read the PID file, returns an error. +func ExistsAndRecord(pidFileName string) (bool, error) { + if _, err := os.Stat(pidFileName); err == nil { + // The PID file already exists. We have to check if the process is alive. + pid, err := GetPIDFromFile(pidFileName) + if err != nil { + return false, fmt.Errorf(`pID file exists, but PID can't be read. Error: "%v"`, err) + } + if res, _ := IsProcessAlive(pid); res { + return true, nil + } + } else if !os.IsNotExist(err) { + return false, fmt.Errorf(`something went wrong while trying to read the PID file. 
Error: "%v"`, + err) + } + + return false, nil +} + // CreatePIDFile checks that the instance PID file is absent or // deprecated and creates a new one. Returns an error on failure. func CreatePIDFile(pidFileName string, pid int) error { diff --git a/cli/tcm/tcm.go b/cli/tcm/tcm.go index cd03a882b..67417fdf5 100644 --- a/cli/tcm/tcm.go +++ b/cli/tcm/tcm.go @@ -2,5 +2,8 @@ package tcm type TcmCtx struct { Executable string - Watchdog bool + TcmPidFile string + + Watchdog bool + WathdogPidFile string } diff --git a/lib/watchdog/watchdog.go b/lib/watchdog/watchdog.go new file mode 100644 index 000000000..44a1bc225 --- /dev/null +++ b/lib/watchdog/watchdog.go @@ -0,0 +1,239 @@ +package watchdog + +import ( + "context" + "errors" + "log" + "os" + "os/exec" + "os/signal" + "sync" + "sync/atomic" + "syscall" + "time" + + "github.com/tarantool/tt/cli/process_utils" +) + +type Watchdog struct { + cmd *exec.Cmd + restartTimeout time.Duration + shouldStop atomic.Bool + doneBarrier sync.WaitGroup + pidFile string + wdPidFile string + + cmdMutex sync.Mutex + pidFileMutex sync.Mutex + signalChan chan os.Signal + processGroupPID atomic.Int32 + startupComplete chan struct{} +} + +// NewWatchdog initializes a new Watchdog instance with the specified +// PID file paths and restart timeout duration. It sets up channels +// for signal notification and startup completion. Returns a pointer +// to the created Watchdog. +func NewWatchdog(pidFile, wdPidFile string, restartTimeout time.Duration) *Watchdog { + return &Watchdog{ + pidFile: pidFile, + wdPidFile: wdPidFile, + restartTimeout: restartTimeout, + signalChan: make(chan os.Signal, 1), + startupComplete: make(chan struct{}), + } +} + +// Start begins monitoring and managing the target process. +// It handles process execution, restart logic, and signal processing. 
+func (wd *Watchdog) Start(bin string, args ...string) error { + // Register this call on doneBarrier; Stop waits on it before returning + wd.doneBarrier.Add(1) + // Ensure we decrement wait group when done + defer wd.doneBarrier.Done() + + // Create context for graceful shutdown + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() // Ensure context is canceled when we exit + + // Register signal handler for termination signals + signal.Notify(wd.signalChan, syscall.SIGINT, syscall.SIGTERM) + // Clean up signal handlers when done + defer signal.Stop(wd.signalChan) + + // Signal handling goroutine: handles at most one signal, then exits + go func() { + select { + case sig := <-wd.signalChan: + // Only process signal if not already stopping + if !wd.shouldStop.Load() { + log.Printf("(INFO): Received signal: %v", sig) + wd.Stop() + } + case <-ctx.Done(): + } + }() + + // Main loop: start the process, wait for it, restart after restartTimeout + for { + // Check if we should stop before each iteration + if wd.shouldStop.Load() { + return nil + } + + // Start the managed process + wd.cmdMutex.Lock() + wd.cmd = exec.Command(bin, args...) 
+ // Create new process group for proper signal handling + wd.cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + + // Start the process + if err := wd.cmd.Start(); err != nil { + wd.cmdMutex.Unlock() + log.Printf("(ERROR): Failed to start process: %v", err) + return err + } + + // With Setpgid the child's PID is also its process group ID; keep it for terminateProcess + wd.processGroupPID.Store(int32(wd.cmd.Process.Pid)) + wd.cmdMutex.Unlock() + + // Write PID files after successful start + if err := wd.writePIDFiles(); err != nil { + log.Printf("(ERROR): Failed to write PID files: %v", err) + _ = wd.terminateProcess() // Clean up if PID files fail + return err + } + + log.Println("(INFO): Process started successfully") + close(wd.startupComplete) // Signal that startup is complete + + // Wait for process completion in separate goroutine (wd.cmd is read here without cmdMutex) + waitChan := make(chan error, 1) + go func() { waitChan <- wd.cmd.Wait() }() + + select { + case err := <-waitChan: + // Check for stop signal after process exits + if wd.shouldStop.Load() { + return nil + } + + // Handle process exit status + if err != nil { + if errors.As(err, new(*exec.ExitError)) { + log.Printf("(WARN): Process exited with error: %v", err) + } else { + log.Printf("(ERROR): Process failed: %v", err) + return err + } + } else { + log.Println("(INFO): Process completed successfully.") + } + + case <-ctx.Done(): + // Context canceled - terminate process. NOTE(review): cancel only runs via the defer above, so this branch looks unreachable - confirm + _ = wd.terminateProcess() + return nil + } + + // Check stop condition again before restart + if wd.shouldStop.Load() { + return nil + } + + // Wait before restarting + log.Printf("(INFO): Waiting %s before restart...", wd.restartTimeout) + select { + case <-time.After(wd.restartTimeout): + // Continue to next iteration after timeout + case <-ctx.Done(): + // Exit if context canceled during wait + return nil + } + + // Reset startup complete channel for next iteration. NOTE(review): not synchronized with Stop's reads of wd.startupComplete + wd.startupComplete = make(chan struct{}) + } +} + +// Stop initiates a graceful shutdown of the Watchdog and its managed process. 
+// It ensures all resources are properly cleaned up and goroutines are terminated. +func (wd *Watchdog) Stop() { + // Atomically set shouldStop flag to prevent multiple concurrent stops + // CompareAndSwap ensures only one goroutine can execute the stop sequence + if !wd.shouldStop.CompareAndSwap(false, true) { + return // Already stopping or stopped + } + + // Ensure process startup is complete before attempting to stop + // This prevents races during process initialization + select { + case <-wd.startupComplete: + // Normal case - startup already completed + default: + // Startup still in progress - wait for completion + log.Println("(INFO): Waiting for process startup...") + <-wd.startupComplete + } + + // Terminate the managed process + _ = wd.terminateProcess() + + // Clean up signal handling + signal.Stop(wd.signalChan) + close(wd.signalChan) + + // Wait for all goroutines to complete + // This ensures we don't exit while signal handlers are still running + wd.doneBarrier.Wait() + + // Final log message indicating successful shutdown + log.Println("(INFO): Watchdog stopped.") +} + +// terminateProcess sends a termination signal to the managed process. +func (wd *Watchdog) terminateProcess() error { + wd.cmdMutex.Lock() + defer wd.cmdMutex.Unlock() + + if wd.cmd == nil || wd.cmd.Process == nil { + return nil + } + + log.Println("(INFO): Stopping process...") + + pgid := int(wd.processGroupPID.Load()) + + // Send SIGTERM to entire process group if available (preferred method) + if pgid > 0 { + return syscall.Kill(-pgid, syscall.SIGTERM) + } + + return wd.cmd.Process.Signal(syscall.SIGTERM) +} + +// writePIDFiles creates PID files for both the monitored process and the watchdog itself. 
+func (wd *Watchdog) writePIDFiles() error { + wd.pidFileMutex.Lock() + defer wd.pidFileMutex.Unlock() + + if wd.cmd == nil || wd.cmd.Process == nil { + return errors.New("process is not running") + } + + if err := process_utils.CreatePIDFile(wd.pidFile, wd.cmd.Process.Pid); err != nil { + return err + } + log.Printf("(INFO): Process PID %d written to %s", wd.cmd.Process.Pid, wd.pidFile) + + if isExistsAndRecord, _ := process_utils.ExistsAndRecord(wd.wdPidFile); !isExistsAndRecord { + if err := process_utils.CreatePIDFile(wd.wdPidFile, os.Getpid()); err != nil { + return err + } + } + + log.Printf("(INFO): Watchdog PID %d written to %s", os.Getpid(), wd.wdPidFile) + + return nil +} diff --git a/lib/watchdog/watchdog_test.go b/lib/watchdog/watchdog_test.go new file mode 100644 index 000000000..ec6be41f5 --- /dev/null +++ b/lib/watchdog/watchdog_test.go @@ -0,0 +1,124 @@ +package watchdog + +import ( + "os" + "os/exec" + "path/filepath" + "syscall" + "testing" + "time" +) + +// TestWatchdog_StartStop tests that the watchdog starts a process, creates a PID +// file, and stops the process when asked to. +func TestWatchdog_StartStop(t *testing.T) { + pidFile := filepath.Join(t.TempDir(), "test.pid") + wdPidFile := filepath.Join(t.TempDir(), "watchdog.pid") + + wd := NewWatchdog(pidFile, wdPidFile, 100*time.Millisecond) + + err := wd.Start("sleep", "1") + if err != nil { + t.Fatalf("Start failed: %v", err) + } + + time.Sleep(100 * time.Millisecond) + + wd.Stop() + + if _, err := os.Stat(pidFile); os.IsNotExist(err) { + t.Error("PID file not created") + } + if _, err := os.Stat(wdPidFile); os.IsNotExist(err) { + t.Error("Watchdog PID file not created") + } +} + +// TestWatchdog_SignalHandling tests that the watchdog can handle system signals. +// It verifies that sending a SIGTERM signal to the watchdog's signal channel +// causes the watchdog to stop the monitored process within the expected time frame. 
+func TestWatchdog_SignalHandling(t *testing.T) { + pidFile := filepath.Join(t.TempDir(), "test.pid") + wdPidFile := filepath.Join(t.TempDir(), "watchdog.pid") + + wd := NewWatchdog(pidFile, wdPidFile, time.Second) + + go func() { + err := wd.Start("sleep", "10") + if err != nil { + t.Logf("Start exited with: %v", err) + } + }() + + time.Sleep(100 * time.Millisecond) + + wd.signalChan <- syscall.SIGTERM + + select { + case <-time.After(500 * time.Millisecond): + t.Error("Watchdog didn't stop on SIGTERM") + default: + } +} + +// TestWatchdog_TerminateProcess verifies that the watchdog's terminateProcess +// function successfully kills the monitored process and its process group. +func TestWatchdog_TerminateProcess(t *testing.T) { + wd := &Watchdog{ + pidFile: filepath.Join(t.TempDir(), "test.pid"), + wdPidFile: filepath.Join(t.TempDir(), "watchdog.pid"), + restartTimeout: time.Second, + } + + cmd := exec.Command("sleep", "10") + if err := cmd.Start(); err != nil { + t.Fatalf("Failed to start test process: %v", err) + } + defer cmd.Process.Kill() + + wd.cmd = cmd + wd.processGroupPID.Store(int32(cmd.Process.Pid)) + + if err := wd.terminateProcess(); err != nil { + t.Errorf("terminateProcess failed: %v", err) + } + + _, err := cmd.Process.Wait() + if err == nil { + t.Error("Process was not terminated") + } +} + +// TestWatchdog_WritePIDFiles verifies that the Watchdog's writePIDFiles +// method successfully creates the expected PID files for both the monitored +// process and the watchdog itself. It starts a test process, assigns it to +// the watchdog, and checks if the PID files are correctly created in the +// specified temporary directories. 
+func TestWatchdog_WritePIDFiles(t *testing.T) { + pidFile := filepath.Join(t.TempDir(), "test.pid") + wdPidFile := filepath.Join(t.TempDir(), "watchdog.pid") + + wd := &Watchdog{ + pidFile: pidFile, + wdPidFile: wdPidFile, + } + + cmd := exec.Command("sleep", "1") + if err := cmd.Start(); err != nil { + t.Fatalf("Failed to start test process: %v", err) + } + defer cmd.Process.Kill() + + wd.cmd = cmd + + if err := wd.writePIDFiles(); err != nil { + t.Errorf("writePIDFiles failed: %v", err) + } + + if _, err := os.Stat(pidFile); os.IsNotExist(err) { + t.Error("Process PID file not created") + } + if _, err := os.Stat(wdPidFile); os.IsNotExist(err) { + t.Error("Watchdog PID file not created") + } +} diff --git a/test/integration/tcm/test_tcm.py b/test/integration/tcm/test_tcm.py index f5613422f..d7d89b3d4 100644 --- a/test/integration/tcm/test_tcm.py +++ b/test/integration/tcm/test_tcm.py @@ -1,34 +1,76 @@ +<<<<<<< HEAD from subprocess import PIPE, Popen +======= +import os +from subprocess import PIPE, STDOUT, Popen, run +>>>>>>> d913784 (tcm: add the tt tcm status command) from utils import skip_if_tarantool_ce, wait_for_lines_in_output TcmStartCommand = ("tcm", "start") TcmStartWatchdogCommand = ("tcm", "start", "--watchdog") +TcmStatusCommand = ("tcm", "status") +TcmStopCommand = ("tcm", "stop") -def test_tcm_start_success(tt_cmd): +def test_tcm_start_success(tt_cmd, tmp_path): skip_if_tarantool_ce() - cmd = [str(tt_cmd), *TcmStartCommand] - print(f"Run: {' '.join(cmd)}") + start_cmd = [tt_cmd, *TcmStartCommand] + print(f"Run: {start_cmd}") tcm = Popen( - cmd, + start_cmd, + cwd=tmp_path, + text=True, + encoding="utf-8", + stdout=PIPE, + stderr=STDOUT, + ) + + output = wait_for_lines_in_output(tcm.stdout, ["(INFO):Process PID"]) + + assert tcm.pid + + with open(os.path.join(tmp_path, 'tcmPidFile.pid'), 'r') as f: + tcm_pid = f.read().strip() + assert f'(INFO): Interactive process PID {tcm_pid} written to tcmPidFile.pid' in output.strip() + + cmdStatus = 
[str(tt_cmd), *TcmStatusCommand] + print(f"Run: {' '.join(cmdStatus)}") + + status = Popen( + cmdStatus, + cwd=tmp_path, text=True, encoding="utf-8", stdout=PIPE, - stderr=PIPE, + stderr=STDOUT, ) - wait_for_lines_in_output(tcm.stdout, ["TCM_CLUSTER_CONNECTION_RATE_LIMIT"]) - tcm.terminate() - tcm.wait() + output = wait_for_lines_in_output(status.stdout, ["TCM", "RUNNING"]) + assert "RUNNING" in output + + cmdStop = [str(tt_cmd), *TcmStopCommand] + print(f"Run: {' '.join(cmdStop)}") + + stop = Popen( + cmdStop, + cwd=tmp_path, + text=True, + encoding="utf-8", + stdout=PIPE, + stderr=STDOUT, + ) + + output = wait_for_lines_in_output(stop.stdout, ["TCM"]) + assert "TCM stoped" in output.strip() assert tcm.poll() is not None -def test_tcm_start_with_watchdog_success(tt_cmd): +def test_tcm_start_with_watchdog_success(tt_cmd, tmp_path): skip_if_tarantool_ce() cmd = [str(tt_cmd), *TcmStartWatchdogCommand] @@ -36,19 +78,68 @@ def test_tcm_start_with_watchdog_success(tt_cmd): tcm = Popen( cmd, + cwd=tmp_path, text=True, encoding="utf-8", stdout=PIPE, - stderr=PIPE, + stderr=STDOUT, ) - wait_for_lines_in_output(tcm.stdout, ["connecting to storage..."]) - tcm.terminate() - tcm.wait() + output = wait_for_lines_in_output(tcm.stdout, ["(INFO): Process started successfully"]) + assert "(INFO): Process started successfully" in output.strip() - assert tcm.pid is not None + cmdStatus = [str(tt_cmd), *TcmStatusCommand] + print(f"Run: {' '.join(cmdStatus)}") + + status = run( + cmdStatus, + cwd=tmp_path, + text=True, + encoding="utf-8", + stdout=PIPE, + stderr=STDOUT, + ) + + with open(os.path.join(tmp_path, 'tcmPidFile.pid'), 'r') as f: + tcm_pid = f.read().strip() + + assert "TCM" and "RUNNING" and tcm_pid in status.stdout tcm.terminate() tcm.wait() + assert tcm.pid is not None assert tcm.poll() is not None + + skip_if_tarantool_ce() + + start_cmd = [tt_cmd, *TcmStartCommand] + print(f"Run: {start_cmd}") + + tcm = Popen( + start_cmd, + cwd=tmp_path, + text=True, + encoding="utf-8", 
+ stdout=PIPE, + stderr=STDOUT, + ) + + output = wait_for_lines_in_output(tcm.stdout, ["(INFO):Process PID"]) + assert tcm.pid + + with open(os.path.join(tmp_path, 'tcmPidFile.pid'), 'r') as f: + tcm_pid = f.read().strip() + assert f'(INFO): Interactive process PID {tcm_pid} written to tcmPidFile.pid' in output.strip() + + tcmDouble = Popen( + start_cmd, + cwd=tmp_path, + text=True, + encoding="utf-8", + stdout=PIPE, + stderr=STDOUT, + ) + + output = wait_for_lines_in_output(tcmDouble.stdout, ["(INFO):Process PID"]) + assert tcm.pid From 544bbc1486231deead6c1b5da552c62aace023c0 Mon Sep 17 00:00:00 2001 From: AlexandrLitkevich Date: Tue, 15 Apr 2025 16:02:29 +0400 Subject: [PATCH 2/2] tcm: add tt tcm stop command @TarantoolBot document Title: add the tt tcm stop command. Add command tt tcm stop to gracefully terminate TCM: - Works in both modes(watchdog and interactive) - Stops all subprocesses Closes #TNTP-1993 --- CHANGELOG.md | 199 ++++++++++++------------ cli/cmd/tcm.go | 28 +--- cli/process_utils/process_utils.go | 6 +- cli/process_utils/process_utils_test.go | 36 +++++ cli/tcm/tcm.go | 8 +- cli/tcm/watchdog.go | 158 ------------------- cli/tcm/watchdog_test.go | 69 -------- lib/watchdog/watchdog.go | 99 +++++++----- lib/watchdog/watchdog_test.go | 145 ++++++++++------- test/integration/tcm/test_tcm.py | 14 +- test/utils.py | 8 + 11 files changed, 304 insertions(+), 466 deletions(-) create mode 100644 cli/process_utils/process_utils_test.go delete mode 100644 cli/tcm/watchdog.go delete mode 100644 cli/tcm/watchdog_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c81aeb53..7649c892d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,13 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added -- `tt aeon connect`: added tests for connect file/app. -- `tt pack `: support `.packignore` file to specify files that should not be included - in package (works the same as `.gitignore`). 
-- `tt tcm start`: add the tcm command. -- `tt tcm start` OR `tt tcm start --path /path/to/tcm`: added the capability to run TCM in interactive mode. -- `tt tcm start --watchdog`: implemented Watchdog mode for automatic restarting of TCM upon unexpected termination. - `tt tcm status`: added command to check TCM runtime status (modes: `watchdog` or `interactive`). +- `tt tcm stop`: add command for graceful termination of TCM processes (modes: `watchdog` or `interactive`). ### Changed @@ -121,16 +116,16 @@ continues to improve `tt play` command. - `tt aeon connect`: add support to connect Aeon database. - `tt play`: support of the SSL parameters by using next flags: - - `sslkeyfile` - path to a private SSL key file, - - `sslcertfile` - path to an SSL certificate file, - - `sslcafile` - path to a trusted certificate authorities (CA) file, - - `sslciphers` - colon-separated list of SSL cipher suites the connection. + * `sslkeyfile` - path to a private SSL key file, + * `sslcertfile` - path to an SSL certificate file, + * `sslcafile` - path to a trusted certificate authorities (CA) file, + * `sslciphers` - colon-separated list of SSL cipher suites the connection. - `tt play`: support connection to a target instance by `application` name or `application:instance` name. - `tt coredump pack`: add options to customize coredump packing: - - `-e (--executable)`: specify Tarantool executable path. - - `-p (--pid)`: specify PID of the dumped process. - - `-t (--time)`: specify time of dump (seconds since the Epoch). + * `-e (--executable)`: specify Tarantool executable path. + * `-p (--pid)`: specify PID of the dumped process. + * `-t (--time)`: specify time of dump (seconds since the Epoch). - `tt.yaml`: allows to specify a list of modules directories. - Environment variable TT_CLI_MODULES_PATH can be used to specify an extra path with modules. @@ -138,8 +133,8 @@ continues to improve `tt play` command. 
### Changed - `tt stop/kill/clean/logrotate`: no longer need: - - Instances scripts for multi-instance applications. - - Cluster config for tarantool3-based cluster applications. + * Instances scripts for multi-instance applications. + * Cluster config for tarantool3-based cluster applications. - `tt logrotate`: don't exit at non-running instance, just warn and proceed with the other instances, like `tt stop` and `tt kill` do. - `tt coredump pack`: if `-e` option is omitted first search tarantool @@ -165,16 +160,16 @@ The release introduces `upgrade` and `downgrade` subcommands for - `tt replicaset downgrade`: command to downgrade the schema on a Tarantool cluster. - - `-v (--version)`: (required) specify schema version to downgrade to. - - `-r (--replicaset)`: specify the replicaset name(s) to downgrade. - - `-t (--timeout)`: timeout for waiting the LSN synchronization (in seconds) + * `-v (--version)`: (required) specify schema version to downgrade to. + * `-r (--replicaset)`: specify the replicaset name(s) to downgrade. + * `-t (--timeout)`: timeout for waiting the LSN synchronization (in seconds) (default 5). - `tt replicaset upgrade`: command to upgrade the schema on a Tarantool cluster. - - `-r (--replicaset)`: specify the replicaset name(s) to upgrade. - - `-t (--timeout)`: timeout for waiting the LSN synchronization (in seconds) + * `-r (--replicaset)`: specify the replicaset name(s) to upgrade. + * `-t (--timeout)`: timeout for waiting the LSN synchronization (in seconds) (default 5). - - supports upgrading the database schema on remote cluster by upgrading + * supports upgrading the database schema on remote cluster by upgrading each replicaset individually using `tt replicaset upgrade `. - New flag `--timestamp` of `tt cat` and `tt play` commands is added to specify operations ending with the given timestamp. This value can be specified @@ -216,7 +211,7 @@ Additionally, several fixes were implemented to improve stability. 
### Added - `tt status`: display `config`, `box`, and `replication upstream` statuses. - - `--details`: display detailed reports of errors and warnings from + * `--details`: display detailed reports of errors and warnings from instances. - `tt stop` confirmation prompt. `-y` option is added to accept stop without prompting. @@ -247,8 +242,8 @@ Additionally, several fixes were implemented to improve stability. ### Added - `tt log`: a module for viewing instances logs. Supported options: - - `--lines` number of lines to print. - - `--follow` print appended data as log files grow. + * `--lines` number of lines to print. + * `--follow` print appended data as log files grow. - `tt connect`: support format for Tarantool tuples for Tarantool versions >= 3.2. - `tt enable`: create a symbolic link in 'instances_enabled' directory to @@ -274,7 +269,7 @@ Additionally, several fixes were implemented to improve stability. ### Changed -- Do not create Dockerfile.\* in application's directory. +- Do not create Dockerfile.* in application's directory. ## [2.3.1] - 2024-06-13 @@ -288,11 +283,11 @@ Additionally, several fixes were implemented to improve stability. - `tt status`: displays the mode of the instance. - `tt coredump`: enhances coredump inspection: - - `tt coredump pack`: puts gdb.sh and GDB-extensions into the archive so + * `tt coredump pack`: puts gdb.sh and GDB-extensions into the archive so that it contains everything necessary for convenient coredump inspection. - - `tt coredump inspect`: allows archive path as an argument (archive should + * `tt coredump inspect`: allows archive path as an argument (archive should be created with `tt coredump pack`). - - `tt coredump inspect`: added `-s` option to specify the location of + * `tt coredump inspect`: added `-s` option to specify the location of tarantool sources. - `tt cluster publish`: ability to publish a new instance config. 
- `tt pack` does not create unnecessary directories and removes files that are @@ -306,7 +301,7 @@ Additionally, several fixes were implemented to improve stability. file. - `tt pack`: systemd unit parameterizing support. - `tt replicaset vshard`: module to manage vshard in the tarantool replicaset. - - `tt replicaset vshard bootstrap`: command to bootstrap vshard. + * `tt replicaset vshard bootstrap`: command to bootstrap vshard. ### Fixed @@ -329,9 +324,9 @@ Additionally, several fixes were implemented to improve stability. replicaset with cluster config (3.0) orchestrator. - `tt cluster replicaset`: module to manage replicaset via 3.0 cluster config storage. - - `tt cluster replicaset promote`: command to promote an instance in + * `tt cluster replicaset promote`: command to promote an instance in the replicaset. - - `tt cluster replicaset demote`: command to demote an instance in + * `tt cluster replicaset demote`: command to demote an instance in the replicaset. - `tt connect --binary`: connect to instance using binary port. - `tt kill`: command to stop instance(s) with SIGQUIT and SIGKILL signals. @@ -391,12 +386,12 @@ Additionally, several fixes were implemented to improve stability. ### Added -- Module `tt replicaset`, to manage replicasets: - - `tt replicaset status` to show a cluster status information. +- Module ``tt replicaset``, to manage replicasets: + - ``tt replicaset status`` to show a cluster status information. ### Changed -- Disable `tt run` tarantool flag parsing. +- Disable ``tt run`` tarantool flag parsing. ### Fixed @@ -410,8 +405,8 @@ Additionally, several fixes were implemented to improve stability. - `tt connect` auto-completion shows directories and files when there are no running apps. - `tt rocks --server` now accepts several URL's. -- Disable `tt run` tarantool flag parsing. -- Now `tt run` starts instance without our wrapper. +- Disable ``tt run`` tarantool flag parsing. +- Now ``tt run`` starts instance without our wrapper. 
### Added @@ -426,7 +421,7 @@ Additionally, several fixes were implemented to improve stability. - Print log messages to stderr. - Global flags are required to be positioned only before child - commands. Example: `tt --cfg tt.yaml install tt`. + commands. Example: ``tt --cfg tt.yaml install tt``. - tt config format: separate tt environment options from application options. - tt version: additional version information for non-release builds. - Working directory is changed to an application source directory. @@ -447,7 +442,7 @@ Additionally, several fixes were implemented to improve stability. - Log rotation functionality and configuration is removed from `tt`. `tt logrotate` command re-opens a log file and sends SIGHUP to the child `tarantool` processes. -- `tt cat`: all diagnostic messages are printed to stderr. +- ``tt cat``: all diagnostic messages are printed to stderr. - Print `tarantool` stdout/stderr and watchdog logs to the same log file - `tt.log`. @@ -455,16 +450,16 @@ Additionally, several fixes were implemented to improve stability. - tt completion: added luarocks completions. - tarantool-ee: search and install development builds. -- `tt play`: ability to pass username and password via flags and environment +- ``tt play``: ability to pass username and password via flags and environment variables. - tt cluster: credentials could be passed via environment variables and command flags. ### Fixed -- `tt rocks`: broken `--verbose` option. -- `tt binaries`: tarantool-ee binaries not shown. -- `tt cluster`: recognize app:instance as a etcd URL. +- ``tt rocks``: broken ``--verbose`` option. +- ``tt binaries``: tarantool-ee binaries not shown. +- ``tt cluster``: recognize app:instance as a etcd URL. ## [1.3.0] - 2023-09-28 @@ -472,15 +467,15 @@ Additionally, several fixes were implemented to improve stability. - Use CLI arg connect string for the prompt line and the title to avoid too long prompt line when using 'app:instance' target format. 
-- `tt rocks`: luarocks version has been updated to 3.9.2. +- ``tt rocks``: luarocks version has been updated to 3.9.2. ### Added - `tt install tarantool/tt`: ability to install tarantool and tt from an - arbitrary commit. The binary has the name tt/tarantool\_ + seven-digit hash. + arbitrary commit. The binary has the name tt/tarantool_ + seven-digit hash. - New `tt pack` flag `--tarantool-version` is added to specify tarantool version for pack in docker. It is supported only with `--use-docker` enabled. -- Module `tt cluster`, to show or publish a cluster or an instance +- Module ``tt cluster``, to show or publish a cluster or an instance configuration. - `tt connect`: added command `\help` to show the help with a list of available commands. @@ -559,9 +554,9 @@ Additionally, several fixes were implemented to improve stability. ### Fixed -- `tt start` now does not start an instance if it is already running. -- `tt rocks` uses rocks repo path relative to tt environment config location. -- `tt connect` now does not crash on `\q` input. +- ``tt start`` now does not start an instance if it is already running. +- ``tt rocks`` uses rocks repo path relative to tt environment config location. +- ``tt connect`` now does not crash on `\q` input. ### Added @@ -570,10 +565,10 @@ Additionally, several fixes were implemented to improve stability. in case of the pattern doesn't contain delimiter `:`, and suitable instances otherwise. - support tt environment directories overriding using environment variables: - - TT_CLI_REPO_ROCKS environment variable value is used as rocks repository + * TT_CLI_REPO_ROCKS environment variable value is used as rocks repository path if it is set and there is no tt.repo.rocks in tt configuration file or tt.repo.rocks directory does not include repository manifest file. 
- - TT_CLI_TARANTOOL_PREFIX environment variable value is used for as tarantool + * TT_CLI_TARANTOOL_PREFIX environment variable value is used for as tarantool installation prefix directory for rocks commands if it is set and tarantool executable is found in PATH. - smart auto-completion for `tt create`. It shows a list of built-in templates @@ -584,39 +579,39 @@ Additionally, several fixes were implemented to improve stability. ### Changed -- `tt install tarantool` without version specification now installs the +- ``tt install tarantool`` without version specification now installs the latest release. -- `tt install/search tarantool-ee` now uses credentials from `tarantool.io` +- ``tt install/search tarantool-ee`` now uses credentials from `tarantool.io` customer zone. Also, installation now requires specifying the version. -- `tt search tarantool-ee` options changed. A new `--version` flag has been +- ``tt search tarantool-ee`` options changed. A new `--version` flag has been added to allow search for a specific release. The `--dev` and `--dbg` options have been merged into a single `--debug` option. -- `tt search` now uses subcommands for searching tarantool/tarantool-ee/tt +- ``tt search`` now uses subcommands for searching tarantool/tarantool-ee/tt binaries ### Added -- `--dynamic` option for `tt install tarantool` command to build non-static +- ``--dynamic`` option for `tt install tarantool` command to build non-static tarantool executable. ### Fixed -- `tt connect` command does not break a console after executing `os.exit()` +- ``tt connect`` command does not break a console after executing `os.exit()` command anymore. ## [1.0.2] - 2023-04-21 ### Fixed -- `tt cartridge` command takes into account run dir path from the `tt` +- ``tt cartridge`` command takes into account run dir path from the `tt` environment. So most of the `tt cartridge` sub-commands are able to work without specifying `--run-dir` option. 
-- `tt install` command checks it's write rights to binary and include +- ``tt install`` command checks it's write rights to binary and include directories before installing binaries. ### Changed -- `tt install/uninstall` command line interface is updated. Program names +- ``tt install/uninstall`` command line interface is updated. Program names have become sub-commands with their own options. ## [1.0.1] - 2023-04-04 @@ -624,7 +619,7 @@ Additionally, several fixes were implemented to improve stability. ### Added - A configurable variable `cluster_cookie` for `tt create cartridge` template. -- `tt build` accepts application name for building. +- ``tt build`` accepts application name for building. - Creating wal, vinyl and memtx directories for `tt pack`. If these directories are not located in the same directory in the environment for packing, the result package will contain separate snap/vinyl/wal directories for @@ -636,9 +631,9 @@ Additionally, several fixes were implemented to improve stability. ### Changed -- `tt uninstall` does not ask version if only one version of a program is +- ``tt uninstall`` does not ask version if only one version of a program is installed. -- `tt rocks init` is disabled. +- ``tt rocks init`` is disabled. ## [1.0.0] - 2023-03-23 @@ -651,10 +646,10 @@ Additionally, several fixes were implemented to improve stability. log file. This option affects only single instance applications. - An ability to set different directories for WAL, vinyl and snapshots artifacts. -- `tt instances` command to print a list of enabled applications. -- SSL options for `tt connect` command. +- ``tt instances`` command to print a list of enabled applications. +- SSL options for ``tt connect`` command. - An ability to pass arguments to a connect command. -- `tt binaries` command. It shows a list of installed binaries and their +- ``tt binaries`` command. It shows a list of installed binaries and their versions. 
### Changed @@ -668,7 +663,7 @@ Additionally, several fixes were implemented to improve stability. ### Fixed -- Output of the `help` with all commands. +- Output of the ``help`` with all commands. - Allow more characters for URI credentials. ## [0.4.0] - 2022-12-31 @@ -676,8 +671,8 @@ Additionally, several fixes were implemented to improve stability. ### Added - Support of rocks repository specified in tt config. -- `cfg dump` module. It prints tt environment configuration. -- `--use-docker` option for `tt pack` for packing environments in docker +- ``cfg dump`` module. It prints tt environment configuration. +- ``--use-docker`` option for ``tt pack`` for packing environments in docker container. - Support of MacOS. @@ -689,10 +684,10 @@ Additionally, several fixes were implemented to improve stability. ### Added -- `tt restart` confirmation prompt. `-y` option is added to accept restart +- ``tt restart`` confirmation prompt. ``-y`` option is added to accept restart without prompting. -- `tt pack` will generate systemd unit for rpm and deb packages. -- `--use-docker` option for `tt install` to build Tarantool in +- ``tt pack`` will generate systemd unit for rpm and deb packages. +- ``--use-docker`` option for ``tt install`` to build Tarantool in Ubuntu 16.04 container. - Ability to use the `start/stop/restart/status/check` commands without arguments to interact with all instances of the environment simultaneously. @@ -703,16 +698,16 @@ Additionally, several fixes were implemented to improve stability. ### Changed -- `tt cartridge` sub-commands `create`, `build`, `pack` are removed. -- `remove` command is renamed to `uninstall`. -- Updated values in system `tarantool.yaml` for `bin_dir`, `inc_dir` - and `repo: distfiles`. +- ``tt cartridge`` sub-commands ``create``, ``build``, ``pack`` are removed. +- ``remove`` command is renamed to ``uninstall``. +- Updated values in system ``tarantool.yaml`` for ``bin_dir``, ``inc_dir`` + and ``repo: distfiles``. 
### Fixed -- Working of the `help` module with multi-level commands (commands with +- Working of the ``help`` module with multi-level commands (commands with several subcommands). -- Using the system `tarantool.yaml` when installing from the repository. +- Using the system ``tarantool.yaml`` when installing from the repository. ## [0.2.1] - 2022-11-24 @@ -725,16 +720,16 @@ Additionally, several fixes were implemented to improve stability. ### Added -- Module `tt init`, to create tt environment configuration file. -- Module `tt daemon`, to manage the `tt` daemon. +- Module ``tt init``, to create tt environment configuration file. +- Module ``tt daemon``, to manage the ``tt`` daemon. - Built-in application templates support. Cartridge application template is added. -- Using `default_cfg` from `.tarantoolctl` for `tarantool.yaml` - generation in `tt init`. +- Using ``default_cfg`` from ``.tarantoolctl`` for ``tarantool.yaml`` + generation in ``tt init``. ### Changed -- Modules `tt start`, `tt connect` and `tt catridge` now use relative +- Modules ``tt start``, ``tt connect`` and ``tt catridge`` now use relative paths for unix sockets. It allows to use socket paths longer than sun_path limit.(108/106 on linux/macOS) e.g foo/bar.sock -> ./bar.sock @@ -742,25 +737,25 @@ Additionally, several fixes were implemented to improve stability. ### Added -- Module `tt version`, to get information about the version of the CLI. -- Module `tt completion`, to generate autocompletion for a specified shell. -- Module `tt help`, to get information about the CLI and its modules. -- Module `tt start`, responsible for launching the instance according to the +- Module ``tt version``, to get information about the version of the CLI. +- Module ``tt completion``, to generate autocompletion for a specified shell. +- Module ``tt help``, to get information about the CLI and its modules. +- Module ``tt start``, responsible for launching the instance according to the application file. 
-- Module `tt stop`, responsible for terminating the instance. -- Module `tt status`, to get information about the state of the instance. -- Module `tt restart`, responsible for restarting of the instance. -- Module `tt logrotate`, to rotate instance logs. -- Module `tt check`, to check an application file for syntax errors. -- Module `tt connect`, used to connect to a running instance. -- Module `tt rocks`, LuaRocks package manager. -- Module `tt cat`, to print into stdout the contents of .snap/.xlog files. -- Module `tt play`, to play the contents of .snap/.xlog files to another +- Module ``tt stop``, responsible for terminating the instance. +- Module ``tt status``, to get information about the state of the instance. +- Module ``tt restart``, responsible for restarting of the instance. +- Module ``tt logrotate``, to rotate instance logs. +- Module ``tt check``, to check an application file for syntax errors. +- Module ``tt connect``, used to connect to a running instance. +- Module ``tt rocks``, LuaRocks package manager. +- Module ``tt cat``, to print into stdout the contents of .snap/.xlog files. +- Module ``tt play``, to play the contents of .snap/.xlog files to another Tarantool instance. -- Module `tt coredump`, to pack/unpack/inspect tarantool coredump. -- Module `tt run`, to start tarantool instance using tt wrapper. -- Module `tt search`, to show available tt/tarantool versions. -- Module `tt create`, to create an application from a template. -- Module `tt build`, to build an application. -- Module `tt install`, to install tarantool/tt. -- Module `tt remove`, to remove tarantool/tt. +- Module ``tt coredump``, to pack/unpack/inspect tarantool coredump. +- Module ``tt run``, to start tarantool instance using tt wrapper. +- Module ``tt search``, to show available tt/tarantool versions. +- Module ``tt create``, to create an application from a template. +- Module ``tt build``, to build an application. +- Module ``tt install``, to install tarantool/tt. 
+- Module ``tt remove``, to remove tarantool/tt. \ No newline at end of file diff --git a/cli/cmd/tcm.go b/cli/cmd/tcm.go index 60197063e..fc0d7a5a2 100644 --- a/cli/cmd/tcm.go +++ b/cli/cmd/tcm.go @@ -13,18 +13,16 @@ import ( "github.com/jedib0t/go-pretty/v6/text" "github.com/spf13/cobra" "github.com/tarantool/tt/cli/cmdcontext" - "github.com/tarantool/tt/cli/modules" "github.com/tarantool/tt/cli/process_utils" tcmCmd "github.com/tarantool/tt/cli/tcm" - "github.com/tarantool/tt/cli/util" libwatchdog "github.com/tarantool/tt/lib/watchdog" ) var tcmCtx = tcmCmd.TcmCtx{} const ( - tcmPidFile = "tcmPidFile.pid" - watchdogPidFile = "watchdogPidFile.pid" + tcmPidFile = "tcm.pid" + watchdogPidFile = "watchdog.pid" ) func newTcmStartCmd() *cobra.Command { @@ -34,11 +32,7 @@ func newTcmStartCmd() *cobra.Command { Long: `Start to the tcm. tt tcm start --watchdog tt tcm start --path`, - Run: func(cmd *cobra.Command, args []string) { - cmdCtx.CommandName = cmd.Name() - err := modules.RunCmd(&cmdCtx, cmd.CommandPath(), &modulesInfo, internalStartTcm, args) - util.HandleCmdErr(cmd, err) - }, + Run: RunModuleFunc(internalStartTcm), } tcmCmd.Flags().StringVar(&tcmCtx.Executable, "path", "", "the path to the tcm binary file") tcmCmd.Flags().BoolVar(&tcmCtx.Watchdog, "watchdog", false, "enables the watchdog") @@ -52,11 +46,7 @@ func newTcmStatusCmd() *cobra.Command { Short: "Status tcm application", Long: `Status to the tcm. tt tcm status`, - Run: func(cmd *cobra.Command, args []string) { - cmdCtx.CommandName = cmd.Name() - err := modules.RunCmd(&cmdCtx, cmd.CommandPath(), &modulesInfo, internalTcmStatus, args) - util.HandleCmdErr(cmd, err) - }, + Run: RunModuleFunc(internalTcmStatus), } return tcmCmd } @@ -66,11 +56,7 @@ func newTcmStopCmd() *cobra.Command { Use: "stop", Short: "Stop tcm application", Long: `Stop to the tcm. 
tt tcm stop`, - Run: func(cmd *cobra.Command, args []string) { - cmdCtx.CommandName = cmd.Name() - err := modules.RunCmd(&cmdCtx, cmd.CommandPath(), &modulesInfo, internalTcmStop, args) - util.HandleCmdErr(cmd, err) - }, + Run: RunModuleFunc(internalTcmStop), } return tcmCmd } @@ -178,13 +164,13 @@ func internalTcmStop(cmdCtx *cmdcontext.CmdCtx, args []string) error { if err != nil { return err } - log.Println("Watchdog and TCM stoped") + log.Println("Watchdog and TCM stopped") } else { _, err := process_utils.StopProcess(tcmPidFile) if err != nil { return err } - log.Println("TCM stoped") + log.Println("TCM stopped") } return nil diff --git a/cli/process_utils/process_utils.go b/cli/process_utils/process_utils.go index 75a0ae7d1..96e1ac170 100644 --- a/cli/process_utils/process_utils.go +++ b/cli/process_utils/process_utils.go @@ -112,14 +112,14 @@ func ExistsAndRecord(pidFileName string) (bool, error) { // The PID file already exists. We have to check if the process is alive. pid, err := GetPIDFromFile(pidFileName) if err != nil { - return false, fmt.Errorf(`pID file exists, but PID can't be read. Error: "%v"`, err) + return false, fmt.Errorf(`PID file exists, but PID can't be read. Error: "%v"`, err) } if res, _ := IsProcessAlive(pid); res { return true, nil } } else if !os.IsNotExist(err) { - return false, fmt.Errorf(`something went wrong while trying to read the PID file. Error: "%v"`, - err) + return false, fmt.Errorf(`something went wrong while trying to read the`+ + `PID file. 
Error: "%v"`, err) } return false, nil diff --git a/cli/process_utils/process_utils_test.go b/cli/process_utils/process_utils_test.go new file mode 100644 index 000000000..198d3f83f --- /dev/null +++ b/cli/process_utils/process_utils_test.go @@ -0,0 +1,36 @@ +package process_utils + +import ( + "os" + "os/exec" + "testing" + + "github.com/stretchr/testify/require" +) + +func Test_ExistsAndRecord(t *testing.T) { + testFile := "test.pid" + invalid := "invalid.pid" + cmd := exec.Command("sleep", "10") + + t.Cleanup(func() { + os.Remove(testFile) + }) + + err := cmd.Start() + require.NoError(t, err) + + err = CreatePIDFile(testFile, cmd.Process.Pid) + require.NoError(t, err) + + status, err := ExistsAndRecord(testFile) + require.NoError(t, err) + require.True(t, status) + + err = cmd.Process.Kill() + require.NoError(t, err) + + statusInvalid, err := ExistsAndRecord(invalid) + require.False(t, statusInvalid) + require.NoError(t, err) +} diff --git a/cli/tcm/tcm.go b/cli/tcm/tcm.go index 67417fdf5..133dbbb9d 100644 --- a/cli/tcm/tcm.go +++ b/cli/tcm/tcm.go @@ -1,9 +1,13 @@ package tcm +// TcmCtx holds parameters and state for managing the TCM process and its watchdog. type TcmCtx struct { + // Path to the TCM executable file. Executable string + // Path to the file storing the TCM process PID. TcmPidFile string - - Watchdog bool + // Flag indicating whether the watchdog is enabled. + Watchdog bool + // Path to the file storing the watchdog process PID. WathdogPidFile string } diff --git a/cli/tcm/watchdog.go b/cli/tcm/watchdog.go deleted file mode 100644 index dd4c7f453..000000000 --- a/cli/tcm/watchdog.go +++ /dev/null @@ -1,158 +0,0 @@ -package tcm - -import ( - "context" - "errors" - "fmt" - "log" - "os" - "os/exec" - "os/signal" - "path/filepath" - "sync" - "syscall" - "time" -) - -// Watchdog manages the lifecycle of a process. -type Watchdog struct { - // The command to execute and monitor. - cmd *exec.Cmd - // Time to wait before restarting the process. 
- restartTimeout time.Duration - // Flag to indicate if the Watchdog should stop. - shouldStop bool - // Mutex to protect access to shouldStop. - stopMutex sync.Mutex - // WaitGroup to wait for all goroutines to finish. - doneBarrier sync.WaitGroup - // File to store the process PID. - pidFile string -} - -// NewWatchdog creates a new Watchdog instance. -func NewWatchdog(restartTimeout time.Duration) (*Watchdog, error) { - return &Watchdog{ - restartTimeout: restartTimeout, - pidFile: "tcm/pidFile.pid", - }, nil -} - -// Start starts the process and monitors its execution. -func (wd *Watchdog) Start(bin string, args ...string) error { - wd.doneBarrier.Add(1) - defer wd.doneBarrier.Done() - - signalCtx, signalCancel := context.WithCancel(context.Background()) - defer signalCancel() - - go wd.handleSignals(signalCtx, signalCancel) - - for { - wd.stopMutex.Lock() - if wd.shouldStop { - wd.stopMutex.Unlock() - return nil - } - wd.stopMutex.Unlock() - - wd.cmd = exec.Command(bin, args...) - wd.cmd.Stdout = os.Stdout - wd.cmd.Stderr = os.Stderr - - log.Println("(INFO): Starting process...") - if err := wd.cmd.Start(); err != nil { - log.Printf("(ERROR): Failed to start process: %v\n", err) - return err - } - - if err := wd.writePIDToFile(); err != nil { - log.Printf("(ERROR): Failed to write PID to file: %v\n", err) - return err - } - - err := wd.cmd.Wait() - if err != nil { - var exitErr *exec.ExitError - if errors.As(err, &exitErr) { - log.Printf("(WARN): Process exited with error: %v\n", exitErr) - } else { - log.Printf("(ERROR): Process failed: %v\n", err) - return err - } - } else { - log.Println("(INFO): Process completed successfully.") - } - - wd.stopMutex.Lock() - if wd.shouldStop { - wd.stopMutex.Unlock() - return nil - } - wd.stopMutex.Unlock() - - log.Printf("(INFO): Waiting for %s before restart...\n", wd.restartTimeout) - time.Sleep(wd.restartTimeout) - } -} - -// Stop stops the process and shuts down the Watchdog. 
-func (wd *Watchdog) Stop() { - wd.stopMutex.Lock() - wd.shouldStop = true - if wd.cmd != nil && wd.cmd.Process != nil { - log.Println("(INFO): Stopping process...") - if err := wd.cmd.Process.Signal(syscall.SIGTERM); err != nil { - log.Printf("(ERROR): Failed to stop process: %v\n", err) - } - } - wd.stopMutex.Unlock() - - wd.doneBarrier.Wait() - os.RemoveAll(filepath.Dir(wd.pidFile)) - log.Println("(INFO): Watchdog stopped.") -} - -// handleSignals listens for OS signals and stops the Watchdog gracefully. -func (wd *Watchdog) handleSignals(ctx context.Context, cancel context.CancelFunc) { - signalChan := make(chan os.Signal, 1) - signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM) - - select { - case <-signalChan: - log.Println("(INFO): Received stop signal.") - wd.Stop() - cancel() - case <-ctx.Done(): - return - } -} - -// writePIDToFile writes the PID of the process to a file. -func (wd *Watchdog) writePIDToFile() error { - if wd.cmd == nil || wd.cmd.Process == nil { - return errors.New("process is not running") - } - - pid := wd.cmd.Process.Pid - pidData := fmt.Sprintf("%d", pid) - - dir := filepath.Dir(wd.pidFile) - if err := os.MkdirAll(dir, os.ModePerm); err != nil { - return err - } - - file, err := os.Create(wd.pidFile) - if err != nil { - return fmt.Errorf("failed to create PID file: %v", err) - } - defer file.Close() - - _, err = file.WriteString(pidData) - if err != nil { - return err - } - - log.Printf("(INFO): PID %d written to %s\n", pid, wd.pidFile) - return nil -} diff --git a/cli/tcm/watchdog_test.go b/cli/tcm/watchdog_test.go deleted file mode 100644 index 83408ba28..000000000 --- a/cli/tcm/watchdog_test.go +++ /dev/null @@ -1,69 +0,0 @@ -package tcm - -import ( - "fmt" - "os" - "os/exec" - "testing" - "time" - - "github.com/stretchr/testify/require" -) - -func TestWatchdogStartProcess(t *testing.T) { - watchdog, err := NewWatchdog(1 * time.Second) - require.NoError(t, err) - - go func() { - watchdog.Start("sleep", "5") - 
require.NoError(t, err) - }() - - time.Sleep(2 * time.Second) - - _, err = os.Stat(watchdog.pidFile) - require.NoError(t, err) - - watchdog.Stop() -} - -func TestWatchdogRestartProcess(t *testing.T) { - watchdog, err := NewWatchdog(1 * time.Second) - require.NoError(t, err) - - go func() { - err := watchdog.Start("sleep", "1") - require.NoError(t, err) - }() - - time.Sleep(3 * time.Second) - - _, err = os.Stat(watchdog.pidFile) - require.NoError(t, err) - - watchdog.Stop() -} - -func TestWritePIDToFile(t *testing.T) { - pidFile := "/tmp/watchdog_test.pid" - defer os.Remove(pidFile) - - cmd := exec.Command("sleep", "1") - err := cmd.Start() - require.NoError(t, err) - defer cmd.Process.Kill() - - watchdog := &Watchdog{ - cmd: cmd, - pidFile: pidFile, - } - - err = watchdog.writePIDToFile() - require.NoError(t, err) - - pidData, err := os.ReadFile(pidFile) - require.NoError(t, err) - - expectedPID := fmt.Sprintf("%d", cmd.Process.Pid) - require.Equal(t, expectedPID, string(pidData)) -} diff --git a/lib/watchdog/watchdog.go b/lib/watchdog/watchdog.go index 44a1bc225..4409977f7 100644 --- a/lib/watchdog/watchdog.go +++ b/lib/watchdog/watchdog.go @@ -15,18 +15,31 @@ import ( "github.com/tarantool/tt/cli/process_utils" ) +// Watchdog manages a child process, ensuring reliable startup, automatic restarts on failure, +// and graceful shutdown. It handles system signals, maintains PID file consistency, +// and provides thread-safe operations for concurrent process management. type Watchdog struct { - cmd *exec.Cmd + // cmd is the child process command (protected by cmdMutex). + cmd *exec.Cmd + // restartTimeout defines delay before restart (0 = immediate). restartTimeout time.Duration - shouldStop atomic.Bool - doneBarrier sync.WaitGroup - pidFile string - wdPidFile string - - cmdMutex sync.Mutex - pidFileMutex sync.Mutex - signalChan chan os.Signal + // shouldStop is atomic flag to prevent restarts when true. 
+ shouldStop atomic.Bool + // doneBarrier waits for goroutines during shutdown. + doneBarrier sync.WaitGroup + // pidFile stores child process PID (protected by pidFileMutex). + pidFile string + // wdPidFile stores watchdog's own PID. + wdPidFile string + // cmdMutex guards cmd operations. + cmdMutex sync.Mutex + // pidFileMutex protects PID file access. + pidFileMutex sync.Mutex + // signalChan receives termination signals. + signalChan chan os.Signal + // processGroupPID stores Process Group ID for cleanup. processGroupPID atomic.Int32 + // startupComplete signals successful child process start. startupComplete chan struct{} } @@ -47,25 +60,25 @@ func NewWatchdog(pidFile, wdPidFile string, restartTimeout time.Duration) *Watch // Start begins monitoring and managing the target process. // It handles process execution, restart logic, and signal processing. func (wd *Watchdog) Start(bin string, args ...string) error { - // Add to wait group to track active goroutines + // Add to wait group to track active goroutines. wd.doneBarrier.Add(1) - // Ensure we decrement wait group when done + // Ensure we decrement wait group when done. defer wd.doneBarrier.Done() - // Create context for graceful shutdown + // Create context for graceful shutdown. ctx, cancel := context.WithCancel(context.Background()) - defer cancel() // Ensure context is canceled when we exit + defer cancel() // Ensure context is canceled when we exit. - // Register signal handler for termination signals + // Register signal handler for termination signals. signal.Notify(wd.signalChan, syscall.SIGINT, syscall.SIGTERM) - // Clean up signal handlers when done + // Clean up signal handlers when done. defer signal.Stop(wd.signalChan) - // Signal handling goroutine + // Signal handling goroutine. go func() { select { case sig := <-wd.signalChan: - // Only process signal if not already stopping + // Only process signal if not already stopping. 
if !wd.shouldStop.Load() { log.Printf("(INFO): Received signal: %v", sig) wd.Stop() @@ -74,20 +87,20 @@ func (wd *Watchdog) Start(bin string, args ...string) error { } }() - // Main process management loop + // Main process management loop. for { - // Check if we should stop before each iteration + // Check if we should stop before each iteration. if wd.shouldStop.Load() { return nil } - // Start the managed process + // Start the managed process. wd.cmdMutex.Lock() wd.cmd = exec.Command(bin, args...) - // Create new process group for proper signal handling + // Create new process group for proper signal handling. wd.cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} - // Start the process + // Start the process. if err := wd.cmd.Start(); err != nil { wd.cmdMutex.Unlock() log.Printf("(ERROR): Failed to start process: %v", err) @@ -101,25 +114,25 @@ func (wd *Watchdog) Start(bin string, args ...string) error { // Write PID files after successful start if err := wd.writePIDFiles(); err != nil { log.Printf("(ERROR): Failed to write PID files: %v", err) - _ = wd.terminateProcess() // Clean up if PID files fail + _ = wd.terminateProcess() // Clean up if PID files fail. return err } log.Println("(INFO): Process started successfully") - close(wd.startupComplete) // Signal that startup is complete + close(wd.startupComplete) // Signal that startup is complete. - // Wait for process completion in separate goroutine + // Wait for process completion in separate goroutine. waitChan := make(chan error, 1) go func() { waitChan <- wd.cmd.Wait() }() select { case err := <-waitChan: - // Check for stop signal after process exits + // Check for stop signal after process exits. if wd.shouldStop.Load() { return nil } - // Handle process exit status + // Handle process exit status. 
if err != nil { if errors.As(err, new(*exec.ExitError)) { log.Printf("(WARN): Process exited with error: %v", err) @@ -132,12 +145,12 @@ func (wd *Watchdog) Start(bin string, args ...string) error { } case <-ctx.Done(): - // Context canceled - terminate process + // Context canceled - terminate process. _ = wd.terminateProcess() return nil } - // Check stop condition again before restart + // Check stop condition again before restart. if wd.shouldStop.Load() { return nil } @@ -146,13 +159,13 @@ func (wd *Watchdog) Start(bin string, args ...string) error { log.Printf("(INFO): Waiting %s before restart...", wd.restartTimeout) select { case <-time.After(wd.restartTimeout): - // Continue to next iteration after timeout + // Continue to next iteration after timeout. case <-ctx.Done(): - // Exit if context canceled during wait + // Exit if context canceled during wait. return nil } - // Reset startup complete channel for next iteration + // Reset startup complete channel for next iteration. wd.startupComplete = make(chan struct{}) } } @@ -163,32 +176,32 @@ func (wd *Watchdog) Stop() { // Atomically set shouldStop flag to prevent multiple concurrent stops // CompareAndSwap ensures only one goroutine can execute the stop sequence if !wd.shouldStop.CompareAndSwap(false, true) { - return // Already stopping or stopped + return // Already stopping or stopped. } - // Ensure process startup is complete before attempting to stop - // This prevents races during process initialization + // Ensure process startup is complete before attempting to stop. + // This prevents races during process initialization. select { case <-wd.startupComplete: - // Normal case - startup already completed + // Normal case - startup already completed. default: - // Startup still in progress - wait for completion + // Startup still in progress - wait for completion. 
log.Println("(INFO): Waiting for process startup...") <-wd.startupComplete } - // Terminate the managed process + // Terminate the managed process. _ = wd.terminateProcess() - // Clean up signal handling + // Clean up signal handling. signal.Stop(wd.signalChan) close(wd.signalChan) - // Wait for all goroutines to complete - // This ensures we don't exit while signal handlers are still running + // Wait for all goroutines to complete. + // This ensures we don't exit while signal handlers are still running. wd.doneBarrier.Wait() - // Final log message indicating successful shutdown + // Final log message indicating successful shutdown. log.Println("(INFO): Watchdog stopped.") } @@ -205,7 +218,7 @@ func (wd *Watchdog) terminateProcess() error { pgid := int(wd.processGroupPID.Load()) - // Send SIGTERM to entire process group if available (preferred method) + // Send SIGTERM to entire process group if available (preferred method). if pgid > 0 { return syscall.Kill(-pgid, syscall.SIGTERM) } diff --git a/lib/watchdog/watchdog_test.go b/lib/watchdog/watchdog_test.go index ec6be41f5..823a8cbc9 100644 --- a/lib/watchdog/watchdog_test.go +++ b/lib/watchdog/watchdog_test.go @@ -7,31 +7,91 @@ import ( "syscall" "testing" "time" + + "github.com/stretchr/testify/require" ) -// TestWatchdog_StartStop tests that the watchdog starts a process, creates a PID -// file, and stops the process when asked to. 
-func TestWatchdog_StartStop(t *testing.T) { - pidFile := filepath.Join(t.TempDir(), "test.pid") - wdPidFile := filepath.Join(t.TempDir(), "watchdog.pid") +func cleanupPidFiles() { + os.Remove("test.pid") + os.Remove("wd.pid") +} - wd := NewWatchdog(pidFile, wdPidFile, 100*time.Millisecond) +func verifyProcessRunning(t *testing.T, wd *Watchdog) { + wd.cmdMutex.Lock() + defer wd.cmdMutex.Unlock() - err := wd.Start("sleep", "1") - if err != nil { - t.Fatalf("Start failed: %v", err) + if wd.cmd == nil || wd.cmd.Process == nil { + t.Fatal("process should be running") } +} - time.Sleep(100 * time.Millisecond) +func verifyNoErrors(t *testing.T, errChan chan error) { + select { + case err := <-errChan: + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + case <-time.After(500 * time.Millisecond): + t.Fatal("timeout waiting for Start to return") + } +} + +func TestWatchdog_Successful(t *testing.T) { + wd := NewWatchdog("test.pid", "wd.pid", 100*time.Millisecond) + t.Cleanup(cleanupPidFiles) + + cmd := exec.Command("sleep", "1") + errChan := make(chan error, 1) + go func() { errChan <- wd.Start(cmd.Path, cmd.Args[1:]...) }() + + // Wait for process to start + time.Sleep(200 * time.Millisecond) + + // Verify process is running + verifyProcessRunning(t, wd) + + // Stop the watchdog wd.Stop() + verifyNoErrors(t, errChan) +} - if _, err := os.Stat(pidFile); os.IsNotExist(err) { - t.Error("PID file not created") - } - if _, err := os.Stat(wdPidFile); os.IsNotExist(err) { - t.Error("Watchdog PID file not created") +func TestWatchdog_EarlyTermination(t *testing.T) { + wd := NewWatchdog("test.pid", "wd.pid", time.Second) + t.Cleanup(cleanupPidFiles) + + cmd := exec.Command("sleep", "10") + errChan := make(chan error, 1) + + go func() { errChan <- wd.Start(cmd.Path, cmd.Args[1:]...) 
}() + + // Wait for process to start + time.Sleep(200 * time.Millisecond) + + // Stop while process is running + wd.Stop() + verifyNoErrors(t, errChan) +} + +func TestWatchdog_ProcessRestart(t *testing.T) { + wd := NewWatchdog("test.pid", "wd.pid", 100*time.Millisecond) + t.Cleanup(cleanupPidFiles) + + cmd := exec.Command("false") + errChan := make(chan error, 1) + + go func() { errChan <- wd.Start(cmd.Path, cmd.Args[1:]...) }() + + // Wait for at least one restart + time.Sleep(300 * time.Millisecond) + + // Should still be running (restarting) + if wd.shouldStop.Load() { + t.Fatal("watchdog should not be stopped") } + + wd.Stop() + verifyNoErrors(t, errChan) } // TestWatchdog_SignalHandling tests that the watchdog can handle system signals. @@ -45,9 +105,7 @@ func TestWatchdog_SignalHandling(t *testing.T) { go func() { err := wd.Start("sleep", "10") - if err != nil { - t.Logf("Start exited with: %v", err) - } + require.NoError(t, err) }() time.Sleep(100 * time.Millisecond) @@ -61,34 +119,6 @@ func TestWatchdog_SignalHandling(t *testing.T) { } } -// TestWatchdog_TerminateProcess verifies that the watchdog's terminateProcess -// function successfully kills the monitored process and its process group. 
-func TestWatchdog_TerminateProcess(t *testing.T) { - wd := &Watchdog{ - pidFile: filepath.Join(t.TempDir(), "test.pid"), - wdPidFile: filepath.Join(t.TempDir(), "watchdog.pid"), - restartTimeout: time.Second, - } - - cmd := exec.Command("sleep", "10") - if err := cmd.Start(); err != nil { - t.Fatalf("Failed to start test process: %v", err) - } - defer cmd.Process.Kill() - - wd.cmd = cmd - wd.processGroupPID.Store(int32(cmd.Process.Pid)) - - if err := wd.terminateProcess(); err != nil { - t.Errorf("terminateProcess failed: %v", err) - } - - _, err := cmd.Process.Wait() - if err == nil { - t.Error("Process was not terminated") - } -} - // TestWatchdog_WritePIDFiles verifies that the Watchdog's writePIDFiles // method successfully creates the expected PID files for both the monitored // process and the watchdog itself. It starts a test process, assigns it to @@ -104,21 +134,20 @@ func TestWatchdog_WritePIDFiles(t *testing.T) { } cmd := exec.Command("sleep", "1") - if err := cmd.Start(); err != nil { - t.Fatalf("Failed to start test process: %v", err) - } + err := cmd.Start() + require.NoError(t, err) + defer cmd.Process.Kill() wd.cmd = cmd - if err := wd.writePIDFiles(); err != nil { - t.Errorf("writePIDFiles failed: %v", err) - } + err = wd.writePIDFiles() + require.NoError(t, err) + + _, err = os.Stat(pidFile) + require.NoError(t, err) + + _, err = os.Stat(wdPidFile) + require.NoError(t, err) - if _, err := os.Stat(pidFile); os.IsNotExist(err) { - t.Error("Process PID file not created") - } - if _, err := os.Stat(wdPidFile); os.IsNotExist(err) { - t.Error("Watchdog PID file not created") - } } diff --git a/test/integration/tcm/test_tcm.py b/test/integration/tcm/test_tcm.py index d7d89b3d4..898f8763b 100644 --- a/test/integration/tcm/test_tcm.py +++ b/test/integration/tcm/test_tcm.py @@ -1,12 +1,8 @@ -<<<<<<< HEAD -from subprocess import PIPE, Popen -======= import os from subprocess import PIPE, STDOUT, Popen, run ->>>>>>> d913784 (tcm: add the tt tcm status 
command) -from utils import skip_if_tarantool_ce, wait_for_lines_in_output +from utils import skip_if_tcm_not_supported, wait_for_lines_in_output TcmStartCommand = ("tcm", "start") TcmStartWatchdogCommand = ("tcm", "start", "--watchdog") @@ -15,7 +11,7 @@ def test_tcm_start_success(tt_cmd, tmp_path): - skip_if_tarantool_ce() + skip_if_tcm_not_supported() start_cmd = [tt_cmd, *TcmStartCommand] print(f"Run: {start_cmd}") @@ -66,12 +62,12 @@ def test_tcm_start_success(tt_cmd, tmp_path): output = wait_for_lines_in_output(stop.stdout, ["TCM"]) - assert "TCM stoped" in output.strip() + assert "TCM stopped" in output.strip() assert tcm.poll() is not None def test_tcm_start_with_watchdog_success(tt_cmd, tmp_path): - skip_if_tarantool_ce() + skip_if_tcm_not_supported() cmd = [str(tt_cmd), *TcmStartWatchdogCommand] print(f"Run: {' '.join(cmd)}") @@ -111,8 +107,6 @@ def test_tcm_start_with_watchdog_success(tt_cmd, tmp_path): assert tcm.pid is not None assert tcm.poll() is not None - skip_if_tarantool_ce() - start_cmd = [tt_cmd, *TcmStartCommand] print(f"Run: {start_cmd}") diff --git a/test/utils.py b/test/utils.py index e7ec46305..6286f5501 100644 --- a/test/utils.py +++ b/test/utils.py @@ -581,6 +581,14 @@ def is_cluster_app_supported(): return major >= 3 +def skip_if_tcm_not_supported(): + if not is_tarantool_ee(): + pytest.skip("Tarantool Enterprise required") + + if is_tarantool_less_3(): + pytest.skip("TCM not supported") + + def is_tuple_format_supported(): major, minor = get_tarantool_version() return major > 3 or (major == 3 and minor >= 2)