diff --git a/README.md b/README.md index 34285f5..86f79a7 100644 --- a/README.md +++ b/README.md @@ -94,4 +94,4 @@ We can use similar way to set up and call other vcluster-ops commands. ## Licensing -vcluster is open source code and is under the Apache 2.0 license. Please see `LICENSE` for details. \ No newline at end of file +vcluster is open source and is under the Apache 2.0 license. Please see `LICENSE` for details. diff --git a/commands/cluster_command_launcher.go b/commands/cluster_command_launcher.go index 23742a9..9cb8f2a 100644 --- a/commands/cluster_command_launcher.go +++ b/commands/cluster_command_launcher.go @@ -59,6 +59,8 @@ const ( dataPathKey = "dataPath" communalStorageLocationFlag = "communal-storage-location" communalStorageLocationKey = "communalStorageLocation" + archiveNameFlag = "archive-name" + archiveNameKey = "archiveName" ipv6Flag = "ipv6" ipv6Key = "ipv6" eonModeFlag = "eon-mode" @@ -156,6 +158,7 @@ var flagKeyMap = map[string]string{ verboseFlag: verboseKey, outputFileFlag: outputFileKey, sandboxFlag: sandboxKey, + archiveNameFlag: archiveNameKey, targetDBNameFlag: targetDBNameKey, targetHostsFlag: targetHostsKey, targetUserNameFlag: targetUserNameKey, @@ -213,8 +216,9 @@ const ( showRestorePointsSubCmd = "show_restore_points" installPkgSubCmd = "install_packages" // hidden Cmds (for internal testing only) - getDrainingStatusSubCmd = "get_draining_status" promoteSandboxSubCmd = "promote_sandbox" + saveRestorePointsSubCmd = "save_restore_point" + getDrainingStatusSubCmd = "get_draining_status" ) // cmdGlobals holds global variables shared by multiple @@ -580,6 +584,7 @@ func constructCmds() []*cobra.Command { // hidden cmds (for internal testing only) makeCmdGetDrainingStatus(), makeCmdPromoteSandbox(), + makeCmdSaveRestorePoint(), } } diff --git a/commands/cmd_create_db.go b/commands/cmd_create_db.go index a7164a5..a869996 100644 --- a/commands/cmd_create_db.go +++ b/commands/cmd_create_db.go @@ -275,7 +275,6 @@ func (c *CmdCreateDB) Run(vcc vclusterops.ClusterCommands) error { vcc.V(1).Info("Called method Run()") vdb, createError := vcc.VCreateDatabase(c.createDBOptions) if createError != nil { - vcc.LogError(createError, "Failed to create the database.") return createError } diff --git a/commands/cmd_save_restore_point.go b/commands/cmd_save_restore_point.go new file mode 100644 index 0000000..c3c6fc7 --- /dev/null +++ b/commands/cmd_save_restore_point.go @@ -0,0 +1,161 @@ +/* + (c) Copyright [2023-2024] Open Text. + Licensed under the Apache License, Version 2.0 (the "License"); + You may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package commands + +import ( + "github.com/spf13/cobra" + "github.com/spf13/viper" + "github.com/vertica/vcluster/vclusterops" + "github.com/vertica/vcluster/vclusterops/vlog" +) + +/* CmdSaveRestorePoint + * + * Parses arguments to save-restore-points and calls + * the high-level function for save-restore-points. 
+ * + * Implements ClusterCommand interface + */ + +type CmdSaveRestorePoint struct { + CmdBase + saveRestoreOptions *vclusterops.VSaveRestorePointOptions +} + +func makeCmdSaveRestorePoint() *cobra.Command { + newCmd := &CmdSaveRestorePoint{} + opt := vclusterops.VSaveRestorePointFactory() + newCmd.saveRestoreOptions = &opt + + cmd := makeBasicCobraCmd( + newCmd, + saveRestorePointsSubCmd, + "Save a restore point in a given archive.", + `Save a restore point in a given archive. + +Examples: + # Save restore point in a given archive with user input + vcluster save_restore_point --db-name test_db \ + --archive-name ARCHIVE_ONE + + # Save restore point for a sandbox + vcluster save_restore_point --db-name test_db \ + --archive-name ARCHIVE_ONE --sandbox SANDBOX_ONE + +`, + []string{dbNameFlag, hostsFlag, passwordFlag, + ipv6Flag, configFlag, eonModeFlag}, + ) + + // local flags + newCmd.setLocalFlags(cmd) + + // require db-name and archive-name + markFlagsRequired(cmd, dbNameFlag, archiveNameFlag) + + // hide this subcommand + cmd.Hidden = true + + return cmd +} + +// setLocalFlags will set the local flags the command has +func (c *CmdSaveRestorePoint) setLocalFlags(cmd *cobra.Command) { + cmd.Flags().StringVar( + &c.saveRestoreOptions.ArchiveName, + archiveNameFlag, + "", + "Collection of restore points that belong to a certain archive.", + ) + cmd.Flags().StringVar( + &c.saveRestoreOptions.Sandbox, + sandboxFlag, + "", + "The name of target sandbox", + ) +} + +func (c *CmdSaveRestorePoint) Parse(inputArgv []string, logger vlog.Printer) error { + c.argv = inputArgv + logger.LogArgParse(&c.argv) + + // for some options, we do not want to use their default values, + // if they are not provided in cli, + // reset the value of those options to nil + c.ResetUserInputOptions(&c.saveRestoreOptions.DatabaseOptions) + + // save_restore_point only works for an Eon db so we assume the user always runs this subcommand + // on an Eon db. When Eon mode cannot be found in config file, we set its value to true. 
+ if !viper.IsSet(eonModeKey) { + c.saveRestoreOptions.IsEon = true + } + + return c.validateParse(logger) +} + +// all validations of the arguments should go in here +func (c *CmdSaveRestorePoint) validateParse(logger vlog.Printer) error { + logger.Info("Called validateParse()") + + err := c.ValidateParseBaseOptions(&c.saveRestoreOptions.DatabaseOptions) + if err != nil { + return err + } + + if !c.usePassword() { + err = c.getCertFilesFromCertPaths(&c.saveRestoreOptions.DatabaseOptions) + if err != nil { + return err + } + } + + err = c.setConfigParam(&c.saveRestoreOptions.DatabaseOptions) + if err != nil { + return err + } + + err = c.setDBPassword(&c.saveRestoreOptions.DatabaseOptions) + if err != nil { + return err + } + + return nil +} + +func (c *CmdSaveRestorePoint) Analyze(logger vlog.Printer) error { + logger.Info("Called method Analyze()") + return nil +} + +func (c *CmdSaveRestorePoint) Run(vcc vclusterops.ClusterCommands) error { + vcc.LogInfo("Called method Run()") + + options := c.saveRestoreOptions + + err := vcc.VSaveRestorePoint(options) + if err != nil { + vcc.LogError(err, "failed to save restore points", "DBName", options.DBName) + return err + } + + vcc.DisplayInfo("Successfully saved restore points in database %s", options.DBName) + return nil +} + +// SetDatabaseOptions will assign a vclusterops.DatabaseOptions instance to the one in CmdSaveRestorePoint +func (c *CmdSaveRestorePoint) SetDatabaseOptions(opt *vclusterops.DatabaseOptions) { + c.saveRestoreOptions.DatabaseOptions = *opt +} diff --git a/vclusterops/cluster_op.go b/vclusterops/cluster_op.go index 8693902..9e0e449 100644 --- a/vclusterops/cluster_op.go +++ b/vclusterops/cluster_op.go @@ -72,10 +72,11 @@ const ( ) const ( - SuccessCode = 200 - MultipleChoiceCode = 300 - UnauthorizedCode = 401 - InternalErrorCode = 500 + SuccessCode = 200 + MultipleChoiceCode = 300 + UnauthorizedCode = 401 + PreconditionFailedCode = 412 + InternalErrorCode = 500 ) // hostHTTPResult is used to save result of an Adapter's sendRequest(...) function @@ -97,13 +98,17 @@ const respSuccStatusCode = 0 // The HTTP response with a 401 status code can have several scenarios: // 1. Wrong password // 2. Wrong certificate -// 3. The local node has not yet joined the cluster; the HTTP server will accept connections once the node joins the cluster. -// HTTPCheckDBRunningOp in create_db need to check all scenarios to see any HTTP running -// For HTTPSPollNodeStateOp in start_db, it requires only handling the first and second scenarios +// HTTPCheckDBRunningOp in create_db and HTTPSPollNodeStateOp in start_db need to handle these scenarios func (hostResult *hostHTTPResult) isUnauthorizedRequest() bool { return hostResult.statusCode == UnauthorizedCode } +// The HTTP response with a 412 may happen if +// the local node has not yet joined the cluster; the HTTP server will accept connections once the node joins the cluster. 
+func (hostResult *hostHTTPResult) hasPreconditionFailed() bool { + return hostResult.statusCode == PreconditionFailedCode +} + // isSuccess returns true if status code is 200 func (hostResult *hostHTTPResult) isSuccess() bool { return hostResult.statusCode == SuccessCode @@ -129,7 +134,8 @@ func (hostResult *hostHTTPResult) isInternalError() bool { } func (hostResult *hostHTTPResult) isHTTPRunning() bool { - if hostResult.isPassing() || hostResult.isUnauthorizedRequest() || hostResult.isInternalError() { + if hostResult.isPassing() || hostResult.isUnauthorizedRequest() || + hostResult.isInternalError() || hostResult.hasPreconditionFailed() { return true } return false @@ -561,6 +567,7 @@ type ClusterCommands interface { VSandbox(options *VSandboxOptions) error VScrutinize(options *VScrutinizeOptions) error VShowRestorePoints(options *VShowRestorePointsOptions) (restorePoints []RestorePoint, err error) + VSaveRestorePoint(options *VSaveRestorePointOptions) (err error) VStartDatabase(options *VStartDatabaseOptions) (vdbPtr *VCoordinationDatabase, err error) VStartNodes(options *VStartNodesOptions) error VStartSubcluster(startScOpt *VStartScOptions) error diff --git a/vclusterops/cmd_type.go b/vclusterops/cmd_type.go index 005c671..09265aa 100644 --- a/vclusterops/cmd_type.go +++ b/vclusterops/cmd_type.go @@ -20,6 +20,7 @@ const ( SandboxSCCmd UnsandboxSCCmd ShowRestorePointsCmd + SaveRestorePointsCmd InstallPackagesCmd ConfigRecoverCmd GetDrainingStatusCmd @@ -60,6 +61,7 @@ var cmdStringMap = map[CmdType]string{ SandboxSCCmd: "sandbox_subcluster", UnsandboxSCCmd: "unsandbox_subcluster", ShowRestorePointsCmd: "show_restore_points", + SaveRestorePointsCmd: "save_restore_point", InstallPackagesCmd: "install_packages", ConfigRecoverCmd: "manage_config_recover", GetDrainingStatusCmd: "get_draining_status", diff --git a/vclusterops/create_db.go b/vclusterops/create_db.go index 03d4992..57ae8ee 100644 --- a/vclusterops/create_db.go +++ b/vclusterops/create_db.go @@ -295,6 +295,7 @@ func (vcc VClusterCommands) VCreateDatabase(options *VCreateDatabaseOptions) (VC vdb := makeVCoordinationDatabase() err := vdb.setFromCreateDBOptions(options, vcc.Log) if err != nil { + vcc.Log.Error(err, "fail to create database") return vdb, err } // produce instructions diff --git a/vclusterops/fetch_node_state.go b/vclusterops/fetch_node_state.go index 45d9646..e2cbfd7 100644 --- a/vclusterops/fetch_node_state.go +++ b/vclusterops/fetch_node_state.go @@ -72,7 +72,7 @@ func (vcc VClusterCommands) VFetchNodeState(options *VFetchNodeStateOptions) ([] // this vdb is used to fetch node version var vdb VCoordinationDatabase - err = vcc.getVDBFromRunningDBIncludeSandbox(&vdb, &options.DatabaseOptions, util.MainClusterSandbox) + err = vcc.getVDBFromMainRunningDBContainsSandbox(&vdb, &options.DatabaseOptions) if err != nil { vcc.Log.PrintInfo("Error from vdb build: %s", err.Error()) @@ -91,7 +91,13 @@ func (vcc VClusterCommands) VFetchNodeState(options *VFetchNodeStateOptions) ([] return vcc.fetchNodeStateFromDownDB(options) } - // produce list_all_nodes instructions + nodeStates := buildNodeStateList(&vdb, false /*forDownDatabase*/) + // return the result if no need to get version info + if !options.GetVersion { + return nodeStates, nil + } + + // produce instructions to fill node information instructions, err := vcc.produceListAllNodesInstructions(options, &vdb) if err != nil { return nil, fmt.Errorf("fail to produce instructions, %w", err) @@ -102,7 +108,6 @@ func (vcc VClusterCommands) VFetchNodeState(options 
*VFetchNodeStateOptions) ([] // give the instructions to the VClusterOpEngine to run runError := clusterOpEngine.run(vcc.Log) - nodeStates := clusterOpEngine.execContext.nodesInfo if runError == nil { // fill node version for i, nodeInfo := range nodeStates { @@ -116,34 +121,9 @@ func (vcc VClusterCommands) VFetchNodeState(options *VFetchNodeStateOptions) ([] nodeInfo.Address) } } - - return nodeStates, nil - } - - // error out in case of wrong certificate or password - if len(clusterOpEngine.execContext.hostsWithWrongAuth) > 0 { - return nodeStates, - fmt.Errorf("wrong certificate or password on hosts %v", clusterOpEngine.execContext.hostsWithWrongAuth) - } - - // if failed to get node info from a running database, - // we will try to get it by reading catalog editor - upNodeCount := 0 - for _, n := range nodeStates { - if n.State == util.NodeUpState { - upNodeCount++ - } - } - - if upNodeCount == 0 { - if options.SkipDownDatabase { - return []NodeInfo{}, rfc7807.New(rfc7807.FetchDownDatabase) - } - - return vcc.fetchNodeStateFromDownDB(options) } - return nodeStates, runError + return nodeStates, nil } func (vcc VClusterCommands) fetchNodeStateFromDownDB(options *VFetchNodeStateOptions) ([]NodeInfo, error) { @@ -163,18 +143,7 @@ func (vcc VClusterCommands) fetchNodeStateFromDownDB(options *VFetchNodeStateOpt return nodeStates, err } - for _, h := range vdb.HostList { - var nodeInfo NodeInfo - n := vdb.HostNodeMap[h] - nodeInfo.Address = n.Address - nodeInfo.Name = n.Name - nodeInfo.CatalogPath = n.CatalogPath - nodeInfo.Subcluster = n.Subcluster - nodeInfo.IsPrimary = n.IsPrimary - nodeInfo.Version = n.Version - nodeInfo.State = util.NodeDownState - nodeStates = append(nodeStates, nodeInfo) - } + nodeStates = buildNodeStateList(&vdb, true /*forDownDatabase*/) return nodeStates, nil } @@ -186,73 +155,64 @@ func (vcc VClusterCommands) produceListAllNodesInstructions( vdb *VCoordinationDatabase) ([]clusterOp, error) { var instructions []clusterOp - // get hosts - hosts := options.Hosts - - // validate user name - usePassword := false - if options.Password != nil { - usePassword = true - err := options.validateUserName(vcc.Log) - if err != nil { - return instructions, err - } - } - nmaHealthOp := makeNMAHealthOpSkipUnreachable(options.Hosts) nmaReadVerticaVersionOp := makeNMAReadVerticaVersionOp(vdb) - // Trim host list - hosts = options.updateHostlist(vcc, vdb, hosts) - - httpsCheckNodeStateOp, err := makeHTTPSCheckNodeStateOp(hosts, - usePassword, options.UserName, options.Password) - if err != nil { - return instructions, err - } - if options.GetVersion { instructions = append(instructions, &nmaHealthOp, &nmaReadVerticaVersionOp) } - instructions = append(instructions, - &httpsCheckNodeStateOp, - ) - return instructions, nil } -// Update and limit the hostlist based on status and sandbox info -// Note: if we have any UP main cluster host in the input list, the trimmed hostlist would always contain -// -// only main cluster UP hosts. 
-func (options *VFetchNodeStateOptions) updateHostlist(vcc VClusterCommands, vdb *VCoordinationDatabase, inputHosts []string) []string { - var mainClusterHosts []string - var upSandboxHosts []string - - for _, h := range inputHosts { - vnode, ok := vdb.HostNodeMap[h] - if !ok { - // host address not found in vdb, skip it - continue +func buildNodeStateList(vdb *VCoordinationDatabase, forDownDatabase bool) []NodeInfo { + var nodeStates []NodeInfo + + // a map from a subcluster name to whether it is primary + // Context: if a node is primary, the subcluster it belongs to is a primary subcluster. + // If any of the nodes are down in such a primary subcluster, HTTPSUpdateNodeStateOp cannot correctly + // update its IsPrimary value, because this op sends request to each host. + // We use the following scMap to check whether any node is primary in each subcluster, + // then update other nodes' IsPrimary value in this subcluster. + scMap := make(map[string]bool) + + for _, h := range vdb.HostList { + var nodeInfo NodeInfo + n := vdb.HostNodeMap[h] + nodeInfo.Address = n.Address + nodeInfo.CatalogPath = n.CatalogPath + nodeInfo.IsPrimary = n.IsPrimary + nodeInfo.Name = n.Name + nodeInfo.Sandbox = n.Sandbox + if forDownDatabase { + nodeInfo.State = util.NodeDownState + } else { + nodeInfo.State = n.State } - if vnode.Sandbox == "" && (vnode.State == util.NodeUpState || vnode.State == util.NodeUnknownState) { - mainClusterHosts = append(mainClusterHosts, vnode.Address) - } else if vnode.State == util.NodeUpState { - upSandboxHosts = append(upSandboxHosts, vnode.Address) + nodeInfo.Subcluster = n.Subcluster + nodeInfo.Version = n.Version + + nodeStates = append(nodeStates, nodeInfo) + + if !forDownDatabase { + if isPrimary, exists := scMap[n.Subcluster]; exists { + scMap[n.Subcluster] = isPrimary || n.IsPrimary + } else { + scMap[n.Subcluster] = n.IsPrimary + } } } - if len(mainClusterHosts) > 0 { - vcc.Log.PrintWarning("Main cluster UP node found in host list. The status would be fetched from a main cluster host!") - return mainClusterHosts - } - if len(upSandboxHosts) > 0 { - vcc.Log.PrintWarning("Only sandboxed UP nodes found in host list. The status would be fetched from a sandbox host!") - return upSandboxHosts + + // update IsPrimary of the nodes for running database + if !forDownDatabase { + for i := 0; i < len(nodeStates); i++ { + nodeInfo := nodeStates[i] + scName := nodeInfo.Subcluster + nodeStates[i].IsPrimary = scMap[scName] + } } - // We do not have an up host, so better try with complete input hostlist - return inputHosts + return nodeStates } diff --git a/vclusterops/https_check_node_state_op.go b/vclusterops/https_check_node_state_op.go deleted file mode 100644 index 46adb6a..0000000 --- a/vclusterops/https_check_node_state_op.go +++ /dev/null @@ -1,138 +0,0 @@ -/* - (c) Copyright [2023-2024] Open Text. - Licensed under the Apache License, Version 2.0 (the "License"); - You may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -package vclusterops - -import ( - "errors" - "fmt" - - "github.com/vertica/vcluster/vclusterops/util" -) - -type httpsCheckNodeStateOp struct { - opBase - opHTTPSBase -} - -func makeHTTPSCheckNodeStateOp(hosts []string, - useHTTPPassword bool, - userName string, - httpsPassword *string, -) (httpsCheckNodeStateOp, error) { - op := httpsCheckNodeStateOp{} - op.name = "HTTPCheckNodeStateOp" - op.description = "Check node state from running database" - // The hosts are the ones we are going to talk to. - // They can be a subset of the actual host information that we return, - // as if any of the hosts is responsive, spread can give us the info of all nodes - op.hosts = hosts - op.useHTTPPassword = useHTTPPassword - - err := util.ValidateUsernameAndPassword(op.name, useHTTPPassword, userName) - if err != nil { - return op, err - } - - op.userName = userName - op.httpsPassword = httpsPassword - return op, nil -} - -func (op *httpsCheckNodeStateOp) setupClusterHTTPRequest(hosts []string) error { - for _, host := range hosts { - httpRequest := hostHTTPRequest{} - httpRequest.Method = GetMethod - httpRequest.buildHTTPSEndpoint("nodes") - if op.useHTTPPassword { - httpRequest.Password = op.httpsPassword - httpRequest.Username = op.userName - } - op.clusterHTTPRequest.RequestCollection[host] = httpRequest - } - - return nil -} - -func (op *httpsCheckNodeStateOp) prepare(execContext *opEngineExecContext) error { - execContext.dispatcher.setup(op.hosts) - - return op.setupClusterHTTPRequest(op.hosts) -} - -func (op *httpsCheckNodeStateOp) execute(execContext *opEngineExecContext) error { - if err := op.runExecute(execContext); err != nil { - return err - } - - return op.processResult(execContext) -} - -func (op *httpsCheckNodeStateOp) processResult(execContext *opEngineExecContext) error { - var allErrs error - respondingNodeCount := 0 - - for host, result := range op.clusterHTTPRequest.ResultCollection { - op.logResponse(host, result) - - if result.isUnauthorizedRequest() { - op.logger.PrintError("[%s] unauthorized request: %s", op.name, result.content) - execContext.hostsWithWrongAuth = append(execContext.hostsWithWrongAuth, host) - // return here because we assume that - // we will get the same error across other nodes - allErrs = errors.Join(allErrs, result.err) - return allErrs - } - - if !result.isPassing() { - // for any error, we continue to the next node - if result.isInternalError() { - op.logger.PrintError("[%s] internal error of the /nodes endpoint: %s", op.name, result.content) - // At internal error originated from the server, so its a - // response, just not a successful one. 
- respondingNodeCount++ - } - allErrs = errors.Join(allErrs, result.err) - continue - } - - // parse the /nodes endpoint response - respondingNodeCount++ - nodesStates := nodesStateInfo{} - err := op.parseAndCheckResponse(host, result.content, &nodesStates) - if err != nil { - err = fmt.Errorf("[%s] fail to parse result on host %s: %w", - op.name, host, err) - allErrs = errors.Join(allErrs, err) - continue - } - - nodesInfo := nodesInfo{} - for _, node := range nodesStates.NodeList { - n := node.asNodeInfoWithoutVer() - nodesInfo.NodeList = append(nodesInfo.NodeList, n) - } - // successful case, write the result into exec context - execContext.nodesInfo = nodesInfo.NodeList - op.logger.PrintInfo("reporting results as obtained from the host [%s] ", host) - return nil - } - - return allErrs -} - -func (op *httpsCheckNodeStateOp) finalize(_ *opEngineExecContext) error { - return nil -} diff --git a/vclusterops/https_get_nodes_info_op.go b/vclusterops/https_get_nodes_info_op.go index 66e462b..3eb9065 100644 --- a/vclusterops/https_get_nodes_info_op.go +++ b/vclusterops/https_get_nodes_info_op.go @@ -113,6 +113,13 @@ func (op *httpsGetNodesInfoOp) processResult(_ *opEngineExecContext) error { for host, result := range op.clusterHTTPRequest.ResultCollection { op.logResponse(host, result) + // A host may have precondition failed, such as + // "Local node has not joined cluster yet, HTTP server will accept connections when the node has joined the cluster" + // In this case, we skip use the information from that host + if result.hasPreconditionFailed() { + continue + } + if result.isUnauthorizedRequest() { detail := fmt.Sprintf("[%s] wrong password/certificate for https service on host %s", op.name, host) diff --git a/vclusterops/https_stop_db_op.go b/vclusterops/https_stop_db_op.go index 37e102b..3821e7a 100644 --- a/vclusterops/https_stop_db_op.go +++ b/vclusterops/https_stop_db_op.go @@ -21,6 +21,7 @@ import ( "regexp" "strconv" + mapset "github.com/deckarep/golang-set/v2" "github.com/vertica/vcluster/vclusterops/util" ) @@ -87,6 +88,7 @@ func (op *httpsStopDBOp) prepare(execContext *opEngineExecContext) error { sandboxOnly := false var mainHost string var hosts []string + sandboxes := mapset.NewSet[string]() for h, sb := range execContext.upHostsToSandboxes { if sb == op.sandbox && sb != "" { // stop db only on sandbox @@ -96,7 +98,8 @@ func (op *httpsStopDBOp) prepare(execContext *opEngineExecContext) error { } if sb == "" { mainHost = h - } else { + } else if !sandboxes.Contains(sb) { + sandboxes.Add(sb) hosts = append(hosts, h) } } @@ -124,6 +127,7 @@ func (op *httpsStopDBOp) execute(execContext *opEngineExecContext) error { func (op *httpsStopDBOp) processResult(_ *opEngineExecContext) error { var allErrs error re := regexp.MustCompile(`Set subcluster \(.*\) to draining state.*`) + regHang := regexp.MustCompile(`context\s+deadline\s+exceeded\s+\(Client\.Timeout\s+exceeded\s+while\s+awaiting\s+headers\)`) for host, result := range op.clusterHTTPRequest.ResultCollection { op.logResponse(host, result) @@ -135,6 +139,11 @@ func (op *httpsStopDBOp) processResult(_ *opEngineExecContext) error { } if !result.isPassing() { allErrs = errors.Join(allErrs, result.err) + if regHang.MatchString(result.err.Error()) { + err := fmt.Errorf("hint: use NMA endpoint /v1/vertica-process/signal?signal_type=kill to terminate a hanging Vertica " + + "process on the failed host") + allErrs = errors.Join(allErrs, err) + } continue } diff --git a/vclusterops/https_update_node_state_op.go 
b/vclusterops/https_update_node_state_op.go index 401f9ba..a8a8703 100644 --- a/vclusterops/https_update_node_state_op.go +++ b/vclusterops/https_update_node_state_op.go @@ -87,6 +87,19 @@ func (op *httpsUpdateNodeStateOp) processResult(execContext *opEngineExecContext for host, result := range op.clusterHTTPRequest.ResultCollection { op.logResponse(host, result) + // A host may have precondition failed, such as + // "Local node has not joined cluster yet, HTTP server will accept connections when the node has joined the cluster" + // In this case, we mark the node status as UNKNOWN + if result.hasPreconditionFailed() { + vnode, ok := op.vdb.HostNodeMap[host] + if !ok { + return fmt.Errorf("cannot find host %s in vdb", host) + } + vnode.State = util.NodeUnknownState + + continue + } + if result.isUnauthorizedRequest() { op.logger.PrintError("[%s] unauthorized request: %s", op.name, result.content) execContext.hostsWithWrongAuth = append(execContext.hostsWithWrongAuth, host) @@ -124,6 +137,7 @@ func (op *httpsUpdateNodeStateOp) processResult(execContext *opEngineExecContext return fmt.Errorf("cannot find host %s in vdb", host) } vnode.State = nodeInfo.State + vnode.IsPrimary = nodeInfo.IsPrimary } else { // if the result format is wrong on any of the hosts, we should throw an error return fmt.Errorf(util.NodeInfoCountMismatch, op.name, len(nodesInformation.NodeList), host) diff --git a/vclusterops/nma_save_restore_points_op.go b/vclusterops/nma_save_restore_points_op.go new file mode 100644 index 0000000..a549226 --- /dev/null +++ b/vclusterops/nma_save_restore_points_op.go @@ -0,0 +1,132 @@ +/* + (c) Copyright [2023-2024] Open Text. + Licensed under the Apache License, Version 2.0 (the "License"); + You may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package vclusterops + +import ( + "encoding/json" + "errors" + "fmt" + + "github.com/vertica/vcluster/vclusterops/vlog" +) + +type nmaSaveRestorePointsOp struct { + opBase + dbName string + username string + archiveName string + sandbox string +} + +type saveRestorePointsRequestData struct { + DBName string `json:"dbname"` + ArchiveName string `json:"archive_name"` + UserName string `json:"username"` +} + +// This op is used to save restore points in a database +func makeNMASaveRestorePointsOp(logger vlog.Printer, hosts []string, dbName, username string, + archiveName string, sandbox string) nmaSaveRestorePointsOp { + return nmaSaveRestorePointsOp{ + opBase: opBase{ + name: "NMASaveRestorePointsOp", + description: "Run save restore point query", + logger: logger.WithName("NMASaveRestorePointsOp"), + hosts: hosts, + }, + dbName: dbName, + username: username, + archiveName: archiveName, + sandbox: sandbox, + } +} + +// make https json data +func (op *nmaSaveRestorePointsOp) setupRequestBody() (map[string]string, error) { + hostRequestBodyMap := make(map[string]string, len(op.hosts)) + for _, host := range op.hosts { + requestData := saveRestorePointsRequestData{} + requestData.DBName = op.dbName + requestData.ArchiveName = op.archiveName + requestData.UserName = op.username + + dataBytes, err := json.Marshal(requestData) + if err != nil { + return nil, fmt.Errorf("[%s] fail to marshal request data to JSON string, detail %w", op.name, err) + } + hostRequestBodyMap[host] = string(dataBytes) + } + return hostRequestBodyMap, nil +} + +func (op *nmaSaveRestorePointsOp) setupClusterHTTPRequest(hostRequestBodyMap map[string]string) error { + for host, requestBody := range hostRequestBodyMap { + httpRequest := hostHTTPRequest{} + httpRequest.Method = PostMethod + httpRequest.buildNMAEndpoint("restore-points/save") + httpRequest.RequestData = requestBody + op.clusterHTTPRequest.RequestCollection[host] = httpRequest + } + return nil +} + +func (op *nmaSaveRestorePointsOp) prepare(execContext *opEngineExecContext) error { + hostRequestBody, err := op.setupRequestBody() + if err != nil { + return err + } + execContext.dispatcher.setup(op.hosts) + return op.setupClusterHTTPRequest(hostRequestBody) +} + +func (op *nmaSaveRestorePointsOp) execute(execContext *opEngineExecContext) error { + if err := op.runExecute(execContext); err != nil { + return err + } + + return op.processResult(execContext) +} + +func (op *nmaSaveRestorePointsOp) finalize(_ *opEngineExecContext) error { + return nil +} + +/* +Sample response from the NMA restore-points endpoint: +RespStr: "" (status code:200) +*/ +func (op *nmaSaveRestorePointsOp) processResult(_ *opEngineExecContext) error { + var allErrs error + for host, result := range op.clusterHTTPRequest.ResultCollection { + op.logResponse(host, result) + if result.isUnauthorizedRequest() { + return fmt.Errorf("[%s] wrong certificate for NMA service on host %s", + op.name, host) + } + if result.isPassing() { + var responseObj RestorePoint + err := op.parseAndCheckResponse(host, result.content, &responseObj) + if err != nil { + allErrs = errors.Join(allErrs, err) + continue + } + op.logger.PrintInfo("OP Name: [%s], response: %v", op.name, result.content) + return nil + } + allErrs = errors.Join(allErrs, result.err) + } + return allErrs +} diff --git a/vclusterops/nma_show_restore_points_op.go b/vclusterops/nma_show_restore_points_op.go index 318682c..33baf96 100644 --- a/vclusterops/nma_show_restore_points_op.go +++ b/vclusterops/nma_show_restore_points_op.go @@ -152,6 
+152,8 @@ type RestorePoint struct { Timestamp string `json:"timestamp,omitempty"` // The version of Vertica running when the restore point was created. VerticaVersion string `json:"vertica_version,omitempty"` + // Response str returned from the server. + RespStr string `json:"RespStr,omitempty"` } /* @@ -188,7 +190,6 @@ func (op *nmaShowRestorePointsOp) processResult(execContext *opEngineExecContext allErrs = errors.Join(allErrs, err) continue } - op.logger.PrintInfo("[%s] response: %v", op.name, result.content) execContext.restorePoints = responseObj return nil diff --git a/vclusterops/revive_db_test.go b/vclusterops/revive_db_test.go index 9a7068a..5ae4ab0 100644 --- a/vclusterops/revive_db_test.go +++ b/vclusterops/revive_db_test.go @@ -45,7 +45,8 @@ func TestFindSpecifiedRestorePoint(t *testing.T) { options.RestorePoint.ID = expectedID _, err = options.findSpecifiedRestorePoint(allRestorePoints) expectedErr := fmt.Errorf("found 2 restore points instead of 1: " + - "[{Archive:archive1 ID:id3 Index:2 Timestamp: VerticaVersion:} {Archive:archive1 ID:id3 Index:3 Timestamp: VerticaVersion:}]") + "[{Archive:archive1 ID:id3 Index:2 Timestamp: VerticaVersion: RespStr:} " + + "{Archive:archive1 ID:id3 Index:3 Timestamp: VerticaVersion: RespStr:}]") assert.EqualError(t, err, expectedErr.Error()) // Test case: No matching restore points found diff --git a/vclusterops/save_restore_points.go b/vclusterops/save_restore_points.go new file mode 100644 index 0000000..7c61d51 --- /dev/null +++ b/vclusterops/save_restore_points.go @@ -0,0 +1,200 @@ +/* + (c) Copyright [2023-2024] Open Text. + Licensed under the Apache License, Version 2.0 (the "License"); + You may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package vclusterops + +import ( + "fmt" + + "github.com/vertica/vcluster/vclusterops/util" + "github.com/vertica/vcluster/vclusterops/vlog" +) + +type VSaveRestorePointOptions struct { + DatabaseOptions + ArchiveName string + + // the name of the sandbox to target, if left empty the main cluster is assumed + Sandbox string +} + +func VSaveRestorePointFactory() VSaveRestorePointOptions { + options := VSaveRestorePointOptions{} + // set default values to the params + options.setDefaultValues() + return options +} + +func (options *VSaveRestorePointOptions) validateEonOptions(_ vlog.Printer) error { + if !options.IsEon { + return fmt.Errorf("save restore point is only supported in Eon mode") + } + return nil +} + +// Save restore impl +func (options *VSaveRestorePointOptions) validateRequiredOptions(logger vlog.Printer) error { + err := options.validateEonOptions(logger) + if err != nil { + return err + } + err = options.validateBaseOptions(SaveRestorePointsCmd, logger) + if err != nil { + return err + } + if options.ArchiveName == "" { + return fmt.Errorf("must specify an archive name") + } + err = util.ValidateArchiveName(options.ArchiveName) + if err != nil { + return err + } + return nil +} + +func (options *VSaveRestorePointOptions) validateExtraOptions() error { + if options.Sandbox != "" { + return util.ValidateSandboxName(options.Sandbox) + } + return nil +} + +func (options *VSaveRestorePointOptions) validateParseOptions(logger vlog.Printer) error { + // batch 1: validate required parameters + err := options.validateRequiredOptions(logger) + if err != nil { + return err + } + + // batch 2: validate all other params + err = options.validateExtraOptions() + if err != nil { + return err + } + return nil +} + +// analyzeOptions will modify some options based on what is chosen +func (options *VSaveRestorePointOptions) analyzeOptions() (err error) { + // we analyze host names when it is set in user input, otherwise we use hosts in yaml config + if len(options.RawHosts) > 0 { + // resolve RawHosts to be IP addresses + hostAddresses, err := util.ResolveRawHostsToAddresses(options.RawHosts, options.IPv6) + if err != nil { + return err + } + options.Hosts = hostAddresses + } + return nil +} + +func (options *VSaveRestorePointOptions) validateAnalyzeOptions(logger vlog.Printer) error { + if err := options.validateParseOptions(logger); err != nil { + return err + } + if err := options.validateUserName(logger); err != nil { + return err + } + return options.analyzeOptions() +} + +// Update and limit the hostlist based on status and sandbox info +// Note: If sandbox provided, pick up sandbox up hosts and return. Else return up hosts. +// +// only main cluster UP hosts. 
+func (options *VSaveRestorePointOptions) updateHostlist(_ VClusterCommands, vdb *VCoordinationDatabase, + inputHosts []string, sandbox string) []string { + var clusterHosts []string + var upSandboxHosts []string + + for _, h := range inputHosts { + vnode, ok := vdb.HostNodeMap[h] + if !ok { + // host address not found in vdb, skip it + continue + } + if vnode.Sandbox == "" && vnode.State == util.NodeUpState { + clusterHosts = append(clusterHosts, vnode.Address) + } else if vnode.Sandbox == sandbox && vnode.State == util.NodeUpState { + upSandboxHosts = append(upSandboxHosts, vnode.Address) + } + } + if sandbox == "" { + return clusterHosts + } + return upSandboxHosts +} + +// VSaveRestorePoint can save restore point to a given archive +func (vcc VClusterCommands) VSaveRestorePoint(options *VSaveRestorePointOptions) (err error) { + /* + * - Produce Instructions + * - Create a VClusterOpEngine + * - Give the instructions to the VClusterOpEngine to run + */ + + // validate and analyze options + err = options.validateAnalyzeOptions(vcc.Log) + if err != nil { + return err + } + + // produce save restore points instructions + instructions, err := vcc.produceSaveRestorePointsInstructions(options) + if err != nil { + return fmt.Errorf("fail to produce instructions, %w", err) + } + + // create a VClusterOpEngine, and add certs to the engine + clusterOpEngine := makeClusterOpEngine(instructions, options) + + // give the instructions to the VClusterOpEngine to run + runError := clusterOpEngine.run(vcc.Log) + if runError != nil { + return fmt.Errorf("fail to save restore point: %w", runError) + } + return nil +} + +// The generated instructions will later perform the following operations necessary +// for a successful save_restore_point: +// - Retrieve VDB from HTTP endpoints +// - Check NMA connectivity +// - Check Vertica versions +// - Run save restore points on the target node +func (vcc VClusterCommands) produceSaveRestorePointsInstructions(options *VSaveRestorePointOptions) ([]clusterOp, error) { + var instructions []clusterOp + vdb := makeVCoordinationDatabase() + + err := vcc.getVDBFromRunningDBIncludeSandbox(&vdb, &options.DatabaseOptions, util.MainClusterSandbox) + if err != nil { + return instructions, err + } + + // get up hosts + hosts := options.Hosts + nmaHealthOp := makeNMAHealthOp(options.Hosts) + // Trim host list + hosts = options.updateHostlist(vcc, &vdb, hosts, options.Sandbox) + bootstrapHost := []string{getInitiator(hosts)} + + nmaSaveRestorePointOp := makeNMASaveRestorePointsOp(vcc.Log, bootstrapHost, + options.DBName, options.UserName, options.ArchiveName, options.Sandbox) + + instructions = append(instructions, + &nmaHealthOp, + &nmaSaveRestorePointOp) + return instructions, nil +} diff --git a/vclusterops/restore_points.go b/vclusterops/show_restore_points.go similarity index 100% rename from vclusterops/restore_points.go rename to vclusterops/show_restore_points.go diff --git a/vclusterops/util/util.go b/vclusterops/util/util.go index 352c37d..3a620d9 100644 --- a/vclusterops/util/util.go +++ b/vclusterops/util/util.go @@ -561,6 +561,10 @@ func ValidateSandboxName(dbName string) error { return ValidateName(dbName, "sandbox", true) } +func ValidateArchiveName(archive string) error { + return ValidateName(archive, "archive", true) +} + // suppress help message for hidden options func SetParserUsage(parser *flag.FlagSet, op string) { fmt.Printf("Usage of %s:\n", op)
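
A minimal usage sketch of the new save-restore-point library API introduced in this change, for reference. The VClusterCommands value, host address, user name, and password below are illustrative assumptions; only the option fields and method names come from the diff itself, and constructing/configuring VClusterCommands (including its logger) is left to the caller.

package restorepointexample

import (
	"github.com/vertica/vcluster/vclusterops"
)

// saveRestorePoint sketches a direct library call into the new API.
// vcc is assumed to be a vclusterops.VClusterCommands value whose logger
// has already been set up by the caller.
func saveRestorePoint(vcc vclusterops.VClusterCommands, password string) error {
	opts := vclusterops.VSaveRestorePointFactory()
	opts.DBName = "test_db"
	opts.RawHosts = []string{"10.20.30.40"} // hypothetical node address
	opts.IsEon = true                       // save_restore_point is supported only for Eon databases
	opts.UserName = "dbadmin"               // hypothetical database user
	opts.Password = &password
	opts.ArchiveName = "ARCHIVE_ONE" // archive that will hold the new restore point
	opts.Sandbox = ""                // empty string targets the main cluster

	// VSaveRestorePoint validates the options, trims the host list to UP hosts
	// (sandbox hosts when Sandbox is set), and runs the NMA restore-points/save
	// endpoint against a single initiator host.
	return vcc.VSaveRestorePoint(&opts)
}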