Merge pull request #179 from outbrain/merge-downstream-gh

added force-master-takeover: planned master switch onto a direct child
outbrain-inc · Mar 10, 2016 · f1c214b · f1c214b
2 parents 18ddffa + 617f859
commit f1c214b
Show file tree

Hide file tree

Showing 4 changed files with 85 additions and 10 deletions.
diff --git a/build.sh b/build.sh
@@ -6,7 +6,7 @@
 #
 set -e
 
-RELEASE_VERSION="1.4.579"
+RELEASE_VERSION="1.4.580"
 TOPDIR=/tmp/orchestrator-release
 export RELEASE_VERSION TOPDIR
 export GO15VENDOREXPERIMENT=1

diff --git a/go/app/cli.go b/go/app/cli.go
@@ -1117,6 +1117,44 @@ func Cli(command string, strict bool, instance string, destination string, owner
 				fmt.Println(promotedInstanceKey.DisplayString())
 			}
 		}
+	case registerCliCommand("force-master-takeover", "Recovery", `Forcibly discard master and promote another (direct child) instance instead, even if everything is running well`):
+		{
+			clusterName := getClusterName(clusterAlias, instanceKey)
+			clusterMasters, err := inst.ReadClusterWriteableMaster(clusterName)
+			if err != nil {
+				log.Fatalf("Cannot deduce cluster master for %+v", clusterName)
+			}
+			var clusterMaster *inst.Instance
+			if len(clusterMasters) == 1 {
+				clusterMaster = clusterMasters[0]
+			} else {
+				log.Fatalf("Cannot deduce cluster master for %+v", clusterName)
+			}
+
+			if destinationKey == nil {
+				log.Fatal("Cannot deduce destination, the instance to promote in place of the master. Please provide with -d")
+			}
+			destination := validateInstanceIsFound(destinationKey)
+			if !destination.MasterKey.Equals(&clusterMaster.Key) {
+				log.Fatalf("You may only promote a direct child of the master %+v. The master of %+v is %+v.", clusterMaster.Key, destination.Key, destination.MasterKey)
+			}
+			log.Debugf("Will demote %+v and promote %+v instead", clusterMaster.Key, *destinationKey)
+
+			recoveryAttempted, topologyRecovery, err := logic.ForceExecuteRecovery(clusterName, inst.DeadMaster, &clusterMaster.Key, destinationKey, false)
+			if err != nil {
+				log.Fatale(err)
+			}
+			if !recoveryAttempted {
+				log.Fatalf("Unexpected error: recovery not attempted. This should not happen")
+			}
+			if topologyRecovery == nil {
+				log.Fatalf("Recovery attempted but with no results. This should not happen")
+			}
+			if topologyRecovery.SuccessorKey == nil {
+				log.Fatalf("Recovery attempted yet no slave promoted")
+			}
+			fmt.Println(topologyRecovery.SuccessorKey.DisplayString())
+		}
 	case registerCliCommand("replication-analysis", "Recovery", `Request an analysis of potential crash incidents in all known topologies`):
 		{
 			analysis, err := inst.GetReplicationAnalysis("", false, false)

diff --git a/go/cmd/orchestrator/main.go b/go/cmd/orchestrator/main.go
@@ -761,6 +761,30 @@ Cheatsheet:
 
             orchestrator -c recover-lite -i dead.instance.com --debug
 
+				force-master-takeover
+						Forcibly discard master and promote another (direct child) instance instead, even if everything is running well.
+						This allows for planned switchover.
+						NOTE:
+						- You must specify the instance to promote via "-d"
+						- Promoted instance must be a direct child of the existing master
+						- This will not work in a master-master configuration
+						- Orchestrator just treats this command as a DeadMaster failover scenario
+						- It is STRONGLY suggested that you first relocate everything below your chosen instance-to-promote.
+						  It *is* a planned failover thing.
+						- Otherwise orchestrator will do its thing in moving instances around, hopefully promoting your requested
+						  server on top.
+						- Orchestrator will issue all relevant pre-failover and post-failover external processes.
+						- At this time orchestrator will not issue 'SET GLOBAL read_only=1' on the existing master, nor will
+						  it issue a 'FLUSH TABLES WITH READ LOCK'. This is being investigated.
+						Examples:
+
+						orchestrator -c force-master-takeover -alias mycluster -d immediate.child.of.master.com
+								Indicate cluster by alias. Orchestrator automatically figures out the master
+
+						orchestrator -c force-master-takeover -i instance.in.relevant.cluster.com -d immediate.child.of.master.com
+								Indicate cluster by an instance. You don't strictly need to specify the master, orchestrator
+								will infer the master's identity.
+
         replication-analysis
             Request an analysis of potential crash incidents in all known topologies.
             Output format is not yet stabilized and may change in the future. Do not trust the output
@@ -870,15 +894,11 @@ Cheatsheet:
 
             orchestrator -c resolve -i cname.to.resolve
 
-        reset-internal-db-deployment
-            Clear internal db deployment history, use if somehow corrupted internal deployment history.
-            When configured with '"SmartOrchestratorDatabaseUpdate": true', Orchestrator does housekeeping for its
-            own database schema, and verifies proposed deployment vs deployment history.
-            In case of contradiction between the two orchestrator bails out. Such a contradiction should not occur, and may
-            signify an inconsistency in the orchestrator code itself.
-            By resetting history orchestrator redeploys its schema (without causing data loss) and accepts the new instructions
-            as the de-factor deployment rule.
-
+        redeploy-internal-db
+						Force internal schema migration to current backend structure. Orchestrator keeps track of the deployed
+						versions and will not reissue a migration for a version already deployed. Normally you should not use
+						this command, and it is provided mostly for building and testing purposes. Nonetheless it is safe to
+						use and at most it wastes some cycles.
     `
 
 // main is the application's entry point. It will either spawn a CLI or HTTP interfaces.

diff --git a/go/logic/topology_recovery.go b/go/logic/topology_recovery.go
@@ -1020,3 +1020,20 @@ func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *i
 	}
 	return recoveryAttempted, promotedSlaveKey, err
 }
+
+// ForceExecuteRecovery issues a recovery process even if analysis says there is no recovery case: it reads the cluster's info by clusterName and hands a caller-described analysis entry to executeCheckAndRecoverFunction, returning that function's results.
+// The caller injects, via analysisCode, the type of analysis the function should assume for failedInstanceKey (which is dereferenced and so must be non-nil); candidateInstanceKey and skipProcesses are passed through unchanged.
+// By calling this function one takes responsibility for one's actions. If reading the cluster info fails, that error is returned and no recovery is attempted.
+func ForceExecuteRecovery(clusterName string, analysisCode inst.AnalysisCode, failedInstanceKey *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) {
+	clusterInfo, err := inst.ReadClusterInfo(clusterName)
+	if err != nil {
+		return recoveryAttempted, topologyRecovery, err // named results are still zero-valued here: false, nil
+	}
+
+	analysisEntry := inst.ReplicationAnalysis{
+		Analysis:            analysisCode,
+		ClusterDetails:      *clusterInfo,
+		AnalyzedInstanceKey: *failedInstanceKey, // copies the key; panics if failedInstanceKey is nil
+	}
+	return executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, true, skipProcesses) // NOTE(review): the literal true is presumably a "force recovery" flag; confirm against executeCheckAndRecoverFunction
+}