Skip to content

Commit

Permalink
Merge pull request #179 from outbrain/merge-downstream-gh
Browse files Browse the repository at this point in the history
added force-master-takeover: planned master switch onto a direct child
  • Loading branch information
Shlomi Noach committed Mar 10, 2016
2 parents 18ddffa + 617f859 commit f1c214b
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 10 deletions.
2 changes: 1 addition & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#
set -e

RELEASE_VERSION="1.4.579"
RELEASE_VERSION="1.4.580"
TOPDIR=/tmp/orchestrator-release
export RELEASE_VERSION TOPDIR
export GO15VENDOREXPERIMENT=1
Expand Down
38 changes: 38 additions & 0 deletions go/app/cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -1117,6 +1117,44 @@ func Cli(command string, strict bool, instance string, destination string, owner
fmt.Println(promotedInstanceKey.DisplayString())
}
}
case registerCliCommand("force-master-takeover", "Recovery", `Forcibly discard master and promote another (direct child) instance instead, even if everything is running well`):
{
clusterName := getClusterName(clusterAlias, instanceKey)
clusterMasters, err := inst.ReadClusterWriteableMaster(clusterName)
if err != nil {
log.Fatalf("Cannot deduce cluster master for %+v", clusterName)
}
var clusterMaster *inst.Instance
if len(clusterMasters) == 1 {
clusterMaster = clusterMasters[0]
} else {
log.Fatalf("Cannot deduce cluster master for %+v", clusterName)
}

if destinationKey == nil {
log.Fatal("Cannot deduce destination, the instance to promote in place of the master. Please provide with -d")
}
destination := validateInstanceIsFound(destinationKey)
if !destination.MasterKey.Equals(&clusterMaster.Key) {
log.Fatalf("You may only promote a direct child of the master %+v. The master of %+v is %+v.", clusterMaster.Key, destination.Key, destination.MasterKey)
}
log.Debugf("Will demote %+v and promote %+v instead", clusterMaster.Key, *destinationKey)

recoveryAttempted, topologyRecovery, err := logic.ForceExecuteRecovery(clusterName, inst.DeadMaster, &clusterMaster.Key, destinationKey, false)
if err != nil {
log.Fatale(err)
}
if !recoveryAttempted {
log.Fatalf("Unexpected error: recovery not attempted. This should not happen")
}
if topologyRecovery == nil {
log.Fatalf("Recovery attempted but with no results. This should not happen")
}
if topologyRecovery.SuccessorKey == nil {
log.Fatalf("Recovery attempted yet no slave promoted")
}
fmt.Println(topologyRecovery.SuccessorKey.DisplayString())
}
case registerCliCommand("replication-analysis", "Recovery", `Request an analysis of potential crash incidents in all known topologies`):
{
analysis, err := inst.GetReplicationAnalysis("", false, false)
Expand Down
38 changes: 29 additions & 9 deletions go/cmd/orchestrator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,30 @@ Cheatsheet:
orchestrator -c recover-lite -i dead.instance.com --debug
force-master-takeover
Forcibly discard master and promote another (direct child) instance instead, even if everything is running well.
This allows for planned switchover.
NOTE:
- You must specify the instance to promote via "-d"
- Promoted instance must be a direct child of the existing master
- This will not work in a master-master configuration
- Orchestrator just treats this command as a DeadMaster failover scenario
- It is STRONGLY suggested that you first relocate everything below your chosen instance-to-promote.
It *is* a planned failover thing.
- Otherwise orchestrator will do its thing in moving instances around, hopefully promoting your requested
server on top.
- Orchestrator will issue all relevant pre-failover and post-failover external processes.
- At this time orchestrator will not issue 'SET GLOBAL read_only=1' on the existing master, nor will
it issue a 'FLUSH TABLES WITH READ LOCK'. This is being investigated.
Examples:
orchestrator -c force-master-takeover -alias mycluster -d immediate.child.of.master.com
Indicate cluster by alias. Orchestrator automatically figures out the master
orchestrator -c force-master-takeover -i instance.in.relevant.cluster.com -d immediate.child.of.master.com
Indicate cluster by an instance. You don't strictly need to specify the master, orchestrator
will infer the master's identity.
replication-analysis
Request an analysis of potential crash incidents in all known topologies.
Output format is not yet stabilized and may change in the future. Do not trust the output
Expand Down Expand Up @@ -870,15 +894,11 @@ Cheatsheet:
orchestrator -c resolve -i cname.to.resolve
reset-internal-db-deployment
Clear internal db deployment history, use if somehow corrupted internal deployment history.
When configured with '"SmartOrchestratorDatabaseUpdate": true', Orchestrator does housekeeping for its
own database schema, and verifies proposed deployment vs deployment history.
In case of contradiction between the two orchestrator bails out. Such a contradiction should not occur, and may
signify an inconsistency in the orchestrator code itself.
By resetting history orchestrator redeploys its schema (without causing data loss) and accepts the new instructions
as the de-facto deployment rule.
redeploy-internal-db
Force internal schema migration to current backend structure. Orchestrator keeps track of the deployed
versions and will not reissue a migration for a version already deployed. Normally you should not use
this command, and it is provided mostly for building and testing purposes. Nonetheless it is safe to
use and at most it wastes some cycles.
`

// main is the application's entry point. It will spawn either a CLI or an HTTP interface.
Expand Down
17 changes: 17 additions & 0 deletions go/logic/topology_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -1020,3 +1020,20 @@ func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *i
}
return recoveryAttempted, promotedSlaveKey, err
}

// ForceExecuteRecovery runs the recovery flow for a cluster regardless of what
// analysis would currently report. The caller supplies the analysis code that
// recovery should assume, the key of the instance to treat as failed, and the
// candidate to hand to the recovery function; the caller owns the consequences
// of forcing the operation.
//
// If the cluster info cannot be read, no recovery is attempted and the read
// error is returned as-is. Otherwise the results of
// executeCheckAndRecoverFunction are returned unchanged.
func ForceExecuteRecovery(clusterName string, analysisCode inst.AnalysisCode, failedInstanceKey *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) {
	clusterDetails, readErr := inst.ReadClusterInfo(clusterName)
	if readErr != nil {
		return false, nil, readErr
	}

	// Build the analysis entry from the caller's inputs rather than from
	// observed state.
	var forcedAnalysis inst.ReplicationAnalysis
	forcedAnalysis.Analysis = analysisCode
	forcedAnalysis.ClusterDetails = *clusterDetails
	forcedAnalysis.AnalyzedInstanceKey = *failedInstanceKey

	return executeCheckAndRecoverFunction(forcedAnalysis, candidateInstanceKey, true, skipProcesses)
}

0 comments on commit f1c214b

Please sign in to comment.