Skip to content

Commit

Permalink
Merge pull request #320 from prashanth26/bugfix/delete-unhealthy-mach…
Browse files Browse the repository at this point in the history
…ines

Fixed issues while deleting unhealthy machines
  • Loading branch information
prashanth26 authored Aug 22, 2019
2 parents 373b416 + 99fadf9 commit f2cb0ff
Show file tree
Hide file tree
Showing 5 changed files with 240 additions and 29 deletions.
7 changes: 4 additions & 3 deletions pkg/controller/drain.go
Original file line number Diff line number Diff line change
Expand Up @@ -852,12 +852,13 @@ func (o *DrainOptions) evictPodWithoutPVInternal(attemptEvict bool, pod *corev1.
podArray := []*api.Pod{pod}

timeout := o.getTerminationGracePeriod(pod)
if timeout < o.Timeout {
glog.V(3).Infof("Overriding large termination grace period (%s) for the pod %s/%s", timeout.String(), pod.Namespace, pod.Name)
if timeout > o.Timeout {
glog.V(3).Infof("Overriding large termination grace period (%s) for the pod %s/%s and setting it to %s", timeout.String(), pod.Namespace, pod.Name, o.Timeout)
timeout = o.Timeout
}

podArray, err = o.waitForDelete(podArray, Interval, timeout, true, getPodFn)
bufferPeriod := 30 * time.Second
podArray, err = o.waitForDelete(podArray, Interval, timeout+bufferPeriod, true, getPodFn)
if err == nil {
if len(podArray) > 0 {
returnCh <- fmt.Errorf("timeout expired while waiting for pod %q terminating scheduled on node %v", pod.Name, pod.Spec.NodeName)
Expand Down
32 changes: 30 additions & 2 deletions pkg/controller/drain_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ var _ = Describe("drain", func() {
const terminationGracePeriodShortBy8 = terminationGracePeriodShort / 8
const terminationGracePeriodMedium = 10 * time.Second
const terminationGracePeriodDefault = 20 * time.Second
const terminationGracePeriodLong = 40 * time.Second
const terminationGracePeriodLong = 2 * time.Minute

type stats struct {
nPodsWithoutPV int
Expand Down Expand Up @@ -711,7 +711,35 @@ var _ = Describe("drain", func() {
drainError: nil,
nEvictions: 0,
minDrainDuration: 0,
}))
}),
Entry("Successful drain for pods with long termination grace period",
&setup{
stats: stats{
nPodsWithoutPV: 10,
nPodsWithOnlyExclusivePV: 2,
nPodsWithOnlySharedPV: 0,
nPodsWithExclusiveAndSharedPV: 0,
},
attemptEviction: true,
terminationGracePeriod: terminationGracePeriodLong,
},
[]podDrainHandler{deletePod, sleepFor(terminationGracePeriodShortBy8), detachExclusiveVolumes},
&expectation{
stats: stats{
nPodsWithoutPV: 0,
nPodsWithOnlyExclusivePV: 0,
nPodsWithOnlySharedPV: 0,
nPodsWithExclusiveAndSharedPV: 0,
},
// Because waitForDetach polling Interval is equal to terminationGracePeriodShort
timeout: terminationGracePeriodLong,
drainTimeout: false,
drainError: nil,
nEvictions: 12,
// Because waitForDetach polling Interval is equal to terminationGracePeriodShort
minDrainDuration: terminationGracePeriodMedium,
}),
)
})

func getPodWithoutPV(ns, name, nodeName string, terminationGracePeriod time.Duration, labels map[string]string) *corev1.Pod {
Expand Down
41 changes: 29 additions & 12 deletions pkg/controller/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -484,21 +484,38 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv
}

if machineID != "" {
maxEvictRetries := c.safetyOptions.MaxEvictRetries
pvDetachTimeOut := c.safetyOptions.PvDetachTimeout.Duration
timeOutDuration := c.safetyOptions.MachineDrainTimeout.Duration
var (
forceDeletePods = false
forceDeleteMachine = false
timeOutOccurred = false
maxEvictRetries = c.safetyOptions.MaxEvictRetries
pvDetachTimeOut = c.safetyOptions.PvDetachTimeout.Duration
timeOutDuration = c.safetyOptions.MachineDrainTimeout.Duration
forceDeleteLabelPresent = machine.Labels["force-deletion"] == "True"
lastDrainFailed = machine.Status.LastOperation.Description[0:12] == "Drain failed"
)

// Timeout value obtained by subtracting last operation with expected time out period
timeOut := metav1.Now().Add(-timeOutDuration).Sub(machine.Status.CurrentStatus.LastUpdateTime.Time)
forceDeletePods := false

// To perform forceful drain either one of the below conditions must be satisfied
// 1. force-deletion: "True" label must be present
// 2. Deletion operation is more than drain-timeout minutes old
if machine.Labels["force-deletion"] == "True" || timeOut > 0 {
timeOutOccurred = timeOut > 0

if forceDeleteLabelPresent || timeOutOccurred || lastDrainFailed {
// To perform forceful machine drain/delete either one of the below conditions must be satisfied
// 1. force-deletion: "True" label must be present
// 2. Deletion operation is more than drain-timeout minutes old
// 3. Last machine drain had failed
forceDeleteMachine = true
forceDeletePods = true
timeOutDuration = 30 * time.Second
timeOutDuration = 1 * time.Minute
maxEvictRetries = 1
glog.V(2).Infof("Force deletion has been triggerred for machine %q", machine.Name)

glog.V(2).Infof(
"Force deletion has been triggerred for machine %q due to Label:%t, timeout:%t, lastDrainFailed:%t",
machine.Name,
forceDeleteLabelPresent,
timeOutOccurred,
lastDrainFailed,
)
}

buf := bytes.NewBuffer([]byte{})
Expand Down Expand Up @@ -526,7 +543,7 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv
// Drain successful
glog.V(2).Infof("Drain successful for machine %q. \nBuf:%v \nErrBuf:%v", machine.Name, buf, errBuf)

} else if err != nil && machine.Labels["force-deletion"] == "True" {
} else if err != nil && forceDeleteMachine {
// Drain failed on force deletion
glog.Warningf("Drain failed for machine %q. However, since it's a force deletion shall continue deletion of VM. \nBuf:%v \nErrBuf:%v \nErr-Message:%v", machine.Name, buf, errBuf, err)

Expand Down
187 changes: 176 additions & 11 deletions pkg/controller/machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"time"

machineapi "github.com/gardener/machine-controller-manager/pkg/apis/machine"
"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
machinev1 "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
"github.com/gardener/machine-controller-manager/pkg/apis/machine/validation"
fakemachineapi "github.com/gardener/machine-controller-manager/pkg/client/clientset/versioned/typed/machine/v1alpha1/fake"
Expand Down Expand Up @@ -634,11 +635,12 @@ var _ = Describe("machine", func() {
fakeResourceActions *customfake.ResourceActions
}
type action struct {
machine string
fakeProviderID string
fakeNodeName string
fakeError error
forceDelete bool
machine string
fakeProviderID string
fakeNodeName string
fakeError error
forceDeleteLabelPresent bool
fakeMachineStatus *machinev1.MachineStatus
}
type expect struct {
machine *machinev1.Machine
Expand Down Expand Up @@ -711,14 +713,21 @@ var _ = Describe("machine", func() {
machine, err = controller.controlMachineClient.Machines(objMeta.Namespace).Get(action.machine, metav1.GetOptions{})
Expect(err).ToNot(HaveOccurred())

if data.action.forceDelete {
if data.action.forceDeleteLabelPresent {
// Add labels for force deletion
clone := machine.DeepCopy()
clone.Labels["force-deletion"] = "True"
machine, err = controller.controlMachineClient.Machines(objMeta.Namespace).Update(clone)
Expect(err).ToNot(HaveOccurred())
}

if data.action.fakeMachineStatus != nil {
clone := machine.DeepCopy()
clone.Status = *data.action.fakeMachineStatus
machine, err = controller.controlMachineClient.Machines(objMeta.Namespace).UpdateStatus(clone)
Expect(err).ToNot(HaveOccurred())
}

if data.setup.fakeResourceActions != nil {
trackers.TargetCore.SetFakeResourceActions(data.setup.fakeResourceActions)
}
Expand All @@ -745,7 +754,7 @@ var _ = Describe("machine", func() {
Expect(node).ToNot(BeNil())
}
},
Entry("Machine deletion", &data{
Entry("Simple machine deletion", &data{
setup: setup{
secrets: []*corev1.Secret{
{
Expand Down Expand Up @@ -822,7 +831,86 @@ var _ = Describe("machine", func() {
machineDeleted: false,
},
}),
Entry("Machine force deletion", &data{
Entry("Machine force deletion label is present", &data{
setup: setup{
secrets: []*corev1.Secret{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
},
},
aws: []*machinev1.AWSMachineClass{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.AWSMachineClassSpec{
SecretRef: newSecretReference(objMeta, 0),
},
},
},
machines: newMachines(1, &machinev1.MachineTemplateSpec{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.MachineSpec{
Class: machinev1.ClassSpec{
Kind: "AWSMachineClass",
Name: "machine-0",
},
},
}, nil, nil, nil, nil),
},
action: action{
machine: "machine-0",
fakeProviderID: "fakeID-0",
fakeNodeName: "fakeNode-0",
fakeError: nil,
forceDeleteLabelPresent: true,
},
expect: expect{
errOccurred: false,
machineDeleted: true,
},
}),
Entry("Machine force deletion label is present and when drain call fails (APIServer call fails)", &data{
setup: setup{
secrets: []*corev1.Secret{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
},
},
aws: []*machinev1.AWSMachineClass{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.AWSMachineClassSpec{
SecretRef: newSecretReference(objMeta, 0),
},
},
},
machines: newMachines(1, &machinev1.MachineTemplateSpec{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.MachineSpec{
Class: machinev1.ClassSpec{
Kind: "AWSMachineClass",
Name: "machine-0",
},
},
}, nil, nil, nil, nil),
fakeResourceActions: &customfake.ResourceActions{
Node: customfake.Actions{
Update: "Failed to update nodes",
},
},
},
action: action{
machine: "machine-0",
fakeProviderID: "fakeID-0",
fakeNodeName: "fakeNode-0",
fakeError: nil,
forceDeleteLabelPresent: true,
},
expect: expect{
errOccurred: false,
machineDeleted: true,
},
}),
Entry("Machine deletion when timeout occurred", &data{
setup: setup{
secrets: []*corev1.Secret{
{
Expand Down Expand Up @@ -852,14 +940,78 @@ var _ = Describe("machine", func() {
fakeProviderID: "fakeID-0",
fakeNodeName: "fakeNode-0",
fakeError: nil,
forceDelete: true,
fakeMachineStatus: &machinev1.MachineStatus{
Node: "fakeNode-0",
LastOperation: machinev1.LastOperation{
Description: "Deleting machine from cloud provider",
State: v1alpha1.MachineStateProcessing,
Type: v1alpha1.MachineOperationDelete,
LastUpdateTime: metav1.Now(),
},
CurrentStatus: machinev1.CurrentStatus{
Phase: v1alpha1.MachineTerminating,
TimeoutActive: false,
// Updating last update time to 30 minutes before now
LastUpdateTime: metav1.NewTime(time.Now().Add(-30 * time.Minute)),
},
},
},
expect: expect{
errOccurred: false,
machineDeleted: true,
},
}),
Entry("Machine force deletion when drain call fails (APIServer call fails)", &data{
Entry("Machine deletion when last drain failed", &data{
setup: setup{
secrets: []*corev1.Secret{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
},
},
aws: []*machinev1.AWSMachineClass{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.AWSMachineClassSpec{
SecretRef: newSecretReference(objMeta, 0),
},
},
},
machines: newMachines(1, &machinev1.MachineTemplateSpec{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.MachineSpec{
Class: machinev1.ClassSpec{
Kind: "AWSMachineClass",
Name: "machine-0",
},
},
}, nil, nil, nil, nil),
},
action: action{
machine: "machine-0",
fakeProviderID: "fakeID-0",
fakeNodeName: "fakeNode-0",
fakeError: nil,
fakeMachineStatus: &machinev1.MachineStatus{
Node: "fakeNode-0",
LastOperation: machinev1.LastOperation{
Description: "Drain failed - for random reason",
State: v1alpha1.MachineStateFailed,
Type: v1alpha1.MachineOperationDelete,
LastUpdateTime: metav1.Now(),
},
CurrentStatus: machinev1.CurrentStatus{
Phase: v1alpha1.MachineTerminating,
TimeoutActive: false,
LastUpdateTime: metav1.NewTime(time.Now().Add(-2 * time.Minute)),
},
},
},
expect: expect{
errOccurred: false,
machineDeleted: true,
},
}),
Entry("Machine deletion when last drain failed and current drain call also fails (APIServer call fails)", &data{
setup: setup{
secrets: []*corev1.Secret{
{
Expand Down Expand Up @@ -894,7 +1046,20 @@ var _ = Describe("machine", func() {
fakeProviderID: "fakeID-0",
fakeNodeName: "fakeNode-0",
fakeError: nil,
forceDelete: true,
fakeMachineStatus: &machinev1.MachineStatus{
Node: "fakeNode-0",
LastOperation: machinev1.LastOperation{
Description: "Drain failed - for random reason",
State: v1alpha1.MachineStateFailed,
Type: v1alpha1.MachineOperationDelete,
LastUpdateTime: metav1.Now(),
},
CurrentStatus: machinev1.CurrentStatus{
Phase: v1alpha1.MachineTerminating,
TimeoutActive: false,
LastUpdateTime: metav1.NewTime(time.Now().Add(-2 * time.Minute)),
},
},
},
expect: expect{
errOccurred: false,
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/machineset.go
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ func (c *controller) manageReplicas(allMachines []*v1alpha1.Machine, machineSet
c.terminateMachines(staleMachines, machineSet)

diff := len(activeMachines) - int(machineSet.Spec.Replicas)
glog.V(3).Infof("Difference between current active replicas and desired replicas - %d", diff)
glog.V(4).Infof("Difference between current active replicas and desired replicas - %d", diff)

if diff < 0 {
// If MachineSet is frozen and no deletion timestamp, don't process it
Expand Down

0 comments on commit f2cb0ff

Please sign in to comment.