Skip to content

Commit

Permalink
Added failed machine to prevent scaledown (#291)
Browse files Browse the repository at this point in the history
* Added failed machine to prevent scaledown

* Addressed review comments

* Added unit test

* Addressed review comments

* Changes logs

* Changes logs - II
  • Loading branch information
sssash18 committed Jan 30, 2024
1 parent 914d134 commit 46ee449
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 7 deletions.
17 changes: 17 additions & 0 deletions cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,23 @@ func TestDeleteNodes(t *testing.T) {
err: nil,
},
},
{
"should not scale down a machine deployment when the corresponding machine is already in failed state",
setup{
nodes: newNodes(2, "fakeID", []bool{true, false}),
machines: newMachines(2, "fakeID", &v1alpha1.MachineStatus{CurrentStatus: v1alpha1.CurrentStatus{Phase: v1alpha1.MachineFailed}}, "machinedeployment-1", "machineset-1", []string{"3", "3"}, []bool{false, false}),
machineSets: newMachineSets(1, "machinedeployment-1"),
machineDeployments: newMachineDeployments(1, 2, nil, nil, nil),
nodeGroups: []string{nodeGroup1},
},
action{node: newNodes(1, "fakeID", []bool{false})[0]},
expect{
machines: newMachines(2, "fakeID", &v1alpha1.MachineStatus{CurrentStatus: v1alpha1.CurrentStatus{Phase: v1alpha1.MachineFailed}}, "machinedeployment-1", "machineset-1", []string{"3", "3"}, []bool{false, false}),
mdName: "machinedeployment-1",
mdReplicas: 2,
err: nil,
},
},
{
"should not scale down a machine deployment below the minimum",
setup{
Expand Down
15 changes: 8 additions & 7 deletions cluster-autoscaler/cloudprovider/mcm/mcm_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ func (m *McmManager) prioritizeMachinesForDeletion(targetMachineRefs []*Ref, mdN
klog.Errorf("Unable to fetch Machine object %s, Error: %v", machineRef.Name, err)
return true, err
}
if isMachineTerminating(mc) {
if isMachineFailedOrTerminating(mc) {
return false, nil
}
expectedToTerminateMachineNodePairs[mc.Name] = mc.Labels["node"]
Expand All @@ -530,8 +530,6 @@ func (m *McmManager) prioritizeMachinesForDeletion(targetMachineRefs []*Ref, mdN
klog.Errorf("could not prioritize machine %s for deletion, aborting scale in of machine deployment, Error: %v", machineRef.Name, err)
return 0, fmt.Errorf("could not prioritize machine %s for deletion, aborting scale in of machine deployment, Error: %v", machineRef.Name, err)
}
// Break out of loop when update succeeds
klog.Infof("Machine %s of machineDeployment %s marked with priority 1 successfully", machineRef.Name, mdName)
}
klog.V(2).Infof("Expected to remove following {machineRef: corresponding node} pairs %s", expectedToTerminateMachineNodePairs)
return len(expectedToTerminateMachineNodePairs), nil
Expand Down Expand Up @@ -560,6 +558,9 @@ func (m *McmManager) updateAnnotationOnMachine(ctx context.Context, mcName strin
clone.Annotations[key] = val
}
_, err = m.machineClient.Machines(machine.Namespace).Update(ctx, clone, metav1.UpdateOptions{})
if err == nil {
klog.Infof("Machine %s marked with priority 1 successfully", mcName)
}
return true, err
}

Expand Down Expand Up @@ -988,10 +989,10 @@ func buildGenericLabels(template *nodeTemplate, nodeName string) map[string]stri
return result
}

// isMachineTerminating returns true if machine is already being terminated or considered for termination by autoscaler.
func isMachineTerminating(machine *v1alpha1.Machine) bool {
if !machine.GetDeletionTimestamp().IsZero() {
klog.Infof("Machine %q is already being terminated, and hence skipping the deletion", machine.Name)
// isMachineFailedOrTerminating returns true if machine is already being terminated or considered for termination by autoscaler.
func isMachineFailedOrTerminating(machine *v1alpha1.Machine) bool {
if !machine.GetDeletionTimestamp().IsZero() || machine.Status.CurrentStatus.Phase == v1alpha1.MachineFailed {
klog.Infof("Machine %q is already being terminated or in a failed phase, and hence skipping the deletion", machine.Name)
return true
}
return false
Expand Down

0 comments on commit 46ee449

Please sign in to comment.