Skip to content

Commit

Permalink
Bugfix/machinedeployment freeze sync (#264)
Browse files Browse the repository at this point in the history
* Machine deployment freeze state synchronizer

Machine deployment was sometimes found in a partial freeze state where either of the label or status was missing.
This change fixes this by creating a machine deployment freeze state synchronizer

* Added tests for machine deployment freeze state synchronizer

* Removed LastUpdateTimestamp for machine sets

This field/functionality was no longer being used and hence being removed from the MCM code
  • Loading branch information
prashanth26 authored and hardikdr committed May 13, 2019
1 parent b8b56a5 commit f9e82c9
Show file tree
Hide file tree
Showing 4 changed files with 219 additions and 82 deletions.
7 changes: 1 addition & 6 deletions pkg/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ import (
machinelisters "github.com/gardener/machine-controller-manager/pkg/client/listers/machine/v1alpha1"
"github.com/golang/glog"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
coreinformers "k8s.io/client-go/informers/core/v1"
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
corelisters "k8s.io/client-go/listers/core/v1"
Expand Down Expand Up @@ -372,11 +372,6 @@ func NewController(
DeleteFunc: controller.deleteMachineToSafety,
})

machineSetInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: controller.addMachineSetToSafety,
UpdateFunc: controller.updateMachineSetToSafety,
})

return controller, nil
}

Expand Down
1 change: 0 additions & 1 deletion pkg/controller/deployment_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,6 @@ var annotationsToSkip = map[string]bool{
RevisionHistoryAnnotation: true,
DesiredReplicasAnnotation: true,
MaxReplicasAnnotation: true,
LastReplicaUpdate: true,
PreferNoScheduleKey: true,
UnfreezeAnnotation: true,
}
Expand Down
177 changes: 102 additions & 75 deletions pkg/controller/machine_safety.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,10 @@ import (
const (
// OverShootingReplicaCount freeze reason when replica count overshoots
OverShootingReplicaCount = "OverShootingReplicaCount"
// MachineDeploymentStateSync freeze reason when machineDeployment was found with inconsistent state
MachineDeploymentStateSync = "MachineDeploymentStateSync"
// TimeoutOccurred freeze reason when machineSet timeout occurs
TimeoutOccurred = "MachineSetTimeoutOccurred"
// LastReplicaUpdate contains the last timestamp when the
// number of replicas was changed
LastReplicaUpdate = "safety.machine.sapcloud.io/lastreplicaupdate"
// UnfreezeAnnotation indicates the controllers to unfreeze this object
UnfreezeAnnotation = "safety.machine.sapcloud.io/unfreeze"
)
Expand Down Expand Up @@ -74,6 +73,12 @@ func (c *controller) reconcileClusterMachineSafetyOvershooting(key string) error
}
cache.WaitForCacheSync(stopCh, c.machineSetSynced, c.machineDeploymentSynced)

err = c.syncMachineDeploymentFreezeState()
if err != nil {
glog.Errorf("SafetyController: %v", err)
}
cache.WaitForCacheSync(stopCh, c.machineDeploymentSynced)

err = c.unfreezeMachineDeploymentsWithUnfreezeAnnotation()
if err != nil {
glog.Errorf("SafetyController: %v", err)
Expand Down Expand Up @@ -201,7 +206,7 @@ func (c *controller) unfreezeMachineDeploymentsWithUnfreezeAnnotation() error {
if _, exists := machineDeployment.Annotations[UnfreezeAnnotation]; exists {
glog.V(2).Infof("SafetyController: UnFreezing MachineDeployment %q due to setting unfreeze annotation", machineDeployment.Name)

err := c.unfreezeMachineDeployment(machineDeployment)
err := c.unfreezeMachineDeployment(machineDeployment, "UnfreezeAnnotation")
if err != nil {
return err
}
Expand Down Expand Up @@ -257,6 +262,60 @@ func (c *controller) unfreezeMachineSetsWithUnfreezeAnnotation() error {
return nil
}

// syncMachineDeploymentFreezeState syncs freeze labels and conditions to keep it consistent
func (c *controller) syncMachineDeploymentFreezeState() error {
machineDeployments, err := c.machineDeploymentLister.List(labels.Everything())
if err != nil {
glog.Error("SafetyController: Error while trying to LIST machineDeployments - ", err)
return err
}

for _, machineDeployment := range machineDeployments {

machineDeploymentFreezeLabelPresent := (machineDeployment.Labels["freeze"] == "True")
machineDeploymentFrozenConditionPresent := (GetMachineDeploymentCondition(machineDeployment.Status, v1alpha1.MachineDeploymentFrozen) != nil)

machineDeploymentHasFrozenMachineSet := false
machineSets, err := c.getMachineSetsForMachineDeployment(machineDeployment)
if err == nil {
for _, machineSet := range machineSets {
machineSetFreezeLabelPresent := (machineSet.Labels["freeze"] == "True")
machineSetFrozenConditionPresent := (GetCondition(&machineSet.Status, v1alpha1.MachineSetFrozen) != nil)

if machineSetFreezeLabelPresent || machineSetFrozenConditionPresent {
machineDeploymentHasFrozenMachineSet = true
break
}
}
}

if machineDeploymentHasFrozenMachineSet {
// If machineDeployment has atleast one frozen machine set backing it

if !machineDeploymentFreezeLabelPresent || !machineDeploymentFrozenConditionPresent {
// Either the freeze label or freeze condition is not present on the machineDeployment
message := "MachineDeployment State was inconsistent, hence safety controller has fixed this and frozen it"
err := c.freezeMachineDeployment(machineDeployment, MachineDeploymentStateSync, message)
if err != nil {
return err
}
}
} else {
// If machineDeployment has no frozen machine set backing it

if machineDeploymentFreezeLabelPresent || machineDeploymentFrozenConditionPresent {
// Either the freeze label or freeze condition is present present on the machineDeployment
err := c.unfreezeMachineDeployment(machineDeployment, MachineDeploymentStateSync)
if err != nil {
return err
}
}
}
}

return nil
}

// checkAndFreezeORUnfreezeMachineSets freezes/unfreezes machineSets/machineDeployments
// which have much greater than desired number of replicas of machine objects
func (c *controller) checkAndFreezeORUnfreezeMachineSets() error {
Expand Down Expand Up @@ -552,21 +611,6 @@ func (c *controller) checkMachineClass(
}
}

// addMachineSetToSafety enqueues into machineSafetyQueue when a new machineSet is added
func (c *controller) addMachineSetToSafety(obj interface{}) {
machineSet := obj.(*v1alpha1.MachineSet)
c.updateTimeStamp(machineSet)
}

// updateMachineSetToSafety adds update timestamp
func (c *controller) updateMachineSetToSafety(old, new interface{}) {
oldMS := old.(*v1alpha1.MachineSet)
newMS := new.(*v1alpha1.MachineSet)
if oldMS.Spec.Replicas != newMS.Spec.Replicas {
c.updateTimeStamp(newMS)
}
}

// addMachineToSafety enqueues into machineSafetyQueue when a new machine is added
func (c *controller) addMachineToSafety(obj interface{}) {
machine := obj.(*v1alpha1.Machine)
Expand All @@ -579,31 +623,6 @@ func (c *controller) deleteMachineToSafety(obj interface{}) {
c.enqueueMachineSafetyOrphanVMsKey(machine)
}

// updateTimeStamp adds an annotation indicating the last time the number of replicas
// of machineSet was changed
func (c *controller) updateTimeStamp(ms *v1alpha1.MachineSet) {
for {
// Get the latest version of the machineSet so that we can avoid conflicts
ms, err := c.controlMachineClient.MachineSets(ms.Namespace).Get(ms.Name, metav1.GetOptions{})
if err != nil {
// Some error occurred while fetching object from API server
glog.Error(err)
break
}
clone := ms.DeepCopy()
if clone.Annotations == nil {
clone.Annotations = make(map[string]string)
}
clone.Annotations[LastReplicaUpdate] = metav1.Now().Format("2006-01-02 15:04:05 MST")
_, err = c.controlMachineClient.MachineSets(clone.Namespace).Update(clone)
if err == nil {
break
}
// Keep retrying until update goes through
glog.Error("SafetyController: Failure while trying to UPDATE timestamp, Error: ", err)
}
}

// enqueueMachineSafetyOvershootingKey enqueues into machineSafetyOvershootingQueue
func (c *controller) enqueueMachineSafetyOvershootingKey(obj interface{}) {
c.machineSafetyOvershootingQueue.Add("")
Expand Down Expand Up @@ -680,37 +699,10 @@ func (c *controller) freezeMachineSetAndDeployment(machineSet *v1alpha1.MachineS
if len(machineDeployments) >= 1 {
machineDeployment := machineDeployments[0]
if machineDeployment != nil {

// Get the latest version of the machineDeployment so that we can avoid conflicts
machineDeployment, err := c.controlMachineClient.MachineDeployments(machineDeployment.Namespace).Get(machineDeployment.Name, metav1.GetOptions{})
if err != nil {
glog.Errorf("SafetyController: Failed to GET machineDeployment. Error: %s", err)
return err
}

clone := machineDeployment.DeepCopy()
newStatus := clone.Status
mdcond := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentFrozen, v1alpha1.ConditionTrue, reason, message)
SetMachineDeploymentCondition(&newStatus, *mdcond)
clone.Status = newStatus
machineDeployment, err = c.controlMachineClient.MachineDeployments(clone.Namespace).UpdateStatus(clone)
if err != nil {
glog.Errorf("SafetyController: MachineDeployment/status UPDATE failed. Error: %s", err)
return err
}

clone = machineDeployment.DeepCopy()
if clone.Labels == nil {
clone.Labels = make(map[string]string)
}
clone.Labels["freeze"] = "True"
_, err = c.controlMachineClient.MachineDeployments(clone.Namespace).Update(clone)
err := c.freezeMachineDeployment(machineDeployment, reason, message)
if err != nil {
glog.Errorf("SafetyController: MachineDeployment UPDATE failed. Error: %s", err)
return err
}

glog.V(2).Infof("SafetyController: Froze MachineDeployment %q due to overshooting of replicas", machineSet.Name)
}
}

Expand All @@ -726,7 +718,7 @@ func (c *controller) unfreezeMachineSetAndDeployment(machineSet *v1alpha1.Machin
machineDeployments := c.getMachineDeploymentsForMachineSet(machineSet)
if len(machineDeployments) >= 1 {
machineDeployment := machineDeployments[0]
err := c.unfreezeMachineDeployment(machineDeployment)
err := c.unfreezeMachineDeployment(machineDeployment, "UnderShootingReplicaCount")
if err != nil {
return err
}
Expand Down Expand Up @@ -786,8 +778,43 @@ func (c *controller) unfreezeMachineSet(machineSet *v1alpha1.MachineSet) error {
return nil
}

// freezeMachineDeployment freezes the machineDeployment
func (c *controller) freezeMachineDeployment(machineDeployment *v1alpha1.MachineDeployment, reason string, message string) error {
// Get the latest version of the machineDeployment so that we can avoid conflicts
machineDeployment, err := c.controlMachineClient.MachineDeployments(machineDeployment.Namespace).Get(machineDeployment.Name, metav1.GetOptions{})
if err != nil {
glog.Errorf("SafetyController: Failed to GET machineDeployment. Error: %s", err)
return err
}

clone := machineDeployment.DeepCopy()
newStatus := clone.Status
mdcond := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentFrozen, v1alpha1.ConditionTrue, reason, message)
SetMachineDeploymentCondition(&newStatus, *mdcond)
clone.Status = newStatus
machineDeployment, err = c.controlMachineClient.MachineDeployments(clone.Namespace).UpdateStatus(clone)
if err != nil {
glog.Errorf("SafetyController: MachineDeployment/status UPDATE failed. Error: %s", err)
return err
}

clone = machineDeployment.DeepCopy()
if clone.Labels == nil {
clone.Labels = make(map[string]string)
}
clone.Labels["freeze"] = "True"
_, err = c.controlMachineClient.MachineDeployments(clone.Namespace).Update(clone)
if err != nil {
glog.Errorf("SafetyController: MachineDeployment UPDATE failed. Error: %s", err)
return err
}

glog.V(2).Infof("SafetyController: Froze MachineDeployment %q due to %s", machineDeployment.Name, reason)
return nil
}

// unfreezeMachineDeployment unfreezes the machineDeployment
func (c *controller) unfreezeMachineDeployment(machineDeployment *v1alpha1.MachineDeployment) error {
func (c *controller) unfreezeMachineDeployment(machineDeployment *v1alpha1.MachineDeployment, reason string) error {

if machineDeployment == nil {
err := fmt.Errorf("SafetyController: Machine Deployment not passed")
Expand Down Expand Up @@ -828,6 +855,6 @@ func (c *controller) unfreezeMachineDeployment(machineDeployment *v1alpha1.Machi
return err
}

glog.V(2).Infof("SafetyController: Unfroze MachineDeployment %q", machineDeployment.Name)
glog.V(2).Infof("SafetyController: Unfroze MachineDeployment %q due to %s", machineDeployment.Name, reason)
return nil
}
Loading

0 comments on commit f9e82c9

Please sign in to comment.