Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
More feedback
Browse files Browse the repository at this point in the history
fabriziopandini committed Jan 17, 2025
1 parent 911c457 commit dca9560
Showing 6 changed files with 101 additions and 15 deletions.
22 changes: 12 additions & 10 deletions controlplane/kubeadm/internal/control_plane.go
Original file line number Diff line number Diff line change
@@ -392,12 +392,12 @@ func (c *ControlPlane) InjectTestManagementCluster(managementCluster ManagementC

// StatusToLogKeyAndValues returns following key/value pairs describing the overall status of the control plane:
// - machines is the list of KCP machines; each machine might have additional notes surfacing
// - if the machine has been created in the current reconcile (new)
// - if machines node is not yet (node ref not set)
// - if the machine has bee marked for remediation (health check failed)
// - if the machine has been created in the current reconcile
// - if machines node ref is not yet set
// - if the machine has been marked for remediation
// - if there are unhealthy control plane component on the machine
// - if the machine has a deletion timestamp/has been deleted in the current reconcile (deleting)
// - if the machine is not up to date with the KCP spec (not up to date)
// - if the machine has a deletion timestamp/has been deleted in the current reconcile
// - if the machine is not up to date with the KCP spec
//
// - etcdMembers list as reported by etcd.
func (c *ControlPlane) StatusToLogKeyAndValues(newMachine, deletedMachine *clusterv1.Machine) []any {
@@ -418,11 +418,11 @@ func (c *ControlPlane) StatusToLogKeyAndValues(newMachine, deletedMachine *clust
notes := []string{}

if m.Status.NodeRef == nil {
notes = append(notes, "node ref not set")
notes = append(notes, "status.nodeRef not set")
}

if c.MachinesToBeRemediatedByKCP().Has(m) {
notes = append(notes, "health check failed")
notes = append(notes, "marked for remediation")
}

for _, condition := range controlPlaneMachineHealthConditions {
@@ -435,10 +435,12 @@ func (c *ControlPlane) StatusToLogKeyAndValues(newMachine, deletedMachine *clust
}

if !c.UpToDateMachines().Has(m) {
notes = append(notes, "not up to date")
notes = append(notes, "not up-to-date")
}

if !m.DeletionTimestamp.IsZero() || (deletedMachine != nil && m.Name == deletedMachine.Name) {
if deletedMachine != nil && m.Name == deletedMachine.Name {
notes = append(notes, "just deleted")
} else if !m.DeletionTimestamp.IsZero() {
notes = append(notes, "deleting")
}

@@ -450,7 +452,7 @@ func (c *ControlPlane) StatusToLogKeyAndValues(newMachine, deletedMachine *clust
}

if newMachine != nil {
machines = append(machines, fmt.Sprintf("%s (new)", newMachine.Name))
machines = append(machines, fmt.Sprintf("%s (just created)", newMachine.Name))
}
sort.Strings(machines)

72 changes: 72 additions & 0 deletions controlplane/kubeadm/internal/control_plane_test.go
Original file line number Diff line number Diff line change
@@ -17,15 +17,18 @@ limitations under the License.
package internal

import (
"strings"
"testing"

"github.com/google/go-cmp/cmp"
. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"

clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd"
"sigs.k8s.io/cluster-api/util/collections"
"sigs.k8s.io/cluster-api/util/conditions"
)
@@ -252,6 +255,75 @@ func TestHasHealthyMachineStillProvisioning(t *testing.T) {
})
}

func TestStatusToLogKeyAndValues(t *testing.T) {
healthyMachine := &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{Name: "healthy"},
Status: clusterv1.MachineStatus{
NodeRef: &corev1.ObjectReference{Name: "healthy-node"},
Conditions: []clusterv1.Condition{
{Type: controlplanev1.MachineAPIServerPodHealthyCondition, Status: corev1.ConditionTrue},
{Type: controlplanev1.MachineControllerManagerPodHealthyCondition, Status: corev1.ConditionTrue},
{Type: controlplanev1.MachineSchedulerPodHealthyCondition, Status: corev1.ConditionTrue},
{Type: controlplanev1.MachineEtcdPodHealthyCondition, Status: corev1.ConditionTrue},
{Type: controlplanev1.MachineEtcdMemberHealthyCondition, Status: corev1.ConditionTrue},
},
},
}

machineWithoutNode := &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{Name: "without-node"},
Status: clusterv1.MachineStatus{
NodeRef: nil,
Conditions: []clusterv1.Condition{
{Type: controlplanev1.MachineAPIServerPodHealthyCondition, Status: corev1.ConditionUnknown},
{Type: controlplanev1.MachineControllerManagerPodHealthyCondition, Status: corev1.ConditionUnknown},
{Type: controlplanev1.MachineSchedulerPodHealthyCondition, Status: corev1.ConditionUnknown},
{Type: controlplanev1.MachineEtcdPodHealthyCondition, Status: corev1.ConditionUnknown},
{Type: controlplanev1.MachineEtcdMemberHealthyCondition, Status: corev1.ConditionFalse}, // not a real use case, but used to test a code branch.
},
},
}

machineJustCreated := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "just-created"}}

machineJustDeleted := healthyMachine.DeepCopy()
machineJustDeleted.Name = "just-deleted"

machineNotUpToDate := healthyMachine.DeepCopy()
machineNotUpToDate.Name = "not-up-to-date"

machineMarkedForRemediation := healthyMachine.DeepCopy()
machineMarkedForRemediation.Name = "marked-for-remediation"
machineMarkedForRemediation.Status.Conditions = append(machineMarkedForRemediation.Status.Conditions,
clusterv1.Condition{Type: clusterv1.MachineHealthCheckSucceededCondition, Status: corev1.ConditionFalse},
clusterv1.Condition{Type: clusterv1.MachineOwnerRemediatedCondition, Status: corev1.ConditionFalse},
)

g := NewWithT(t)
c := &ControlPlane{
KCP: &controlplanev1.KubeadmControlPlane{},
Machines: collections.FromMachines(healthyMachine, machineWithoutNode, machineJustDeleted, machineNotUpToDate, machineMarkedForRemediation),
machinesNotUptoDate: collections.FromMachines(machineNotUpToDate),
EtcdMembers: []*etcd.Member{{Name: "m1"}, {Name: "m2"}, {Name: "m3"}},
}

got := c.StatusToLogKeyAndValues(machineJustCreated, machineJustDeleted)

g.Expect(got).To((HaveLen(4)))
g.Expect(got[0]).To(Equal("machines"))
machines := strings.Join([]string{
"healthy",
"just-created (just created)",
"just-deleted (just deleted)",
"marked-for-remediation (marked for remediation)",
"not-up-to-date (not up-to-date)",
"without-node (status.nodeRef not set, APIServerPod health unknown, ControllerManagerPod health unknown, SchedulerPod health unknown, EtcdPod health unknown, EtcdMember not healthy)",
}, ", ")
g.Expect(got[1]).To(Equal(machines), cmp.Diff(got[1], machines))
g.Expect(got[2]).To(Equal("etcdMembers"))
g.Expect(got[3]).To(Equal("m1, m2, m3"))
}

type machineOpt func(*clusterv1.Machine)

func failureDomain(controlPlane bool) clusterv1.FailureDomainSpec {
6 changes: 4 additions & 2 deletions controlplane/kubeadm/internal/controllers/controller.go
Original file line number Diff line number Diff line change
@@ -672,11 +672,13 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, con
continue
}

log.WithValues(controlPlane.StatusToLogKeyAndValues(nil, machineToDelete)...).
Info("Deleting Machine (KCP deleted)")
if err := r.Client.Delete(ctx, machineToDelete); err != nil && !apierrors.IsNotFound(err) {
errs = append(errs, errors.Wrapf(err, "failed to delete control plane Machine %s", klog.KObj(machineToDelete)))
}
// Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set.
// Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time).
log.WithValues(controlPlane.StatusToLogKeyAndValues(nil, machineToDelete)...).
Info("Deleting Machine (KubeadmControlPlane deleted)")
}
if len(errs) > 0 {
err := kerrors.NewAggregate(errs)
2 changes: 2 additions & 0 deletions controlplane/kubeadm/internal/controllers/remediation.go
Original file line number Diff line number Diff line change
@@ -311,6 +311,8 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
}

// Surface the operation is in progress.
// Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set.
// Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time).
log.WithValues(controlPlane.StatusToLogKeyAndValues(nil, machineToBeRemediated)...).
Info("Deleting Machine (remediating unhealthy Machine)")
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
2 changes: 2 additions & 0 deletions controlplane/kubeadm/internal/controllers/scale.go
Original file line number Diff line number Diff line change
@@ -158,6 +158,8 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
"Failed to delete control plane Machine %s for cluster %s control plane: %v", machineToDelete.Name, klog.KObj(controlPlane.Cluster), err)
return ctrl.Result{}, err
}
// Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set.
// Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time).
logger.WithValues(controlPlane.StatusToLogKeyAndValues(nil, machineToDelete)...).
Info("Deleting Machine (scale down)", "Machine", klog.KObj(machineToDelete))

12 changes: 9 additions & 3 deletions internal/controllers/machineset/machineset_controller.go
Original file line number Diff line number Diff line change
@@ -427,10 +427,12 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, s *scope) (ctrl.Result
// else delete owned machines.
for _, machine := range machineList {
if machine.DeletionTimestamp.IsZero() {
log.Info("Deleting Machine (MachineSet deleted)", "Machine", klog.KObj(machine))
if err := r.Client.Delete(ctx, machine); err != nil && !apierrors.IsNotFound(err) {
return ctrl.Result{}, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(machine))
}
// Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set.
// Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time).
log.Info("Deleting Machine (MachineSet deleted)", "Machine", klog.KObj(machine))
}
}

@@ -816,13 +818,15 @@ func (r *Reconciler) syncReplicas(ctx context.Context, s *scope) (ctrl.Result, e
for i, machine := range machinesToDelete {
log := log.WithValues("Machine", klog.KObj(machine))
if machine.GetDeletionTimestamp().IsZero() {
log.Info(fmt.Sprintf("Deleting Machine (scale down, deleting %d of %d)", i+1, diff))
if err := r.Client.Delete(ctx, machine); err != nil {
log.Error(err, "Unable to delete Machine")
r.recorder.Eventf(ms, corev1.EventTypeWarning, "FailedDelete", "Failed to delete machine %q: %v", machine.Name, err)
errs = append(errs, err)
continue
}
// Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set.
// Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time).
log.Info(fmt.Sprintf("Deleting Machine (scale down, deleting %d of %d)", i+1, diff))
r.recorder.Eventf(ms, corev1.EventTypeNormal, "SuccessfulDelete", "Deleted machine %q", machine.Name)
} else {
log.Info(fmt.Sprintf("Waiting for Machine to be deleted (scale down, deleting %d of %d)", i+1, diff))
@@ -1492,10 +1496,12 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
}
var errs []error
for _, m := range machinesToRemediate {
log.Info("Deleting Machine (remediating unhealthy Machine)", "Machine", klog.KObj(m))
if err := r.Client.Delete(ctx, m); err != nil && !apierrors.IsNotFound(err) {
errs = append(errs, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(m)))
}
// Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set.
// Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time).
log.Info("Deleting Machine (remediating unhealthy Machine)", "Machine", klog.KObj(m))
}
if len(errs) > 0 {
return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to delete unhealthy Machines")

0 comments on commit dca9560

Please sign in to comment.