From f8b7debf9cdb1df235cb5610b71df2976cb0bedb Mon Sep 17 00:00:00 2001
From: Christian Schlotter
Date: Mon, 30 Sep 2024 21:41:24 +0200
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20machine:=20Introduce=20Deletion=20s?=
 =?UTF-8?q?tatus=20field=20and=20add=20timestamps=20for=20drain=20and=20vo?=
 =?UTF-8?q?lumeDetach=20instead=20of=20using=20the=20condition=20(#11166)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* machine: Introduce Deletion status field and add timestamps for drain and volumeDetach instead of using the condition

* fix tests

* make generate

* review fixes

* fix openapi gen

* review fixes

* fix
---
 api/v1beta1/machine_types.go                   | 22 +++++++++
 api/v1beta1/zz_generated.deepcopy.go           | 28 +++++++++++
 api/v1beta1/zz_generated.openapi.go            | 36 +++++++++++++-
 .../crd/bases/cluster.x-k8s.io_machines.yaml   | 22 +++++++++
 internal/apis/core/v1alpha3/conversion.go      |  1 +
 .../core/v1alpha3/zz_generated.conversion.go   |  1 +
 internal/apis/core/v1alpha4/conversion.go      |  1 +
 .../core/v1alpha4/zz_generated.conversion.go   |  1 +
 .../controllers/machine/machine_controller.go  | 45 ++++++++++-------
 .../machine/machine_controller_test.go         | 48 +++++--------------
 10 files changed, 150 insertions(+), 55 deletions(-)

diff --git a/api/v1beta1/machine_types.go b/api/v1beta1/machine_types.go
index ebe3030d6eb1..53dfabfb4cc9 100644
--- a/api/v1beta1/machine_types.go
+++ b/api/v1beta1/machine_types.go
@@ -230,10 +230,32 @@ type MachineStatus struct {
 	// Conditions defines current service state of the Machine.
 	// +optional
 	Conditions Conditions `json:"conditions,omitempty"`
+
+	// deletion contains information relating to removal of the Machine.
+	// Only present when the Machine has a deletionTimestamp and drain or wait for volume detach started.
+	// +optional
+	Deletion *MachineDeletionStatus `json:"deletion,omitempty"`
 }
 
 // ANCHOR_END: MachineStatus
 
+// MachineDeletionStatus is the deletion state of the Machine.
+type MachineDeletionStatus struct {
+	// nodeDrainStartTime is the time when the drain of the node started and is used to determine
+	// if the NodeDrainTimeout is exceeded.
+	// Only present when the Machine has a deletionTimestamp and draining the node had been started.
+	// +optional
+	NodeDrainStartTime *metav1.Time `json:"nodeDrainStartTime,omitempty"`
+
+	// waitForNodeVolumeDetachStartTime is the time when waiting for volume detachment started
+	// and is used to determine if the NodeVolumeDetachTimeout is exceeded.
+	// Detaching volumes from nodes is usually done by CSI implementations and the current state
+	// is observed from the node's `.Status.VolumesAttached` field.
+	// Only present when the Machine has a deletionTimestamp and waiting for volume detachments had been started.
+	// +optional
+	WaitForNodeVolumeDetachStartTime *metav1.Time `json:"waitForNodeVolumeDetachStartTime,omitempty"`
+}
+
 // SetTypedPhase sets the Phase field to the string representation of MachinePhase.
 func (m *MachineStatus) SetTypedPhase(p MachinePhase) {
 	m.Phase = string(p)
diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go
index bb0ecf062264..4337861cbf45 100644
--- a/api/v1beta1/zz_generated.deepcopy.go
+++ b/api/v1beta1/zz_generated.deepcopy.go
@@ -973,6 +973,29 @@ func (in MachineAddresses) DeepCopy() MachineAddresses {
 	return *out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *MachineDeletionStatus) DeepCopyInto(out *MachineDeletionStatus) {
+	*out = *in
+	if in.NodeDrainStartTime != nil {
+		in, out := &in.NodeDrainStartTime, &out.NodeDrainStartTime
+		*out = (*in).DeepCopy()
+	}
+	if in.WaitForNodeVolumeDetachStartTime != nil {
+		in, out := &in.WaitForNodeVolumeDetachStartTime, &out.WaitForNodeVolumeDetachStartTime
+		*out = (*in).DeepCopy()
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MachineDeletionStatus.
+func (in *MachineDeletionStatus) DeepCopy() *MachineDeletionStatus {
+	if in == nil {
+		return nil
+	}
+	out := new(MachineDeletionStatus)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *MachineDeployment) DeepCopyInto(out *MachineDeployment) {
 	*out = *in
@@ -1912,6 +1935,11 @@ func (in *MachineStatus) DeepCopyInto(out *MachineStatus) {
 			(*in)[i].DeepCopyInto(&(*out)[i])
 		}
 	}
+	if in.Deletion != nil {
+		in, out := &in.Deletion, &out.Deletion
+		*out = new(MachineDeletionStatus)
+		(*in).DeepCopyInto(*out)
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MachineStatus.
diff --git a/api/v1beta1/zz_generated.openapi.go b/api/v1beta1/zz_generated.openapi.go
index 4e1017dec6d6..7a309c8370e1 100644
--- a/api/v1beta1/zz_generated.openapi.go
+++ b/api/v1beta1/zz_generated.openapi.go
@@ -58,6 +58,7 @@ func GetOpenAPIDefinitions(ref common.ReferenceCallback) map[string]common.OpenA
 		"sigs.k8s.io/cluster-api/api/v1beta1.LocalObjectTemplate":                  schema_sigsk8sio_cluster_api_api_v1beta1_LocalObjectTemplate(ref),
 		"sigs.k8s.io/cluster-api/api/v1beta1.Machine":                              schema_sigsk8sio_cluster_api_api_v1beta1_Machine(ref),
 		"sigs.k8s.io/cluster-api/api/v1beta1.MachineAddress":                       schema_sigsk8sio_cluster_api_api_v1beta1_MachineAddress(ref),
+		"sigs.k8s.io/cluster-api/api/v1beta1.MachineDeletionStatus":                schema_sigsk8sio_cluster_api_api_v1beta1_MachineDeletionStatus(ref),
 		"sigs.k8s.io/cluster-api/api/v1beta1.MachineDeployment":                    schema_sigsk8sio_cluster_api_api_v1beta1_MachineDeployment(ref),
 		"sigs.k8s.io/cluster-api/api/v1beta1.MachineDeploymentClass":               schema_sigsk8sio_cluster_api_api_v1beta1_MachineDeploymentClass(ref),
 		"sigs.k8s.io/cluster-api/api/v1beta1.MachineDeploymentClassNamingStrategy": schema_sigsk8sio_cluster_api_api_v1beta1_MachineDeploymentClassNamingStrategy(ref),
@@ -1650,6 +1651,33 @@ func schema_sigsk8sio_cluster_api_api_v1beta1_MachineAddress(ref common.Referenc
 	}
 }
 
+func schema_sigsk8sio_cluster_api_api_v1beta1_MachineDeletionStatus(ref common.ReferenceCallback) common.OpenAPIDefinition {
+	return common.OpenAPIDefinition{
+		Schema: spec.Schema{
+			SchemaProps: spec.SchemaProps{
+				Description: "MachineDeletionStatus is the deletion state of the Machine.",
+				Type:        []string{"object"},
+				Properties: map[string]spec.Schema{
+					"nodeDrainStartTime": {
+						SchemaProps: spec.SchemaProps{
+							Description: "nodeDrainStartTime is the time when the drain of the node started and is used to determine if the NodeDrainTimeout is exceeded. Only present when the Machine has a deletionTimestamp and draining the node had been started.",
+							Ref:         ref("k8s.io/apimachinery/pkg/apis/meta/v1.Time"),
+						},
+					},
+					"waitForNodeVolumeDetachStartTime": {
+						SchemaProps: spec.SchemaProps{
+							Description: "waitForNodeVolumeDetachStartTime is the time when waiting for volume detachment started and is used to determine if the NodeVolumeDetachTimeout is exceeded. Detaching volumes from nodes is usually done by CSI implementations and the current state is observed from the node's `.Status.VolumesAttached` field. Only present when the Machine has a deletionTimestamp and waiting for volume detachments had been started.",
+							Ref:         ref("k8s.io/apimachinery/pkg/apis/meta/v1.Time"),
+						},
+					},
+				},
+			},
+		},
+		Dependencies: []string{
+			"k8s.io/apimachinery/pkg/apis/meta/v1.Time"},
+	}
+}
+
 func schema_sigsk8sio_cluster_api_api_v1beta1_MachineDeployment(ref common.ReferenceCallback) common.OpenAPIDefinition {
 	return common.OpenAPIDefinition{
 		Schema: spec.Schema{
@@ -3353,11 +3381,17 @@ func schema_sigsk8sio_cluster_api_api_v1beta1_MachineStatus(ref common.Reference
 						},
 					},
 				},
+				"deletion": {
+					SchemaProps: spec.SchemaProps{
+						Description: "deletion contains information relating to removal of the Machine. Only present when the Machine has a deletionTimestamp and drain or wait for volume detach started.",
+						Ref:         ref("sigs.k8s.io/cluster-api/api/v1beta1.MachineDeletionStatus"),
+					},
+				},
 			},
 		},
 	},
 	Dependencies: []string{
-		"k8s.io/api/core/v1.NodeSystemInfo", "k8s.io/api/core/v1.ObjectReference", "k8s.io/apimachinery/pkg/apis/meta/v1.Time", "sigs.k8s.io/cluster-api/api/v1beta1.Condition", "sigs.k8s.io/cluster-api/api/v1beta1.MachineAddress"},
+		"k8s.io/api/core/v1.NodeSystemInfo", "k8s.io/api/core/v1.ObjectReference", "k8s.io/apimachinery/pkg/apis/meta/v1.Time", "sigs.k8s.io/cluster-api/api/v1beta1.Condition", "sigs.k8s.io/cluster-api/api/v1beta1.MachineAddress", "sigs.k8s.io/cluster-api/api/v1beta1.MachineDeletionStatus"},
 	}
 }
 
diff --git a/config/crd/bases/cluster.x-k8s.io_machines.yaml b/config/crd/bases/cluster.x-k8s.io_machines.yaml
index 493d6aca6646..cc5aab3633ad 100644
--- a/config/crd/bases/cluster.x-k8s.io_machines.yaml
+++ b/config/crd/bases/cluster.x-k8s.io_machines.yaml
@@ -1113,6 +1113,28 @@ spec:
                 - type
                 type: object
               type: array
+            deletion:
+              description: |-
+                deletion contains information relating to removal of the Machine.
+                Only present when the Machine has a deletionTimestamp and drain or wait for volume detach started.
+              properties:
+                nodeDrainStartTime:
+                  description: |-
+                    nodeDrainStartTime is the time when the drain of the node started and is used to determine
+                    if the NodeDrainTimeout is exceeded.
+                    Only present when the Machine has a deletionTimestamp and draining the node had been started.
+                  format: date-time
+                  type: string
+                waitForNodeVolumeDetachStartTime:
+                  description: |-
+                    waitForNodeVolumeDetachStartTime is the time when waiting for volume detachment started
+                    and is used to determine if the NodeVolumeDetachTimeout is exceeded.
+                    Detaching volumes from nodes is usually done by CSI implementations and the current state
+                    is observed from the node's `.Status.VolumesAttached` field.
+                    Only present when the Machine has a deletionTimestamp and waiting for volume detachments had been started.
+                  format: date-time
+                  type: string
+              type: object
             failureMessage:
               description: |-
                 FailureMessage will be set in the event that there is a terminal problem
diff --git a/internal/apis/core/v1alpha3/conversion.go b/internal/apis/core/v1alpha3/conversion.go
index 405cf123f813..c325b48cee21 100644
--- a/internal/apis/core/v1alpha3/conversion.go
+++ b/internal/apis/core/v1alpha3/conversion.go
@@ -101,6 +101,7 @@ func (src *Machine) ConvertTo(dstRaw conversion.Hub) error {
 	dst.Spec.NodeVolumeDetachTimeout = restored.Spec.NodeVolumeDetachTimeout
 	dst.Status.NodeInfo = restored.Status.NodeInfo
 	dst.Status.CertificatesExpiryDate = restored.Status.CertificatesExpiryDate
+	dst.Status.Deletion = restored.Status.Deletion
 
 	return nil
 }
diff --git a/internal/apis/core/v1alpha3/zz_generated.conversion.go b/internal/apis/core/v1alpha3/zz_generated.conversion.go
index 1d8c7451e31c..ec286c3c42fb 100644
--- a/internal/apis/core/v1alpha3/zz_generated.conversion.go
+++ b/internal/apis/core/v1alpha3/zz_generated.conversion.go
@@ -1241,6 +1241,7 @@ func autoConvert_v1beta1_MachineStatus_To_v1alpha3_MachineStatus(in *v1beta1.Mac
 	out.InfrastructureReady = in.InfrastructureReady
 	out.ObservedGeneration = in.ObservedGeneration
 	out.Conditions = *(*Conditions)(unsafe.Pointer(&in.Conditions))
+	// WARNING: in.Deletion requires manual conversion: does not exist in peer-type
 	return nil
 }
 
diff --git a/internal/apis/core/v1alpha4/conversion.go b/internal/apis/core/v1alpha4/conversion.go
index ba4469e52445..8043349a9678 100644
--- a/internal/apis/core/v1alpha4/conversion.go
+++ b/internal/apis/core/v1alpha4/conversion.go
@@ -190,6 +190,7 @@ func (src *Machine) ConvertTo(dstRaw conversion.Hub) error {
 	dst.Spec.NodeDeletionTimeout = restored.Spec.NodeDeletionTimeout
 	dst.Status.CertificatesExpiryDate = restored.Status.CertificatesExpiryDate
 	dst.Spec.NodeVolumeDetachTimeout = restored.Spec.NodeVolumeDetachTimeout
+	dst.Status.Deletion = restored.Status.Deletion
 
 	return nil
 }
diff --git a/internal/apis/core/v1alpha4/zz_generated.conversion.go b/internal/apis/core/v1alpha4/zz_generated.conversion.go
index e8f640274d79..4b504ce878c8 100644
--- a/internal/apis/core/v1alpha4/zz_generated.conversion.go
+++ b/internal/apis/core/v1alpha4/zz_generated.conversion.go
@@ -1653,6 +1653,7 @@ func autoConvert_v1beta1_MachineStatus_To_v1alpha4_MachineStatus(in *v1beta1.Mac
 	out.InfrastructureReady = in.InfrastructureReady
 	out.ObservedGeneration = in.ObservedGeneration
 	out.Conditions = *(*Conditions)(unsafe.Pointer(&in.Conditions))
+	// WARNING: in.Deletion requires manual conversion: does not exist in peer-type
 	return nil
 }
 
diff --git a/internal/controllers/machine/machine_controller.go b/internal/controllers/machine/machine_controller.go
index 777f4f9053c9..9ae12162d18b 100644
--- a/internal/controllers/machine/machine_controller.go
+++ b/internal/controllers/machine/machine_controller.go
@@ -32,6 +32,7 @@ import (
 	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/tools/record"
 	"k8s.io/klog/v2"
+	"k8s.io/utils/ptr"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/builder"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -388,13 +389,18 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
 			return ctrl.Result{}, err
 		}
 
-		// The DrainingSucceededCondition never exists before the node is drained for the first time,
-		// so its transition time can be used to record the first time draining.
-		// This `if` condition prevents the transition time to be changed more than once.
+		// The DrainingSucceededCondition never exists before the node is drained for the first time.
 		if conditions.Get(m, clusterv1.DrainingSucceededCondition) == nil {
 			conditions.MarkFalse(m, clusterv1.DrainingSucceededCondition, clusterv1.DrainingReason, clusterv1.ConditionSeverityInfo, "Draining the node before deletion")
 		}
 
+		if m.Status.Deletion == nil {
+			m.Status.Deletion = &clusterv1.MachineDeletionStatus{}
+		}
+		if m.Status.Deletion.NodeDrainStartTime == nil {
+			m.Status.Deletion.NodeDrainStartTime = ptr.To(metav1.Now())
+		}
+
 		if err := patchMachine(ctx, patchHelper, m); err != nil {
 			return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
 		}
@@ -418,13 +424,18 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
 	// volumes are detached before proceeding to delete the Node.
 	// In case the node is unreachable, the detachment is skipped.
 	if r.isNodeVolumeDetachingAllowed(m) {
-		// The VolumeDetachSucceededCondition never exists before we wait for volume detachment for the first time,
-		// so its transition time can be used to record the first time we wait for volume detachment.
-		// This `if` condition prevents the transition time to be changed more than once.
+		// The VolumeDetachSucceededCondition never exists before we wait for volume detachment for the first time.
 		if conditions.Get(m, clusterv1.VolumeDetachSucceededCondition) == nil {
 			conditions.MarkFalse(m, clusterv1.VolumeDetachSucceededCondition, clusterv1.WaitingForVolumeDetachReason, clusterv1.ConditionSeverityInfo, "Waiting for node volumes to be detached")
 		}
 
+		if m.Status.Deletion == nil {
+			m.Status.Deletion = &clusterv1.MachineDeletionStatus{}
+		}
+		if m.Status.Deletion.WaitForNodeVolumeDetachStartTime == nil {
+			m.Status.Deletion.WaitForNodeVolumeDetachStartTime = ptr.To(metav1.Now())
+		}
+
 		if ok, err := r.shouldWaitForNodeVolumes(ctx, cluster, m.Status.NodeRef.Name); ok || err != nil {
 			if err != nil {
 				r.recorder.Eventf(m, corev1.EventTypeWarning, "FailedWaitForVolumeDetach", "error waiting for node volumes detaching, Machine's node %q: %v", m.Status.NodeRef.Name, err)
@@ -540,38 +551,36 @@ func (r *Reconciler) isNodeVolumeDetachingAllowed(m *clusterv1.Machine) bool {
 
 func (r *Reconciler) nodeDrainTimeoutExceeded(machine *clusterv1.Machine) bool {
 	// if the NodeDrainTimeout type is not set by user
-	if machine.Spec.NodeDrainTimeout == nil || machine.Spec.NodeDrainTimeout.Seconds() <= 0 {
+	if machine.Status.Deletion == nil || machine.Spec.NodeDrainTimeout == nil || machine.Spec.NodeDrainTimeout.Seconds() <= 0 {
 		return false
 	}
 
-	// if the draining succeeded condition does not exist
-	if conditions.Get(machine, clusterv1.DrainingSucceededCondition) == nil {
+	// if the NodeDrainStartTime does not exist
+	if machine.Status.Deletion.NodeDrainStartTime == nil {
 		return false
 	}
 
 	now := time.Now()
-	firstTimeDrain := conditions.GetLastTransitionTime(machine, clusterv1.DrainingSucceededCondition)
-	diff := now.Sub(firstTimeDrain.Time)
+	diff := now.Sub(machine.Status.Deletion.NodeDrainStartTime.Time)
 	return diff.Seconds() >= machine.Spec.NodeDrainTimeout.Seconds()
 }
 
 // nodeVolumeDetachTimeoutExceeded returns False if either NodeVolumeDetachTimeout is set to nil or <=0 OR
-// VolumeDetachSucceededCondition is not set on the Machine. Otherwise returns true if the timeout is expired
-// since the last transition time of VolumeDetachSucceededCondition.
+// WaitForNodeVolumeDetachStartTime is not set on the Machine. Otherwise returns true if the timeout is expired
+// since the WaitForNodeVolumeDetachStartTime.
 func (r *Reconciler) nodeVolumeDetachTimeoutExceeded(machine *clusterv1.Machine) bool {
 	// if the NodeVolumeDetachTimeout type is not set by user
-	if machine.Spec.NodeVolumeDetachTimeout == nil || machine.Spec.NodeVolumeDetachTimeout.Seconds() <= 0 {
+	if machine.Status.Deletion == nil || machine.Spec.NodeVolumeDetachTimeout == nil || machine.Spec.NodeVolumeDetachTimeout.Seconds() <= 0 {
 		return false
 	}
 
-	// if the volume detaching succeeded condition does not exist
-	if conditions.Get(machine, clusterv1.VolumeDetachSucceededCondition) == nil {
+	// if the WaitForNodeVolumeDetachStartTime does not exist
+	if machine.Status.Deletion.WaitForNodeVolumeDetachStartTime == nil {
 		return false
 	}
 
 	now := time.Now()
-	firstTimeDetach := conditions.GetLastTransitionTime(machine, clusterv1.VolumeDetachSucceededCondition)
-	diff := now.Sub(firstTimeDetach.Time)
+	diff := now.Sub(machine.Status.Deletion.WaitForNodeVolumeDetachStartTime.Time)
 	return diff.Seconds() >= machine.Spec.NodeVolumeDetachTimeout.Seconds()
 }
 
diff --git a/internal/controllers/machine/machine_controller_test.go b/internal/controllers/machine/machine_controller_test.go
index ec72dee7cc50..7ab8096215fa 100644
--- a/internal/controllers/machine/machine_controller_test.go
+++ b/internal/controllers/machine/machine_controller_test.go
@@ -1386,12 +1386,8 @@ func TestIsNodeDrainedAllowed(t *testing.T) {
 				},
 
 				Status: clusterv1.MachineStatus{
-					Conditions: clusterv1.Conditions{
-						{
-							Type:               clusterv1.DrainingSucceededCondition,
-							Status:             corev1.ConditionFalse,
-							LastTransitionTime: metav1.Time{Time: time.Now().Add(-(time.Second * 70)).UTC()},
-						},
+					Deletion: &clusterv1.MachineDeletionStatus{
+						NodeDrainStartTime: &metav1.Time{Time: time.Now().Add(-(time.Second * 70)).UTC()},
 					},
 				},
 			},
@@ -1412,12 +1408,8 @@ func TestIsNodeDrainedAllowed(t *testing.T) {
 					NodeDrainTimeout: &metav1.Duration{Duration: time.Second * 60},
 				},
 				Status: clusterv1.MachineStatus{
-					Conditions: clusterv1.Conditions{
-						{
-							Type:               clusterv1.DrainingSucceededCondition,
-							Status:             corev1.ConditionFalse,
-							LastTransitionTime: metav1.Time{Time: time.Now().Add(-(time.Second * 30)).UTC()},
-						},
+					Deletion: &clusterv1.MachineDeletionStatus{
+						NodeDrainStartTime: &metav1.Time{Time: time.Now().Add(-(time.Second * 30)).UTC()},
 					},
 				},
 			},
@@ -1437,12 +1429,8 @@ func TestIsNodeDrainedAllowed(t *testing.T) {
 					Bootstrap: clusterv1.Bootstrap{DataSecretName: ptr.To("data")},
 				},
 				Status: clusterv1.MachineStatus{
-					Conditions: clusterv1.Conditions{
-						{
-							Type:               clusterv1.DrainingSucceededCondition,
-							Status:             corev1.ConditionFalse,
-							LastTransitionTime: metav1.Time{Time: time.Now().Add(-(time.Second * 1000)).UTC()},
-						},
+					Deletion: &clusterv1.MachineDeletionStatus{
+						NodeDrainStartTime: &metav1.Time{Time: time.Now().Add(-(time.Second * 1000)).UTC()},
 					},
 				},
 			},
@@ -1896,12 +1884,8 @@ func TestIsNodeVolumeDetachingAllowed(t *testing.T) {
 				},
 
 				Status: clusterv1.MachineStatus{
-					Conditions: clusterv1.Conditions{
-						{
-							Type:               clusterv1.VolumeDetachSucceededCondition,
-							Status:             corev1.ConditionFalse,
-							LastTransitionTime: metav1.Time{Time: time.Now().Add(-(time.Second * 60)).UTC()},
-						},
+					Deletion: &clusterv1.MachineDeletionStatus{
+						WaitForNodeVolumeDetachStartTime: &metav1.Time{Time: time.Now().Add(-(time.Second * 60)).UTC()},
 					},
 				},
 			},
@@ -1922,12 +1906,8 @@ func TestIsNodeVolumeDetachingAllowed(t *testing.T) {
 					NodeVolumeDetachTimeout: &metav1.Duration{Duration: time.Second * 60},
 				},
 				Status: clusterv1.MachineStatus{
-					Conditions: clusterv1.Conditions{
-						{
-							Type:               clusterv1.VolumeDetachSucceededCondition,
-							Status:             corev1.ConditionFalse,
-							LastTransitionTime: metav1.Time{Time: time.Now().Add(-(time.Second * 30)).UTC()},
-						},
+					Deletion: &clusterv1.MachineDeletionStatus{
+						WaitForNodeVolumeDetachStartTime: &metav1.Time{Time: time.Now().Add(-(time.Second * 30)).UTC()},
 					},
 				},
 			},
@@ -1947,12 +1927,8 @@ func TestIsNodeVolumeDetachingAllowed(t *testing.T) {
 					Bootstrap: clusterv1.Bootstrap{DataSecretName: ptr.To("data")},
 				},
 				Status: clusterv1.MachineStatus{
-					Conditions: clusterv1.Conditions{
-						{
-							Type:               clusterv1.VolumeDetachSucceededCondition,
-							Status:             corev1.ConditionFalse,
-							LastTransitionTime: metav1.Time{Time: time.Now().Add(-(time.Second * 1000)).UTC()},
-						},
+					Deletion: &clusterv1.MachineDeletionStatus{
+						WaitForNodeVolumeDetachStartTime: &metav1.Time{Time: time.Now().Add(-(time.Second * 1000)).UTC()},
 					},
 				},
 			},
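
Note: the semantics of the new field can be illustrated with a small, self-contained Go sketch. This is not part of the patch; the timeoutExceeded helper below is illustrative, not CAPI API. It populates status.deletion exactly once, the way reconcileDelete does, and applies the same timeout rule as nodeDrainTimeoutExceeded and nodeVolumeDetachTimeoutExceeded:

package main

import (
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/utils/ptr"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

// timeoutExceeded mirrors the shared pattern of nodeDrainTimeoutExceeded and
// nodeVolumeDetachTimeoutExceeded: a nil or non-positive timeout, or a missing
// start timestamp, means the timeout can never be exceeded.
func timeoutExceeded(timeout *metav1.Duration, start *metav1.Time) bool {
	if timeout == nil || timeout.Seconds() <= 0 || start == nil {
		return false
	}
	return time.Since(start.Time).Seconds() >= timeout.Seconds()
}

func main() {
	m := &clusterv1.Machine{
		Spec: clusterv1.MachineSpec{
			NodeDrainTimeout: &metav1.Duration{Duration: 60 * time.Second},
		},
	}

	// On the first reconciliation of a deleting Machine, record the drain
	// start time exactly once, as reconcileDelete does. Here the timestamp is
	// backdated 70s to simulate a drain that has been running for a while.
	if m.Status.Deletion == nil {
		m.Status.Deletion = &clusterv1.MachineDeletionStatus{}
	}
	if m.Status.Deletion.NodeDrainStartTime == nil {
		m.Status.Deletion.NodeDrainStartTime = ptr.To(metav1.NewTime(time.Now().Add(-70 * time.Second)))
	}

	// 70s elapsed >= 60s NodeDrainTimeout, so the controller would stop
	// waiting for the drain and proceed with deletion.
	fmt.Println(timeoutExceeded(m.Spec.NodeDrainTimeout, m.Status.Deletion.NodeDrainStartTime)) // true
}

Because the start timestamps are written at most once and never cleared while the Machine is deleting, the timeout comparison stays anchored to the first drain or volume-detach attempt. The condition-based approach this patch replaces could lose that anchor whenever the DrainingSucceededCondition or VolumeDetachSucceededCondition was re-marked and its lastTransitionTime changed.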