Skip to content

Commit

Permalink
Address comments
Browse files Browse the repository at this point in the history
  • Loading branch information
fabriziopandini committed Nov 6, 2024
1 parent 88bcbf0 commit 7803764
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 101 deletions.
18 changes: 13 additions & 5 deletions controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,33 @@ import clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"

// KubeadmControlPlane's Available condition and corresponding reasons that will be used in v1Beta2 API version.
const (
// KubeadmControlPlaneAvailableV1Beta2Condition is True if KubeadmControlPlane not delete, `CertificatesAvailable` is true,
// at least one Kubernetes API server, scheduler and controller manager control plane are healthy,
// KubeadmControlPlaneAvailableV1Beta2Condition is true if the KubeadmControlPlane is not deleted, `CertificatesAvailable` is true,
// at least one Machine has a healthy Kubernetes API server, scheduler and controller manager,
// and etcd has enough operational members to meet quorum requirements.
// More specifically, considering how kubeadm lays out components:
// - Kubernetes API server, scheduler and controller manager health is inferred by the status of
// the corresponding Pods hosted on each machine.
// - In case of managed etcd, a healthy etcd Pod and a healthy etcd member must also exist on the same
// machine as the healthy Kubernetes API server, scheduler and controller manager, otherwise the k8s control
// plane cannot be considered operational (if etcd is not operational on a machine, most likely the API server,
// scheduler and controller manager on the same machine will also be impacted).
// - In case of external etcd, KCP cannot make any assumption on etcd status, so all the etcd checks are skipped.
KubeadmControlPlaneAvailableV1Beta2Condition = clusterv1.AvailableV1Beta2Condition

// KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason documents a failure when inspecting the status of the
// etcd cluster hosted on KubeadmControlPlane controlled machines.
KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason = clusterv1.InspectionFailedV1Beta2Reason

// KubeadmControlPlaneAvailableV1Beta2Reason surfaces when a Deployment is available.
// KubeadmControlPlaneAvailableV1Beta2Reason surfaces when the KubeadmControlPlane is available.
KubeadmControlPlaneAvailableV1Beta2Reason = clusterv1.AvailableV1Beta2Reason

// KubeadmControlPlaneNotAvailableV1Beta2Reason surfaces when a Deployment is not available.
// KubeadmControlPlaneNotAvailableV1Beta2Reason surfaces when the KubeadmControlPlane is not available.
KubeadmControlPlaneNotAvailableV1Beta2Reason = clusterv1.NotAvailableV1Beta2Reason
)

// KubeadmControlPlane's Initialized condition and corresponding reasons that will be used in v1Beta2 API version.
const (
// KubeadmControlPlaneInitializedV1Beta2Condition is True when the control plane is functional enough to accept
// KubeadmControlPlaneInitializedV1Beta2Condition is true when the control plane is functional enough to accept
// requests. This information is usually used as a signal for starting all the provisioning operations that
// depend on a functional API server, but do not require a full HA control plane to exist.
KubeadmControlPlaneInitializedV1Beta2Condition = "Initialized"
Expand Down
4 changes: 3 additions & 1 deletion controlplane/kubeadm/internal/control_plane.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ type ControlPlane struct {

// EtcdMembers is the list of members read while computing reconcileControlPlaneConditions; also additional info below
// comes from the same func.
// NOTE: Those info are computed on what we know, so we can reason about availability eve if with a certain degree of problems in the cluster
// NOTE: This info is computed based on what KCP was able to collect during inspection (e.g. if one etcd member
// is down on a control plane with 3 machines, this info is based on the answers collected from two members only).
// NOTE: This info is specifically designed for computing KCP's Available condition.
EtcdMembers []*etcd.Member
EtcdMembersAgreeOnMemberList bool
EtcdMembersAgreeOnClusterID bool
Expand Down
86 changes: 44 additions & 42 deletions controlplane/kubeadm/internal/controllers/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ func setRemediatingCondition(ctx context.Context, kcp *controlplanev1.KubeadmCon
})
}

func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControlPlane, etcdIsManaged bool, etcdMembers []*etcd.Member, etcdMembersAgreeOnMemberList bool, etcdMembersAgreeOnClusterID bool, etcdMembersAndMachinesAreMatching bool, machines collections.Machines) {
func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControlPlane, etcdIsManaged bool, etcdMembers []*etcd.Member, etcdMembersAgreeOnMemberList, etcdMembersAgreeOnClusterID, etcdMembersAndMachinesAreMatching bool, machines collections.Machines) {
if !kcp.Status.Initialized {
v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
Expand All @@ -435,50 +435,52 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
return
}

if etcdIsManaged && etcdMembers == nil {
v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
Status: metav1.ConditionUnknown,
Reason: controlplanev1.KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason,
Message: "Failed to get etcd members",
})
return
}
if etcdIsManaged {
if etcdMembers == nil {
v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
Status: metav1.ConditionUnknown,
Reason: controlplanev1.KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason,
Message: "Failed to get etcd members",
})
return
}

if etcdIsManaged && !etcdMembersAgreeOnMemberList {
v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
Message: "At least one etcd member reports a list of etcd members different than the list reported by other members",
})
return
}
if !etcdMembersAgreeOnMemberList {
v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
Message: "At least one etcd member reports a list of etcd members different than the list reported by other members",
})
return
}

if etcdIsManaged && !etcdMembersAgreeOnClusterID {
v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
Message: "At least one etcd member reports a cluster ID different than the cluster ID reported by other members",
})
return
}
if !etcdMembersAgreeOnClusterID {
v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
Message: "At least one etcd member reports a cluster ID different than the cluster ID reported by other members",
})
return
}

if etcdIsManaged && !etcdMembersAndMachinesAreMatching {
v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
Message: "The list of etcd members does not match the list of Machines and Nodes",
})
return
if !etcdMembersAndMachinesAreMatching {
v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneAvailableV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: controlplanev1.KubeadmControlPlaneNotAvailableV1Beta2Reason,
Message: "The list of etcd members does not match the list of Machines and Nodes",
})
return
}
}

// Determine control plane availability looking at machines conditions, which at this stage are
// already surfacing status from etcd member and all control plane pods hosted on every machine.
// Note: we intentionally use the number of etcd members for determine the etcd quorum because
// etcd members could not match with machines, e.g. while provisioning a new machine.
// Note: we intentionally use the number of etcd members to determine the etcd quorum because
// etcd members might not match with machines, e.g. while provisioning a new machine.
etcdQuorum := (len(etcdMembers) / 2.0) + 1
k8sControlPlaneHealthy := 0
etcdMembersHealthy := 0
Expand Down Expand Up @@ -537,16 +539,16 @@ func setAvailableCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
if etcdIsManaged && etcdMembersHealthy < etcdQuorum {
switch etcdMembersHealthy {
case 0:
messages = append(messages, fmt.Sprintf("There are no healthy etcd member, at least %d required", etcdQuorum))
messages = append(messages, fmt.Sprintf("There are no healthy etcd member, at least %d required for etcd quorum", etcdQuorum))
case 1:
messages = append(messages, fmt.Sprintf("There is 1 healthy etcd member, at least %d required", etcdQuorum))
messages = append(messages, fmt.Sprintf("There is 1 healthy etcd member, at least %d required for etcd quorum", etcdQuorum))
default:
messages = append(messages, fmt.Sprintf("There are %d healthy etcd members, at least %d required", etcdMembersHealthy, etcdQuorum))
messages = append(messages, fmt.Sprintf("There are %d healthy etcd members, at least %d required for etcd quorum", etcdMembersHealthy, etcdQuorum))
}
}

if k8sControlPlaneHealthy < 1 {
messages = append(messages, "There are no healthy control plane instances, at least 1 required")
messages = append(messages, "There are no Machines with healthy control plane components, at least 1 required")
}

v1beta2conditions.Set(kcp, metav1.Condition{
Expand Down
Loading

0 comments on commit 7803764

Please sign in to comment.