diff --git a/bootstrap/api/v1alpha1/rke2config_types.go b/bootstrap/api/v1alpha1/rke2config_types.go
index edfe2f63..075b29ff 100644
--- a/bootstrap/api/v1alpha1/rke2config_types.go
+++ b/bootstrap/api/v1alpha1/rke2config_types.go
@@ -68,6 +68,14 @@ type RKE2AgentConfig struct {
 	//+optional
 	NodeLabels []string `json:"nodeLabels,omitempty"`
 
+	// NodeAnnotations are annotations applied to created nodes after the bootstrap phase.
+	//
+	// Unfortunately it is not possible to apply annotations via kubelet
+	// using current bootstrap configurations.
+	// Issue: https://github.com/kubernetes/kubernetes/issues/108046
+	//+optional
+	NodeAnnotations map[string]string `json:"nodeAnnotations,omitempty"`
+
 	// NodeTaints Registering kubelet with set of taints.
 	//+optional
 	NodeTaints []string `json:"nodeTaints,omitempty"`
diff --git a/controlplane/api/v1alpha1/condition_consts.go b/controlplane/api/v1alpha1/condition_consts.go
index bd537c83..49d018dd 100644
--- a/controlplane/api/v1alpha1/condition_consts.go
+++ b/controlplane/api/v1alpha1/condition_consts.go
@@ -54,9 +54,16 @@ const (
 	// is up to date. Whe this condition is false, the RKE2ControlPlane is executing a rolling upgrade.
 	MachinesSpecUpToDateCondition clusterv1.ConditionType = "MachinesSpecUpToDate"
 
+	// NodeMetadataUpToDate documents that the metadata of the nodes controlled by the RKE2 machines
+	// is up to date. When this condition is false, the node metadata is not propagated.
+	NodeMetadataUpToDate clusterv1.ConditionType = "NodeMetadataUpToDate"
+
 	// MachineAgentHealthyCondition reports a machine's rke2 agent's operational status.
 	MachineAgentHealthyCondition clusterv1.ConditionType = "AgentHealthy"
 
+	// NodePatchFailedReason (Severity=Error) documents the reason why the Node object could not be patched.
+	NodePatchFailedReason = "NodePatchFailed"
+
 	// PodInspectionFailedReason documents a failure in inspecting the pod status.
 	PodInspectionFailedReason = "PodInspectionFailed"
 
diff --git a/controlplane/internal/controllers/rke2controlplane_controller.go b/controlplane/internal/controllers/rke2controlplane_controller.go
index b68eb4cf..3073a54d 100644
--- a/controlplane/internal/controllers/rke2controlplane_controller.go
+++ b/controlplane/internal/controllers/rke2controlplane_controller.go
@@ -649,7 +649,7 @@ func (r *RKE2ControlPlaneReconciler) reconcileKubeconfig(
 
 // reconcileControlPlaneConditions is responsible of reconciling conditions reporting the status of static pods and
 // the status of the etcd cluster.
-func (r *RKE2ControlPlaneReconciler) reconcileControlPlaneConditions(ctx context.Context, controlPlane *rke2.ControlPlane) (ctrl.Result, error) {
+func (r *RKE2ControlPlaneReconciler) reconcileControlPlaneConditions(ctx context.Context, controlPlane *rke2.ControlPlane) (res ctrl.Result, retErr error) {
 	logger := log.FromContext(ctx)
 
 	readyCPMachines := controlPlane.Machines.Filter(collections.IsReady())
@@ -679,17 +679,32 @@
 	workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(controlPlane.Cluster))
 	if err != nil {
-		logger.Info("Unable to get Workload cluster")
+		logger.Error(err, "Unable to get Workload cluster")
 
 		return ctrl.Result{}, errors.Wrap(err, "cannot get remote client to workload cluster")
 	}
 
+	defer func() {
+		// Always attempt to Patch the Machine conditions after each reconcile.
+		if err := controlPlane.PatchMachines(ctx); err != nil {
+			retErr = kerrors.NewAggregate([]error{retErr, err})
+		}
+	}()
+
+	if err := workloadCluster.InitWorkload(ctx, controlPlane); err != nil {
+		logger.Error(err, "Unable to initialize workload cluster")
+
+		return ctrl.Result{}, err
+	}
+
 	// Update conditions status
-	workloadCluster.UpdateAgentConditions(ctx, controlPlane)
-	workloadCluster.UpdateEtcdConditions(ctx, controlPlane)
+	workloadCluster.UpdateAgentConditions(controlPlane)
+	workloadCluster.UpdateEtcdConditions(controlPlane)
+
+	// Patch node metadata
+	if err := workloadCluster.UpdateNodeMetadata(ctx, controlPlane); err != nil {
+		logger.Error(err, "Unable to update node metadata")
 
-	// Patch machines with the updated conditions.
-	if err := controlPlane.PatchMachines(ctx); err != nil {
 		return ctrl.Result{}, err
 	}
@@ -721,11 +736,11 @@ func (r *RKE2ControlPlaneReconciler) upgradeControlPlane(
 		return ctrl.Result{}, err
 	}
 
-	status, err := workloadCluster.ClusterStatus(ctx)
-	if err != nil {
+	if err := workloadCluster.InitWorkload(ctx, controlPlane); err != nil {
 		return ctrl.Result{}, err
 	}
 
+	status := workloadCluster.ClusterStatus()
 	if status.Nodes <= *rcp.Spec.Replicas {
 		// scaleUp ensures that we don't continue scaling up while waiting for Machines to have NodeRefs
 		return r.scaleUpControlPlane(ctx, cluster, rcp, controlPlane)
diff --git a/pkg/rke2/control_plane.go b/pkg/rke2/control_plane.go
index 1ba97b3d..5cae4a5f 100644
--- a/pkg/rke2/control_plane.go
+++ b/pkg/rke2/control_plane.go
@@ -352,6 +352,7 @@ func (c *ControlPlane) PatchMachines(ctx context.Context) error {
 		if err := helper.Patch(ctx, machine, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
 			controlplanev1.MachineAgentHealthyCondition,
 			controlplanev1.MachineEtcdMemberHealthyCondition,
+			controlplanev1.NodeMetadataUpToDate,
 		}}); err != nil {
 			errList = append(errList, errors.Wrapf(err, "failed to patch machine %s", machine.Name))
 		}
diff --git a/pkg/rke2/management_cluster.go b/pkg/rke2/management_cluster.go
index 1e9177ab..e9f46817 100644
--- a/pkg/rke2/management_cluster.go
+++ b/pkg/rke2/management_cluster.go
@@ -113,7 +113,5 @@ func (m *Management) GetWorkloadCluster(ctx context.Context, clusterKey ctrlclie
 		return nil, &RemoteClusterConnectionError{Name: clusterKey.String(), Err: err}
 	}
 
-	return &Workload{
-		Client: c,
-	}, nil
+	return NewWorkload(c), nil
 }
diff --git a/pkg/rke2/workload_cluster.go b/pkg/rke2/workload_cluster.go
index f2a20e61..944d875c 100644
--- a/pkg/rke2/workload_cluster.go
+++ b/pkg/rke2/workload_cluster.go
@@ -22,16 +22,18 @@ import (
 	"strings"
 
 	"github.com/pkg/errors"
+	corev1 "k8s.io/api/core/v1"
-	apierrors "k8s.io/apimachinery/pkg/api/errors"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	kerrors "k8s.io/apimachinery/pkg/util/errors"
 	"k8s.io/apimachinery/pkg/util/sets"
-	ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client"
 
 	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
 	"sigs.k8s.io/cluster-api/util"
+	"sigs.k8s.io/cluster-api/util/annotations"
 	"sigs.k8s.io/cluster-api/util/collections"
 	"sigs.k8s.io/cluster-api/util/conditions"
+	"sigs.k8s.io/cluster-api/util/patch"
 
 	controlplanev1 "github.com/rancher-sandbox/cluster-api-provider-rke2/controlplane/api/v1alpha1"
 )
@@ -46,9 +48,12 @@ var ErrControlPlaneMinNodes = errors.New("cluster has fewer than 2 control plane
 // WorkloadCluster defines all behaviors necessary to upgrade kubernetes on a workload cluster.
 type WorkloadCluster interface {
 	// Basic health and status checks.
-	ClusterStatus(ctx context.Context) (ClusterStatus, error)
-	UpdateAgentConditions(ctx context.Context, controlPlane *ControlPlane)
-	UpdateEtcdConditions(ctx context.Context, controlPlane *ControlPlane)
+	InitWorkload(ctx context.Context, controlPlane *ControlPlane) error
+	UpdateNodeMetadata(ctx context.Context, controlPlane *ControlPlane) error
+
+	ClusterStatus() ClusterStatus
+	UpdateAgentConditions(controlPlane *ControlPlane)
+	UpdateEtcdConditions(controlPlane *ControlPlane)
 
 	// Upgrade related tasks.
 	// RemoveEtcdMemberForMachine(ctx context.Context, machine *clusterv1.Machine) error
@@ -62,7 +67,53 @@
 
 // Workload defines operations on workload clusters.
 type Workload struct {
-	Client ctrlclient.Client
+	client.Client
+
+	Nodes            map[string]*corev1.Node
+	nodePatchHelpers map[string]*patch.Helper
+}
+
+// NewWorkload creates a new Workload instance.
+func NewWorkload(cl client.Client) *Workload {
+	return &Workload{
+		Client:           cl,
+		Nodes:            map[string]*corev1.Node{},
+		nodePatchHelpers: map[string]*patch.Helper{},
+	}
+}
+
+// InitWorkload prepares the workload cluster for evaluating status conditions.
+func (w *Workload) InitWorkload(ctx context.Context, cp *ControlPlane) error {
+	nodes, err := w.getControlPlaneNodes(ctx)
+	if err != nil {
+		conditions.MarkUnknown(
+			cp.RCP,
+			controlplanev1.ControlPlaneComponentsHealthyCondition,
+			controlplanev1.ControlPlaneComponentsInspectionFailedReason, "Failed to list nodes which are hosting control plane components")
+
+		return err
+	}
+
+	for _, node := range nodes.Items {
+		nodeCopy := node
+		w.Nodes[node.Name] = &nodeCopy
+	}
+
+	for _, node := range w.Nodes {
+		patchHelper, err := patch.NewHelper(node, w.Client)
+		if err != nil {
+			conditions.MarkUnknown(
+				cp.RCP,
+				controlplanev1.ControlPlaneComponentsHealthyCondition,
+				controlplanev1.ControlPlaneComponentsInspectionFailedReason, "Failed to create patch helpers for control plane nodes")
+
+			return errors.Wrapf(err, "failed to create patch helper for node %s", node.Name)
+		}
+
+		w.nodePatchHelpers[node.Name] = patchHelper
+	}
+
+	return nil
 }
 
 // ClusterStatus holds stats information about the cluster.
@@ -79,33 +130,57 @@ func (w *Workload) getControlPlaneNodes(ctx context.Context) (*corev1.NodeList,
 		labelNodeRoleControlPlane: "true",
 	}
 
-	if err := w.Client.List(ctx, nodes, ctrlclient.MatchingLabels(labels)); err != nil {
+	if err := w.Client.List(ctx, nodes, client.MatchingLabels(labels)); err != nil {
 		return nil, err
 	}
 
 	return nodes, nil
 }
 
+// PatchNodes patches the nodes in the workload cluster.
+func (w *Workload) PatchNodes(ctx context.Context, cp *ControlPlane) error {
+	errList := []error{}
+
+	for i := range w.Nodes {
+		node := w.Nodes[i]
+		if helper, ok := w.nodePatchHelpers[node.Name]; ok {
+			if err := helper.Patch(ctx, node); err != nil {
+				conditions.MarkUnknown(
+					cp.Machines[node.Name],
+					controlplanev1.NodeMetadataUpToDate,
+					controlplanev1.NodePatchFailedReason, errors.Wrapf(err, "failed to patch node %s", node.Name).Error())
+
+				errList = append(errList, errors.Wrapf(err, "failed to patch node %s", node.Name))
+			}
+
+			conditions.MarkTrue(
+				cp.Machines[node.Name],
+				controlplanev1.NodeMetadataUpToDate)
+
+			continue
+		}
+
+		errList = append(errList, errors.Errorf("failed to get patch helper for node %s", node.Name))
+	}
+
+	return kerrors.NewAggregate(errList)
+}
+
 // ClusterStatus returns the status of the cluster.
-func (w *Workload) ClusterStatus(ctx context.Context) (ClusterStatus, error) {
+func (w *Workload) ClusterStatus() ClusterStatus {
 	status := ClusterStatus{}
 
 	// count the control plane nodes
-	nodes, err := w.getControlPlaneNodes(ctx)
-	if err != nil {
-		return status, err
-	}
-
-	for _, node := range nodes.Items {
+	for _, node := range w.Nodes {
 		nodeCopy := node
 
 		status.Nodes++
 
-		if util.IsNodeReady(&nodeCopy) {
+		if util.IsNodeReady(nodeCopy) {
 			status.ReadyNodes++
 		}
 	}
 
-	return status, nil
+	return status
 }
 
 func hasProvisioningMachine(machines collections.Machines) bool {
@@ -132,39 +207,21 @@ func nodeHasUnreachableTaint(node corev1.Node) bool {
 
 // UpdateAgentConditions is responsible for updating machine conditions reflecting the status of all the control plane
 // components running in a static pod generated by RKE2. This operation is best effort, in the sense that in case
 // of problems in retrieving the pod status, it sets the condition to Unknown state without returning any error.
-func (w *Workload) UpdateAgentConditions(ctx context.Context, controlPlane *ControlPlane) {
+func (w *Workload) UpdateAgentConditions(controlPlane *ControlPlane) {
 	allMachinePodConditions := []clusterv1.ConditionType{
 		controlplanev1.MachineAgentHealthyCondition,
 	}
 
-	// NOTE: this fun uses control plane nodes from the workload cluster as a source of truth for the current state.
-	controlPlaneNodes, err := w.getControlPlaneNodes(ctx)
-	if err != nil {
-		conditions.MarkUnknown(
-			controlPlane.RCP,
-			controlplanev1.ControlPlaneComponentsHealthyCondition,
-			controlplanev1.ControlPlaneComponentsInspectionFailedReason, "Failed to list nodes which are hosting control plane components")
-
-		return
-	}
-
 	// Update conditions for control plane components hosted as static pods on the nodes.
 	var rcpErrors []string
 
-	for _, node := range controlPlaneNodes.Items {
-		// Search for the machine corresponding to the node.
-		var machine *clusterv1.Machine
-
-		for _, m := range controlPlane.Machines {
-			if m.Status.NodeRef != nil && m.Status.NodeRef.Name == node.Name {
-				machine = m
-
-				break
-			}
-		}
+	for k := range w.Nodes {
+		node := w.Nodes[k]
+
+		// Search for the machine corresponding to the node.
+		machine, found := controlPlane.Machines[node.Name]
 		// If there is no machine corresponding to a node, determine if this is an error or not.
-		if machine == nil {
+		if !found {
 			// If there are machines still provisioning there is the chance that a chance that a node might be linked to a machine soon,
 			// otherwise report the error at RCP level given that there is no machine to report on.
 			if hasProvisioningMachine(controlPlane.Machines) {
@@ -186,7 +243,7 @@
 		}
 
 		// If the node is Unreachable, information about static pods could be stale so set all conditions to unknown.
-		if nodeHasUnreachableTaint(node) {
+		if nodeHasUnreachableTaint(*node) {
 			// NOTE: We are assuming unreachable as a temporary condition, leaving to MHC
 			// the responsibility to determine if the node is unhealthy or not.
 			for _, condition := range allMachinePodConditions {
@@ -195,37 +252,6 @@
 
 			continue
 		}
-
-		targetnode := corev1.Node{}
-		nodeKey := ctrlclient.ObjectKey{
-			Namespace: metav1.NamespaceSystem,
-			Name:      node.Name,
-		}
-
-		if err := w.Client.Get(ctx, nodeKey, &targetnode); err != nil {
-			// If there is an error getting the Pod, do not set any conditions.
-			if apierrors.IsNotFound(err) {
-				conditions.MarkFalse(machine,
-					controlplanev1.MachineAgentHealthyCondition,
-					controlplanev1.PodMissingReason,
-					clusterv1.ConditionSeverityError,
-					"Node %s is missing", nodeKey.Name)
-
-				return
-			}
-
-			conditions.MarkUnknown(machine,
-				controlplanev1.MachineAgentHealthyCondition,
-				controlplanev1.PodInspectionFailedReason, "Failed to get node status")
-
-			return
-		}
-
-		for _, condition := range targetnode.Status.Conditions {
-			if condition.Type == corev1.NodeReady && condition.Status == corev1.ConditionTrue {
-				conditions.MarkTrue(machine, controlplanev1.MachineAgentHealthyCondition)
-			}
-		}
 	}
 
 	// If there are provisioned machines without corresponding nodes, report this as a failing conditions with SeverityError.
@@ -235,20 +261,19 @@
 
 			continue
 		}
 
-		found := false
-
-		for _, node := range controlPlaneNodes.Items {
-			if machine.Status.NodeRef.Name == node.Name {
-				found = true
-
-				break
-			}
-		}
-
+		node, found := w.Nodes[machine.Status.NodeRef.Name]
 		if !found {
 			for _, condition := range allMachinePodConditions {
 				conditions.MarkFalse(machine, condition, controlplanev1.PodFailedReason, clusterv1.ConditionSeverityError, "Missing node")
 			}
+
+			continue
+		}
+
+		for _, condition := range node.Status.Conditions {
+			if condition.Type == corev1.NodeReady && condition.Status == corev1.ConditionTrue {
+				conditions.MarkTrue(machine, controlplanev1.MachineAgentHealthyCondition)
+			}
 		}
 	}
@@ -378,39 +403,17 @@ func aggregateFromMachinesToRCP(input aggregateFromMachinesToRCPInput) {
 
 // UpdateEtcdConditions is responsible for updating machine conditions reflecting the status of all the etcd members.
 // This operation is best effort, in the sense that in case of problems in retrieving member status, it sets
 // the condition to Unknown state without returning any error.
-func (w *Workload) UpdateEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
-	w.updateManagedEtcdConditions(ctx, controlPlane)
+func (w *Workload) UpdateEtcdConditions(controlPlane *ControlPlane) {
+	w.updateManagedEtcdConditions(controlPlane)
 }
 
-func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
+func (w *Workload) updateManagedEtcdConditions(controlPlane *ControlPlane) {
 	// NOTE: This methods uses control plane nodes only to get in contact with etcd but then it relies on etcd
 	// as ultimate source of truth for the list of members and for their health.
-	controlPlaneNodes, err := w.getControlPlaneNodes(ctx)
-	if err != nil {
-		conditions.MarkUnknown(
-			controlPlane.RCP,
-			controlplanev1.EtcdClusterHealthyCondition,
-			controlplanev1.EtcdClusterInspectionFailedReason, "Failed to list nodes which are hosting the etcd members")
-
-		for _, m := range controlPlane.Machines {
-			conditions.MarkUnknown(m,
-				controlplanev1.MachineEtcdMemberHealthyCondition,
-				controlplanev1.EtcdMemberInspectionFailedReason, "Failed to get the node which is hosting the etcd member")
-		}
-
-		return
-	}
-
-	for _, node := range controlPlaneNodes.Items {
-		var machine *clusterv1.Machine
-
-		for _, m := range controlPlane.Machines {
-			if m.Status.NodeRef != nil && m.Status.NodeRef.Name == node.Name {
-				machine = m
-			}
-		}
-
-		if machine == nil {
+	for k := range w.Nodes {
+		node := w.Nodes[k]
+		machine, found := controlPlane.Machines[node.Name]
+		if !found {
 			// If there are machines still provisioning there is the chance that a chance that a node might be linked to a machine soon,
 			// otherwise report the error at RCP level given that there is no machine to report on.
 			if hasProvisioningMachine(controlPlane.Machines) {
@@ -430,3 +433,30 @@
 		conditions.MarkTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition)
 	}
 }
+
+// UpdateNodeMetadata is responsible for populating node metadata after
+// the node is referenced from the machine object.
+func (w *Workload) UpdateNodeMetadata(ctx context.Context, controlPlane *ControlPlane) error {
+	for commonName, rkeConfig := range controlPlane.rke2Configs {
+		node, nodeFound := w.Nodes[commonName]
+		if !nodeFound {
+			conditions.MarkUnknown(
+				controlPlane.Machines[commonName],
+				controlplanev1.NodeMetadataUpToDate,
+				controlplanev1.NodePatchFailedReason, "associated node not found")
+
+			continue
+		} else if name, ok := node.Labels[clusterv1.MachineAnnotation]; !ok || name != commonName {
+			conditions.MarkUnknown(
+				controlPlane.Machines[commonName],
+				controlplanev1.NodeMetadataUpToDate,
+				controlplanev1.NodePatchFailedReason, fmt.Sprintf("node object is missing %s annotation", clusterv1.MachineAnnotation))
+
+			continue
+		}
+
+		annotations.AddAnnotations(node, rkeConfig.Spec.AgentConfig.NodeAnnotations)
+	}
+
+	return w.PatchNodes(ctx, controlPlane)
+}
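
A minimal usage sketch, not part of the patch, showing the order in which the reworked WorkloadCluster API is expected to be driven from a reconcile loop. The pkg/rke2 import path and the standalone wrapper function below are assumptions based on the hunks above.

// Illustrative only: mirrors reconcileControlPlaneConditions, which builds the
// cached node view once, derives conditions from it, propagates node metadata,
// and always patches the Machine conditions on the way out.
package example

import (
	"context"

	kerrors "k8s.io/apimachinery/pkg/util/errors"

	// Assumed import path for the pkg/rke2 package shown in this diff.
	"github.com/rancher-sandbox/cluster-api-provider-rke2/pkg/rke2"
)

func reconcileConditions(ctx context.Context, workload rke2.WorkloadCluster, cp *rke2.ControlPlane) (retErr error) {
	defer func() {
		// Machine conditions are patched even when an earlier step failed.
		if err := cp.PatchMachines(ctx); err != nil {
			retErr = kerrors.NewAggregate([]error{retErr, err})
		}
	}()

	// InitWorkload lists the control plane nodes once and prepares patch helpers;
	// the calls below read that cached state instead of re-listing nodes.
	if err := workload.InitWorkload(ctx, cp); err != nil {
		return err
	}

	workload.UpdateAgentConditions(cp)
	workload.UpdateEtcdConditions(cp)

	// UpdateNodeMetadata copies RKE2Config.Spec.AgentConfig.NodeAnnotations onto
	// the matching nodes and patches them via PatchNodes.
	return workload.UpdateNodeMetadata(ctx, cp)
}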