diff --git a/api/v1alpha1/upgradeplan_types.go b/api/v1alpha1/upgradeplan_types.go
index 9202a4c..ff9acf3 100644
--- a/api/v1alpha1/upgradeplan_types.go
+++ b/api/v1alpha1/upgradeplan_types.go
@@ -21,6 +21,8 @@ import (
 )
 
 const (
+	OperatingSystemUpgradedCondition = "OSUpgraded"
+
 	KubernetesUpgradedCondition = "KubernetesUpgraded"
 
 	RancherUpgradedCondition = "RancherUpgraded"
diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml
index 71093fa..3238517 100644
--- a/config/rbac/role.yaml
+++ b/config/rbac/role.yaml
@@ -20,6 +20,7 @@ rules:
   - delete
   - get
   - list
+  - watch
 - apiGroups:
   - batch
   resources:
diff --git a/internal/controller/reconcile_os.go b/internal/controller/reconcile_os.go
index 5155397..245d054 100644
--- a/internal/controller/reconcile_os.go
+++ b/internal/controller/reconcile_os.go
@@ -4,23 +4,85 @@ import (
 	"context"
 	"fmt"
 
+	lifecyclev1alpha1 "github.com/suse-edge/upgrade-controller/api/v1alpha1"
 	"github.com/suse-edge/upgrade-controller/internal/upgrade"
 	"github.com/suse-edge/upgrade-controller/pkg/release"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
 	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
 //lint:ignore U1000 - Temporary ignore "unused" linter error. Will be removed when function is ready to be used.
-func (r *UpgradePlanReconciler) reconcileOS(ctx context.Context, releaseOS *release.OperatingSystem) (ctrl.Result, error) {
-	secret, err := upgrade.OSUpgradeSecret(releaseOS)
+func (r *UpgradePlanReconciler) reconcileOS(ctx context.Context, upgradePlan *lifecyclev1alpha1.UpgradePlan, release *release.Release) (ctrl.Result, error) {
+	secret, err := upgrade.OSUpgradeSecret(&release.Components.OperatingSystem)
 	if err != nil {
 		return ctrl.Result{}, fmt.Errorf("generating OS upgrade secret: %w", err)
 	}
 
-	if err = r.Create(ctx, secret); err != nil {
-		return ctrl.Result{}, fmt.Errorf("creating OS upgrade secret: %w", err)
+	if err = r.Get(ctx, client.ObjectKeyFromObject(secret), secret); err != nil {
+		if !errors.IsNotFound(err) {
+			return ctrl.Result{}, err
+		}
+
+		return ctrl.Result{}, r.createSecret(ctx, upgradePlan, secret)
 	}
 
-	// TODO: OS upgrade logic
+	controlPlanePlan := upgrade.OSControlPlanePlan(release.ReleaseVersion, secret.Name, &release.Components.OperatingSystem)
+	if err = r.Get(ctx, client.ObjectKeyFromObject(controlPlanePlan), controlPlanePlan); err != nil {
+		if !errors.IsNotFound(err) {
+			return ctrl.Result{}, err
+		}
+
+		setInProgressCondition(upgradePlan, lifecyclev1alpha1.OperatingSystemUpgradedCondition, "Control plane nodes are being upgraded")
+		return ctrl.Result{}, r.createPlan(ctx, upgradePlan, controlPlanePlan)
+	}
+
+	selector, err := metav1.LabelSelectorAsSelector(controlPlanePlan.Spec.NodeSelector)
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("parsing node selector: %w", err)
+	}
+
+	nodeList := &corev1.NodeList{}
+	if err := r.List(ctx, nodeList); err != nil {
+		return ctrl.Result{}, fmt.Errorf("listing nodes: %w", err)
+	}
+
+	if !isOSUpgraded(nodeList, selector, release.Components.OperatingSystem.PrettyName) {
+		return ctrl.Result{}, nil
+	} else if controlPlaneOnlyCluster(nodeList) {
+		setSuccessfulCondition(upgradePlan, lifecyclev1alpha1.OperatingSystemUpgradedCondition, "All cluster nodes are upgraded")
+		return ctrl.Result{Requeue: true}, nil
+	}
+
+	// TODO: worker upgrade
 
 	return ctrl.Result{Requeue: true}, nil
 }
+
+func isOSUpgraded(nodeList *corev1.NodeList, selector labels.Selector, osPrettyName string) bool {
+	for _, node := range nodeList.Items {
+		if !selector.Matches(labels.Set(node.Labels)) {
+			continue
+		}
+
+		var nodeReadyStatus corev1.ConditionStatus
+
+		for _, condition := range node.Status.Conditions {
+			if condition.Type == corev1.NodeReady {
+				nodeReadyStatus = condition.Status
+				break
+			}
+		}
+
+		if nodeReadyStatus != corev1.ConditionTrue || node.Spec.Unschedulable || node.Status.NodeInfo.OSImage != osPrettyName {
+			// Upgrade is still in progress.
+			// TODO: Adjust to look at the `Complete` condition of the
+			// `plans.upgrade.cattle.io` resources once system-upgrade-controller v0.13.4 is released.
+			return false
+		}
+	}
+
+	return true
+}
diff --git a/internal/controller/upgradeplan_controller.go b/internal/controller/upgradeplan_controller.go
index 08e457e..fd9233b 100644
--- a/internal/controller/upgradeplan_controller.go
+++ b/internal/controller/upgradeplan_controller.go
@@ -58,7 +58,7 @@ type UpgradePlanReconciler struct {
 // +kubebuilder:rbac:groups=lifecycle.suse.com,resources=upgradeplans/finalizers,verbs=update
 // +kubebuilder:rbac:groups=upgrade.cattle.io,resources=plans,verbs=create;list;get;watch
 // +kubebuilder:rbac:groups="",resources=nodes,verbs=watch;list
-// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;delete;create
+// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;delete;create;watch
 // +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
 // +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch
 // +kubebuilder:rbac:groups=batch,resources=jobs/status,verbs=get
@@ -90,6 +90,7 @@ func (r *UpgradePlanReconciler) executePlan(ctx context.Context, upgradePlan *li
 	}
 
 	if len(upgradePlan.Status.Conditions) == 0 {
+		setPendingCondition(upgradePlan, lifecyclev1alpha1.OperatingSystemUpgradedCondition, "OS upgrade is not yet started")
 		setPendingCondition(upgradePlan, lifecyclev1alpha1.KubernetesUpgradedCondition, "Kubernetes upgrade is not yet started")
 		setPendingCondition(upgradePlan, lifecyclev1alpha1.RancherUpgradedCondition, "Rancher upgrade is not yet started")
 
@@ -97,6 +98,9 @@ func (r *UpgradePlanReconciler) executePlan(ctx context.Context, upgradePlan *li
 	}
 
 	switch {
+	// TODO: uncomment once OS upgrades support multi-node clusters
+	// case !meta.IsStatusConditionTrue(upgradePlan.Status.Conditions, lifecyclev1alpha1.OperatingSystemUpgradedCondition):
+	// 	return r.reconcileOS(ctx, upgradePlan, release)
 	case !meta.IsStatusConditionTrue(upgradePlan.Status.Conditions, lifecyclev1alpha1.KubernetesUpgradedCondition):
 		return r.reconcileKubernetes(ctx, upgradePlan, &release.Components.Kubernetes)
 	case !isHelmUpgradeFinished(upgradePlan, lifecyclev1alpha1.RancherUpgradedCondition):
@@ -109,24 +113,40 @@ func (r *UpgradePlanReconciler) executePlan(ctx context.Context, upgradePlan *li
 	return ctrl.Result{}, nil
 }
 
-func (r *UpgradePlanReconciler) recordCreatedPlan(upgradePlan *lifecyclev1alpha1.UpgradePlan, name, namespace string) {
-	r.Recorder.Eventf(upgradePlan, corev1.EventTypeNormal, "PlanCreated", "Upgrade plan created: %s/%s", namespace, name)
+func (r *UpgradePlanReconciler) createSecret(ctx context.Context, upgradePlan *lifecyclev1alpha1.UpgradePlan, secret *corev1.Secret) error {
+	if err := r.createObject(ctx, upgradePlan, secret); err != nil {
+		return fmt.Errorf("creating secret: %w", err)
+	}
+
+	r.recordCreatedObject(upgradePlan, "SecretCreated", fmt.Sprintf("Secret created: %s/%s", secret.Namespace, secret.Name))
+	return nil
 }
 
 func (r *UpgradePlanReconciler) createPlan(ctx context.Context, upgradePlan *lifecyclev1alpha1.UpgradePlan, plan *upgradecattlev1.Plan) error {
-	if err := ctrl.SetControllerReference(upgradePlan, plan, r.Scheme); err != nil {
-		return fmt.Errorf("setting controller reference: %w", err)
+	if err := r.createObject(ctx, upgradePlan, plan); err != nil {
+		return fmt.Errorf("creating upgrade plan: %w", err)
 	}
 
-	if err := r.Create(ctx, plan); err != nil {
-		return fmt.Errorf("creating upgrade plan: %w", err)
+	r.recordCreatedObject(upgradePlan, "PlanCreated", fmt.Sprintf("Upgrade plan created: %s/%s", plan.Namespace, plan.Name))
+	return nil
+}
+
+func (r *UpgradePlanReconciler) createObject(ctx context.Context, upgradePlan *lifecyclev1alpha1.UpgradePlan, obj client.Object) error {
+	if err := ctrl.SetControllerReference(upgradePlan, obj, r.Scheme); err != nil {
+		return fmt.Errorf("setting controller reference: %w", err)
 	}
 
-	r.recordCreatedPlan(upgradePlan, plan.Name, plan.Namespace)
+	if err := r.Create(ctx, obj); err != nil {
+		return fmt.Errorf("creating object: %w", err)
+	}
 	return nil
 }
 
+func (r *UpgradePlanReconciler) recordCreatedObject(upgradePlan *lifecyclev1alpha1.UpgradePlan, reason, msg string) {
+	r.Recorder.Event(upgradePlan, corev1.EventTypeNormal, reason, msg)
+}
+
 func isHelmUpgradeFinished(plan *lifecyclev1alpha1.UpgradePlan, conditionType string) bool {
 	condition := meta.FindStatusCondition(plan.Status.Conditions, conditionType)
 
@@ -201,6 +221,9 @@ func (r *UpgradePlanReconciler) SetupWithManager(mgr ctrl.Manager) error {
 	return ctrl.NewControllerManagedBy(mgr).
 		For(&lifecyclev1alpha1.UpgradePlan{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
 		Owns(&upgradecattlev1.Plan{}, builder.WithPredicates(predicate.Funcs{
+			CreateFunc: func(e event.CreateEvent) bool {
+				return false
+			},
 			UpdateFunc: func(e event.UpdateEvent) bool {
 				// Upgrade plans are being constantly updated on every node change.
 				// Ensure that the reconciliation only covers the scenarios
@@ -232,5 +255,6 @@ func (r *UpgradePlanReconciler) SetupWithManager(mgr ctrl.Manager) error {
 			return false
 		},
 	})).
+		Owns(&corev1.Secret{}).
 		Complete(r)
 }
diff --git a/internal/upgrade/base.go b/internal/upgrade/base.go
index 66cee69..5a7e0b7 100644
--- a/internal/upgrade/base.go
+++ b/internal/upgrade/base.go
@@ -7,8 +7,10 @@ import (
 )
 
 const (
-	planNamespace  = "cattle-system"
-	PlanAnnotation = "lifecycle.suse.com/upgrade-plan"
+	planNamespace   = "cattle-system"
+	PlanAnnotation  = "lifecycle.suse.com/upgrade-plan"
+	controlPlaneKey = "control-plane"
+	workersKey      = "workers"
 
 	ControlPlaneLabel = "node-role.kubernetes.io/control-plane"
 )
diff --git a/internal/upgrade/kubernetes.go b/internal/upgrade/kubernetes.go
index 4bcde12..1b9c91f 100644
--- a/internal/upgrade/kubernetes.go
+++ b/internal/upgrade/kubernetes.go
@@ -12,9 +12,6 @@ import (
 const (
 	rke2UpgradeImage = "rancher/rke2-upgrade"
 	k3sUpgradeImage  = "rancher/k3s-upgrade"
-
-	controlPlaneKey = "control-plane"
-	workersKey      = "workers"
 )
 
 func kubernetesPlanName(typeKey, version string) string {
diff --git a/internal/upgrade/os.go b/internal/upgrade/os.go
index 36f5878..d3e3522 100644
--- a/internal/upgrade/os.go
+++ b/internal/upgrade/os.go
@@ -4,11 +4,18 @@ import (
 	"bytes"
 	_ "embed"
 	"fmt"
+	"path/filepath"
+	"strings"
 	"text/template"
 
+	upgradecattlev1 "github.com/rancher/system-upgrade-controller/pkg/apis/upgrade.cattle.io/v1"
 	"github.com/suse-edge/upgrade-controller/pkg/release"
 	corev1 "k8s.io/api/core/v1"
-	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+const (
+	scriptName = "os-upgrade.sh"
 )
 
 //go:embed templates/os-upgrade.sh.tpl
@@ -16,7 +23,6 @@ var osUpgradeScript string
 
 func OSUpgradeSecret(releaseOS *release.OperatingSystem) (*corev1.Secret, error) {
 	const (
-		scriptName = "os-upgrade.sh"
 		secretName = "os-upgrade-secret"
 	)
 
@@ -45,7 +51,7 @@ func OSUpgradeSecret(releaseOS *release.OperatingSystem) (*corev1.Secret, error)
 	}
 
 	secret := &corev1.Secret{
-		ObjectMeta: v1.ObjectMeta{
+		ObjectMeta: metav1.ObjectMeta{
 			Name:      secretName,
 			Namespace: planNamespace,
 		},
@@ -57,3 +63,71 @@ func OSUpgradeSecret(releaseOS *release.OperatingSystem) (*corev1.Secret, error)
 
 	return secret, nil
 }
+
+func OSControlPlanePlan(releaseVersion, secretName string, releaseOS *release.OperatingSystem) *upgradecattlev1.Plan {
+	const (
+		planImage = "registry.suse.com/bci/bci-base:15.5"
+	)
+
+	controlPlanePlanName := osPlanName(controlPlaneKey, releaseOS.ZypperID, releaseOS.Version)
+	controlPlanePlan := baseUpgradePlan(controlPlanePlanName)
+	controlPlanePlan.Labels = map[string]string{
+		"os-upgrade": "control-plane",
+	}
+	controlPlanePlan.Spec.Concurrency = 1
+	controlPlanePlan.Spec.NodeSelector = &metav1.LabelSelector{
+		MatchExpressions: []metav1.LabelSelectorRequirement{
+			{
+				Key:      ControlPlaneLabel,
+				Operator: "In",
+				Values: []string{
+					"true",
+				},
+			},
+		},
+	}
+	controlPlanePlan.Spec.Tolerations = []corev1.Toleration{
+		{
+			Key:      "CriticalAddonsOnly",
+			Operator: "Equal",
+			Value:    "true",
+			Effect:   "NoExecute",
+		},
+		{
+			Key:      ControlPlaneLabel,
+			Operator: "Equal",
+			Value:    "",
+			Effect:   "NoSchedule",
+		},
+		{
+			Key:      "node-role.kubernetes.io/etcd",
+			Operator: "Equal",
+			Value:    "",
+			Effect:   "NoExecute",
+		},
+	}
+
+	secretPathRelativeToHost := fmt.Sprintf("/run/system-upgrade/secrets/%s", secretName)
+	mountPath := filepath.Join("/host", secretPathRelativeToHost)
+	controlPlanePlan.Spec.Secrets = []upgradecattlev1.SecretSpec{
+		{
+			Name: secretName,
+			Path: mountPath,
+		},
+	}
+	controlPlanePlan.Spec.Cordon = true
+	controlPlanePlan.Spec.Version = releaseVersion
+
+	controlPlanePlan.Spec.JobActiveDeadlineSecs = 3600
+
+	controlPlanePlan.Spec.Upgrade = &upgradecattlev1.ContainerSpec{
+		Image:   planImage,
+		Command: []string{"chroot", "/host"},
+		Args:    []string{"sh", filepath.Join(secretPathRelativeToHost, scriptName)},
+	}
+	return controlPlanePlan
+}
+
+func osPlanName(typeKey, osName, osVersion string) string {
+	return fmt.Sprintf("%s-%s-%s", typeKey, strings.ToLower(osName), strings.ReplaceAll(osVersion, ".", "-"))
+}
diff --git a/internal/upgrade/templates/os-upgrade.sh.tpl b/internal/upgrade/templates/os-upgrade.sh.tpl
index 14351da..b82cf99 100644
--- a/internal/upgrade/templates/os-upgrade.sh.tpl
+++ b/internal/upgrade/templates/os-upgrade.sh.tpl
@@ -1,44 +1,68 @@
 #!/bin/sh
 
-# Common Platform Enumeration (CPE) comming from the release manifest
-RELEASE_CPE={{.CPEScheme}}
-# Common Platform Enumeration (CPE) that the system is currently running with
-CURRENT_CPE=`cat /etc/os-release | grep -w CPE_NAME | cut -d "=" -f 2 | tr -d '"'`
-
-# Determine whether architecture is supported
-SYSTEM_ARCH=`arch`
-IFS=' ' read -r -a SUPPORTED_ARCH_ARRAY <<< $(echo "{{.SupportedArchs}}" | tr -d '[]')
-
-found=false
-for arch in "${SUPPORTED_ARCH_ARRAY[@]}"; do
-    if [ "${SYSTEM_ARCH}" == ${arch} ]; then
-        found=true
-        break
-    fi
-done
+OS_UPGRADED_PLACEHOLDER_PATH="/etc/os-upgrade-successful"
 
-if [ ${found} == false ]; then
-    echo "Operating system is running an unsupported architecture. System arch: ${SYSTEM_ARCH}. Supported archs: ${SUPPORTED_ARCH_ARRAY[*]}"
-    exit 1
+if [ -f ${OS_UPGRADED_PLACEHOLDER_PATH} ]; then
+    # Due to the nature of how SUC handles OS upgrades,
+    # the OS upgrade pod will be restarted after an OS reboot.
+    # Within the new pod we only need to check whether the upgrade
+    # has been done. This is done by checking for the '/etc/os-upgrade-successful'
+    # file, which will only be present on the system if a successful upgrade
+    # of the OS has taken place.
+    echo "Upgrade has already been done. Exiting..."
+    rm ${OS_UPGRADED_PLACEHOLDER_PATH}
+    exit 0
 fi
 
-# Determine whether this is a package update or a migration
-if [ "${RELEASE_CPE}" == "${CURRENT_CPE}" ]; then
-    # Package update if both CPEs are the same
-    EXEC_START_PRE=""
-    EXEC_START="/usr/sbin/transactional-update cleanup up"
-    SERVICE_NAME="os-pkg-update.service"
-else
-    # Migration if the CPEs are different
-    EXEC_START_PRE="/usr/sbin/transactional-update run rpm --import {{.RepoGPGKey}}"
-    EXEC_START="/usr/sbin/transactional-update --continue run zypper migration --non-interactive --product {{.ZypperID}}/{{.Version}}/${SYSTEM_ARCH} --root /"
-    SERVICE_NAME="os-migration.service"
-fi
+cleanupService(){
+    rm ${1}
+    systemctl daemon-reload
+}
+
+executeUpgrade(){
+    # Common Platform Enumeration (CPE) coming from the release manifest
+    RELEASE_CPE={{.CPEScheme}}
+    # Common Platform Enumeration (CPE) that the system is currently running with
+    CURRENT_CPE=`cat /etc/os-release | grep -w CPE_NAME | cut -d "=" -f 2 | tr -d '"'`
+
+    # Determine whether architecture is supported
+    SYSTEM_ARCH=`arch`
+    IFS=' ' read -r -a SUPPORTED_ARCH_ARRAY <<< $(echo "{{.SupportedArchs}}" | tr -d '[]')
+
+    found=false
+    for arch in "${SUPPORTED_ARCH_ARRAY[@]}"; do
+        if [ "${SYSTEM_ARCH}" == ${arch} ]; then
+            found=true
+            break
+        fi
+    done
+
+    if [ ${found} == false ]; then
+        echo "Operating system is running an unsupported architecture. System arch: ${SYSTEM_ARCH}. Supported archs: ${SUPPORTED_ARCH_ARRAY[*]}"
+        exit 1
+    fi
+
+    # Determine whether this is a package update or a migration
+    if [ "${RELEASE_CPE}" == "${CURRENT_CPE}" ]; then
+        # Package update if both CPEs are the same
+        EXEC_START_PRE=""
+        EXEC_START="/usr/sbin/transactional-update cleanup up"
+        SERVICE_NAME="os-pkg-update.service"
+    else
+        # Migration if the CPEs are different
+        EXEC_START_PRE="/usr/sbin/transactional-update cleanup run rpm --import {{.RepoGPGKey}}"
+        EXEC_START="/usr/sbin/transactional-update --continue run zypper migration --non-interactive --product {{.ZypperID}}/{{.Version}}/${SYSTEM_ARCH} --root /"
+        SERVICE_NAME="os-migration.service"
+    fi
+
+    UPDATE_SERVICE_PATH=/etc/systemd/system/${SERVICE_NAME}
 
-UPDATE_SERVICE_PATH=/etc/systemd/system/${SERVICE_NAME}
+    # Make sure that even after a non-zero exit of the script
+    # we will do a cleanup of the service
+    trap "cleanupService ${UPDATE_SERVICE_PATH}" EXIT
 
-echo "Creating ${SERVICE_NAME}..."
-cat <<EOF > ${UPDATE_SERVICE_PATH}
+    echo "Creating ${SERVICE_NAME}..."
+    cat <<EOF > ${UPDATE_SERVICE_PATH}
 [Unit]
 Description=SUSE Edge Upgrade Service
 ConditionACPower=true
@@ -49,16 +73,34 @@ After=network.target
 Type=oneshot
 ExecStartPre=${EXEC_START_PRE}
 ExecStart=${EXEC_START}
-ExecStartPost=-/bin/bash -c '[ -f /run/reboot-needed ] && shutdown -r +1'
 IOSchedulingClass=best-effort
 IOSchedulingPriority=7
 EOF
 
-echo "Starting ${SERVICE_NAME}..."
-systemctl start ${SERVICE_NAME} &
-tail --pid $! -f cat /var/log/transactional-update.log
+    echo "Starting ${SERVICE_NAME}..."
+    systemctl start ${SERVICE_NAME} &
+
+    BACKGROUND_PROC_PID=$!
+    tail --pid ${BACKGROUND_PROC_PID} -f /var/log/transactional-update.log
+
+    # Wait for the background process to finish; 'wait' propagates its exit code to '$?'
+    wait ${BACKGROUND_PROC_PID}
+
+    # Get the exit code of the background process
+    BACKGROUND_PROC_EXIT=$?
+    if [ ${BACKGROUND_PROC_EXIT} -ne 0 ]; then
+        exit ${BACKGROUND_PROC_EXIT}
+    fi
+
+    # Check if a reboot is needed.
+    # It will only be needed when transactional-update has successfully
+    # applied any package upgrades/updates.
+    if [ -f /run/reboot-needed ]; then
+        # Create a placeholder indicating that the OS upgrade
+        # has finished successfully
+        touch ${OS_UPGRADED_PLACEHOLDER_PATH}
+        /usr/sbin/reboot
+    fi
+}
 
-echo "Cleaning up..."
-# Remove service after it has finished its work
-rm ${UPDATE_SERVICE_PATH}
-systemctl daemon-reload
+executeUpgrade
diff --git a/manifests/release-3.0.1.yaml b/manifests/release-3.0.1.yaml
index 056b799..8346fd0 100644
--- a/manifests/release-3.0.1.yaml
+++ b/manifests/release-3.0.1.yaml
@@ -15,6 +15,7 @@ components:
     zypperID: SL-Micro
     cpeScheme: cpe:/o:suse:sl-micro:6.0
     repoGPGPath: /usr/lib/rpm/gnupg/keys/gpg-pubkey-09d9ea69-645b99ce.asc
+    prettyName: "SUSE Linux Micro 6.0"
     supportedArchs:
     - x86_64
     # - aarch64 TODO: add when we start supporting it
diff --git a/pkg/release/release.go b/pkg/release/release.go
index 7a0b519..e269e3a 100644
--- a/pkg/release/release.go
+++ b/pkg/release/release.go
@@ -27,6 +27,7 @@ type OperatingSystem struct {
 	CPEScheme      string   `yaml:"cpeScheme"`
 	RepoGPGPath    string   `yaml:"repoGPGPath"`
 	SupportedArchs []string `yaml:"supportedArchs"`
+	PrettyName     string   `yaml:"prettyName"`
 }
 
 type HelmChart struct {