From 831b562d164b4b2db5385c0324f1d9c38c5d657c Mon Sep 17 00:00:00 2001 From: Huaqing Mo Date: Tue, 12 Nov 2024 10:15:57 +0800 Subject: [PATCH] SKS-3001 & SKS-2691: Support inplace-update and cpu and memory expansion (#185) --- api/v1beta1/conditions_consts.go | 20 +- api/v1beta1/elfmachine_types.go | 9 + api/v1beta1/types.go | 5 + api/v1beta1/zz_generated.deepcopy.go | 3 +- ...tructure.cluster.x-k8s.io_elfmachines.yaml | 13 + config/rbac/role.yaml | 26 - controllers/elfmachine_controller.go | 11 +- .../elfmachine_controller_resources.go | 128 ++++- .../elfmachine_controller_resources_test.go | 295 ++++++++-- controllers/elfmachine_controller_test.go | 23 +- controllers/elfmachinetemplate_controller.go | 542 ------------------ .../elfmachinetemplate_controller_test.go | 507 ---------------- controllers/suite_test.go | 16 - main.go | 4 - pkg/hostagent/service.go | 61 +- pkg/hostagent/tasks/restart_kubelet.yaml | 9 + pkg/hostagent/tasks/tasks.go | 5 + pkg/service/util.go | 27 +- pkg/service/vm.go | 14 +- pkg/util/machine/machine.go | 35 -- pkg/util/machine/machine_test.go | 49 -- test/fake/tower.go | 4 +- webhooks/elfmachine_webhook_mutation.go | 4 + webhooks/elfmachine_webhook_mutation_test.go | 4 +- webhooks/elfmachine_webhook_validation.go | 54 +- .../elfmachine_webhook_validation_test.go | 134 ++++- .../elfmachinetemplate_webhook_mutation.go | 4 + ...lfmachinetemplate_webhook_mutation_test.go | 20 +- .../elfmachinetemplate_webhook_validation.go | 32 +- ...machinetemplate_webhook_validation_test.go | 72 ++- 30 files changed, 779 insertions(+), 1351 deletions(-) delete mode 100644 controllers/elfmachinetemplate_controller.go delete mode 100644 controllers/elfmachinetemplate_controller_test.go create mode 100644 pkg/hostagent/tasks/restart_kubelet.yaml diff --git a/api/v1beta1/conditions_consts.go b/api/v1beta1/conditions_consts.go index 2a52625c..fd652600 100644 --- a/api/v1beta1/conditions_consts.go +++ b/api/v1beta1/conditions_consts.go @@ -125,9 +125,6 @@ const ( // ResourcesHotUpdatedCondition documents the status of the hot updating resources of a VM. ResourcesHotUpdatedCondition = "ResourceHotUpdated" - // WaitingForResourcesHotUpdateReason (Severity=Info) documents an ElfMachine waiting for updating resources. - WaitingForResourcesHotUpdateReason = "WaitingForResourcesHotUpdate" - // ExpandingVMDiskReason documents (Severity=Info) ElfMachine currently executing the expand disk operation. ExpandingVMDiskReason = "ExpandingVMDisk" @@ -144,6 +141,23 @@ const ( // detecting an error while adding new disk capacity to root directory; those kind of errors are // usually transient and failed updating are automatically re-tried by the controller. ExpandingRootPartitionFailedReason = "ExpandingRootPartitionFailed" + + // ExpandingVMComputeResourcesReason documents (Severity=Info) ElfMachine currently executing the + // expand resources(CPU/memory) operation. + ExpandingVMComputeResourcesReason = "ExpandingVMComputeResources" + + // ExpandingVMComputeResourcesFailedReason (Severity=Warning) documents an ElfMachine controller detecting + // an error while expanding resources(CPU/memory); those kind of errors are usually transient and + // failed updating are automatically re-tried by the controller. + ExpandingVMComputeResourcesFailedReason = "ExpandingVMComputeResourcesFailed" + + // RestartingKubeletReason documents (Severity=Info) ElfMachine currently executing the restart kubelet operation. + RestartingKubeletReason = "RestartingKubelet" + + // RestartingKubeletFailedReason (Severity=Warning) documents an ElfMachine controller detecting + // an error while restarting kubelet; those kind of errors are usually transient and failed restarting + // are automatically re-tried by the controller. + RestartingKubeletFailedReason = "RestartingKubeletFailed" ) // Conditions and Reasons related to make connections to a Tower. Can currently be used by ElfCluster and ElfMachine diff --git a/api/v1beta1/elfmachine_types.go b/api/v1beta1/elfmachine_types.go index 0a25429e..cfa6a4a1 100644 --- a/api/v1beta1/elfmachine_types.go +++ b/api/v1beta1/elfmachine_types.go @@ -19,6 +19,7 @@ package v1beta1 import ( "time" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" capierrors "sigs.k8s.io/cluster-api/errors" @@ -259,6 +260,14 @@ func (m *ElfMachine) IsHotUpdating() bool { return false } +// IsResourcesUpToDate returns whether the machine resources are up to date. +func (m *ElfMachine) IsResourcesUpToDate() bool { + specMemory := *resource.NewQuantity(m.Spec.MemoryMiB*1024*1024, resource.BinarySI) + return m.Spec.DiskGiB == m.Status.Resources.Disk && + m.Spec.NumCPUs == m.Status.Resources.CPUCores && + specMemory.Equal(m.Status.Resources.Memory) +} + func (m *ElfMachine) SetVMDisconnectionTimestamp(timestamp *metav1.Time) { if m.Annotations == nil { m.Annotations = make(map[string]string) diff --git a/api/v1beta1/types.go b/api/v1beta1/types.go index 039b763b..4a27c4ff 100644 --- a/api/v1beta1/types.go +++ b/api/v1beta1/types.go @@ -18,6 +18,7 @@ package v1beta1 import ( corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" ) // CloneMode is the type of clone operation used to clone a VM from a template. @@ -199,6 +200,10 @@ type GPUStatus struct { // ResourcesStatus records the resources allocated to the virtual machine. type ResourcesStatus struct { Disk int32 `json:"disk,omitempty"` + // CPUCores is the total number of CPU cores allocated for the virtual machine. + CPUCores int32 `json:"cpu,omitempty"` + // Memory is the total number of memory in MiB allocated for the virtual machine. + Memory resource.Quantity `json:"memory,omitempty"` } //+kubebuilder:object:generate=false diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index 7e292e8d..ff9c16f3 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -247,7 +247,7 @@ func (in *ElfMachineStatus) DeepCopyInto(out *ElfMachineStatus) { *out = make([]GPUStatus, len(*in)) copy(*out, *in) } - out.Resources = in.Resources + in.Resources.DeepCopyInto(&out.Resources) if in.FailureReason != nil { in, out := &in.FailureReason, &out.FailureReason *out = new(errors.MachineStatusError) @@ -487,6 +487,7 @@ func (in *NetworkStatus) DeepCopy() *NetworkStatus { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ResourcesStatus) DeepCopyInto(out *ResourcesStatus) { *out = *in + out.Memory = in.Memory.DeepCopy() } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourcesStatus. diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml index 60b98f31..1f9ab988 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml @@ -440,9 +440,22 @@ spec: resources: description: Resources records the resources allocated for the machine. properties: + cpu: + description: CPUCores is the total number of CPU cores allocated + for the virtual machine. + format: int32 + type: integer disk: format: int32 type: integer + memory: + anyOf: + - type: integer + - type: string + description: Memory is the total number of memory in MiB allocated + for the virtual machine. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object taskRef: description: |- diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index f6a7881d..30280427 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -129,29 +129,3 @@ rules: - get - patch - update -- apiGroups: - - infrastructure.cluster.x-k8s.io - resources: - - elfmachinetemplates - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - infrastructure.cluster.x-k8s.io - resources: - - elfmachinetemplates/finalizers - verbs: - - update -- apiGroups: - - infrastructure.cluster.x-k8s.io - resources: - - elfmachinetemplates/status - verbs: - - get - - patch - - update diff --git a/controllers/elfmachine_controller.go b/controllers/elfmachine_controller.go index f8392a11..d455f6cf 100644 --- a/controllers/elfmachine_controller.go +++ b/controllers/elfmachine_controller.go @@ -206,8 +206,8 @@ func (r *ElfMachineReconciler) Reconcile(ctx goctx.Context, req ctrl.Request) (r // always update the readyCondition. conditions.SetSummary(machineCtx.ElfMachine, conditions.WithConditions( - infrav1.VMProvisionedCondition, infrav1.ResourcesHotUpdatedCondition, + infrav1.VMProvisionedCondition, infrav1.TowerAvailableCondition, ), ) @@ -973,7 +973,7 @@ func (r *ElfMachineReconciler) reconcileVMTask(ctx goctx.Context, machineCtx *co machineCtx.ElfMachine.SetVMFirstBootTimestamp(&now) } - if service.IsCloneVMTask(task) || service.IsPowerOnVMTask(task) { + if service.IsCloneVMTask(task) || service.IsPowerOnVMTask(task) || service.IsUpdateVMTask(task) { releaseTicketForCreateVM(machineCtx.ElfMachine.Name) recordElfClusterStorageInsufficient(machineCtx, false) recordElfClusterMemoryInsufficient(machineCtx, false) @@ -1024,7 +1024,12 @@ func (r *ElfMachineReconciler) reconcileVMFailedTask(ctx goctx.Context, machineC case service.IsUpdateVMDiskTask(task, machineCtx.ElfMachine.Name): reason := conditions.GetReason(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition) if reason == infrav1.ExpandingVMDiskReason || reason == infrav1.ExpandingVMDiskFailedReason { - conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskFailedReason, clusterv1.ConditionSeverityInfo, errorMessage) + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskFailedReason, clusterv1.ConditionSeverityWarning, errorMessage) + } + case service.IsUpdateVMTask(task) && conditions.IsFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition): + reason := conditions.GetReason(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition) + if reason == infrav1.ExpandingVMComputeResourcesReason || reason == infrav1.ExpandingVMComputeResourcesFailedReason { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMComputeResourcesFailedReason, clusterv1.ConditionSeverityWarning, errorMessage) } case service.IsPowerOnVMTask(task) || service.IsUpdateVMTask(task) || service.IsVMColdMigrationTask(task): if machineCtx.ElfMachine.RequiresGPUDevices() { diff --git a/controllers/elfmachine_controller_resources.go b/controllers/elfmachine_controller_resources.go index aacb7c25..6901387b 100644 --- a/controllers/elfmachine_controller_resources.go +++ b/controllers/elfmachine_controller_resources.go @@ -22,6 +22,7 @@ import ( "github.com/smartxworks/cloudtower-go-sdk/v2/models" agentv1 "github.com/smartxworks/host-config-agent-api/api/v1alpha1" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" capiremote "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/util/conditions" @@ -32,38 +33,26 @@ import ( "github.com/smartxworks/cluster-api-provider-elf/pkg/context" "github.com/smartxworks/cluster-api-provider-elf/pkg/hostagent" "github.com/smartxworks/cluster-api-provider-elf/pkg/service" - machineutil "github.com/smartxworks/cluster-api-provider-elf/pkg/util/machine" ) func (r *ElfMachineReconciler) reconcileVMResources(ctx goctx.Context, machineCtx *context.MachineContext, vm *models.VM) (bool, error) { - log := ctrl.LoggerFrom(ctx) - - hotUpdatedCondition := conditions.Get(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition) - if hotUpdatedCondition != nil && - hotUpdatedCondition.Reason == infrav1.WaitingForResourcesHotUpdateReason && - hotUpdatedCondition.Message != "" { - log.Info("Waiting for hot updating resources", "message", hotUpdatedCondition.Message) - - return false, nil + if ok, err := r.reconcileVMCPUAndMemory(ctx, machineCtx, vm); err != nil || !ok { + return ok, err } - if ok, err := r.reconcieVMVolume(ctx, machineCtx, vm, infrav1.ResourcesHotUpdatedCondition); err != nil || !ok { + if ok, err := r.restartKubelet(ctx, machineCtx); err != nil || !ok { return ok, err } - // Agent needs to wait for the node exists before it can run and execute commands. - if machineutil.IsUpdatingElfMachineResources(machineCtx.ElfMachine) && - machineCtx.Machine.Status.NodeInfo == nil { - log.Info("Waiting for node exists for host agent expand vm root partition") - - return false, nil + if ok, err := r.reconcieVMVolume(ctx, machineCtx, vm, infrav1.ResourcesHotUpdatedCondition); err != nil || !ok { + return ok, err } if ok, err := r.expandVMRootPartition(ctx, machineCtx); err != nil || !ok { return ok, err } - if machineutil.IsUpdatingElfMachineResources(machineCtx.ElfMachine) { + if conditions.IsFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition) { conditions.MarkTrue(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition) } @@ -148,8 +137,6 @@ func (r *ElfMachineReconciler) resizeVMVolume(ctx goctx.Context, machineCtx *con // expandVMRootPartition adds new disk capacity to root partition. func (r *ElfMachineReconciler) expandVMRootPartition(ctx goctx.Context, machineCtx *context.MachineContext) (bool, error) { - log := ctrl.LoggerFrom(ctx) - reason := conditions.GetReason(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition) if reason == "" { return true, nil @@ -164,35 +151,118 @@ func (r *ElfMachineReconciler) expandVMRootPartition(ctx goctx.Context, machineC conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingRootPartitionReason, clusterv1.ConditionSeverityInfo, "") } + return r.reconcileHostJob(ctx, machineCtx, hostagent.HostAgentJobTypeExpandRootPartition) +} + +// reconcileVMCPUAndMemory ensures that the vm CPU and memory are as expected. +func (r *ElfMachineReconciler) reconcileVMCPUAndMemory(ctx goctx.Context, machineCtx *context.MachineContext, vm *models.VM) (bool, error) { + machineCtx.ElfMachine.Status.Resources.CPUCores = *vm.Vcpu + machineCtx.ElfMachine.Status.Resources.Memory = *resource.NewQuantity(service.ByteToMiB(*vm.Memory)*1024*1024, resource.BinarySI) + + if !(machineCtx.ElfMachine.Spec.NumCPUs > *vm.Vcpu || + machineCtx.ElfMachine.Spec.MemoryMiB > service.ByteToMiB(*vm.Memory)) { + return true, nil + } + + reason := conditions.GetReason(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition) + if reason == "" || + (reason != infrav1.ExpandingVMComputeResourcesReason && reason != infrav1.ExpandingVMComputeResourcesFailedReason) { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMComputeResourcesReason, clusterv1.ConditionSeverityInfo, "") + + // Save the condition first, and then expand the resources capacity. + // This prevents the resources expansion from succeeding but failing to save the + // condition, causing ElfMachine to not record the condition. + return false, nil + } + + log := ctrl.LoggerFrom(ctx) + + if ok := acquireTicketForUpdatingVM(machineCtx.ElfMachine.Name); !ok { + log.V(1).Info(fmt.Sprintf("The VM operation reaches rate limit, skip updating VM %s CPU and memory", machineCtx.ElfMachine.Status.VMRef)) + + return false, nil + } + + withTaskVM, err := machineCtx.VMService.UpdateVM(vm, machineCtx.ElfMachine) + if err != nil { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMComputeResourcesFailedReason, clusterv1.ConditionSeverityWarning, err.Error()) + + return false, errors.Wrapf(err, "failed to trigger update CPU and memory for VM %s", *vm.Name) + } + + machineCtx.ElfMachine.SetTask(*withTaskVM.TaskID) + + log.Info("Waiting for the VM to be updated CPU and memory", "vmRef", machineCtx.ElfMachine.Status.VMRef, "taskRef", machineCtx.ElfMachine.Status.TaskRef) + + return false, nil +} + +func (r *ElfMachineReconciler) restartKubelet(ctx goctx.Context, machineCtx *context.MachineContext) (bool, error) { + reason := conditions.GetReason(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition) + if reason == "" { + return true, nil + } else if reason != infrav1.ExpandingVMComputeResourcesReason && + reason != infrav1.ExpandingVMComputeResourcesFailedReason && + reason != infrav1.RestartingKubeletReason && + reason != infrav1.RestartingKubeletFailedReason { + return true, nil + } + + if reason != infrav1.RestartingKubeletFailedReason { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.RestartingKubeletReason, clusterv1.ConditionSeverityInfo, "") + } + + return r.reconcileHostJob(ctx, machineCtx, hostagent.HostAgentJobTypeRestartKubelet) +} + +func (r *ElfMachineReconciler) reconcileHostJob(ctx goctx.Context, machineCtx *context.MachineContext, jobType hostagent.HostAgentJobType) (bool, error) { + log := ctrl.LoggerFrom(ctx) + failReason := "" + switch jobType { + case hostagent.HostAgentJobTypeExpandRootPartition: + failReason = infrav1.ExpandingRootPartitionFailedReason + case hostagent.HostAgentJobTypeRestartKubelet: + failReason = infrav1.RestartingKubeletFailedReason + } + + // Agent needs to wait for the node exists before it can run and execute commands. + if machineCtx.Machine.Status.NodeInfo == nil { + log.Info("Waiting for node exists for host agent job", "jobType", jobType) + + return false, nil + } + kubeClient, err := capiremote.NewClusterClient(ctx, "", r.Client, client.ObjectKey{Namespace: machineCtx.Cluster.Namespace, Name: machineCtx.Cluster.Name}) if err != nil { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, failReason, clusterv1.ConditionSeverityWarning, "failed to create kubeClient: "+err.Error()) return false, err } - agentJob, err := hostagent.GetHostJob(ctx, kubeClient, machineCtx.ElfMachine.Namespace, hostagent.GetExpandRootPartitionJobName(machineCtx.ElfMachine)) + agentJob, err := hostagent.GetHostJob(ctx, kubeClient, machineCtx.ElfMachine.Namespace, hostagent.GetJobName(machineCtx.ElfMachine, jobType)) if err != nil && !apierrors.IsNotFound(err) { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, failReason, clusterv1.ConditionSeverityWarning, "failed to get HostOperationJob: "+err.Error()) return false, err } if agentJob == nil { - agentJob, err = hostagent.ExpandRootPartition(ctx, kubeClient, machineCtx.ElfMachine) - if err != nil { + agentJob = hostagent.GenerateJob(machineCtx.ElfMachine, jobType) + if err = kubeClient.Create(ctx, agentJob); err != nil { conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingRootPartitionFailedReason, clusterv1.ConditionSeverityInfo, err.Error()) return false, err } - log.Info("Waiting for expanding root partition", "hostAgentJob", agentJob.Name) + log.Info("Waiting for job to complete", "hostAgentJob", agentJob.Name) return false, nil } switch agentJob.Status.Phase { case agentv1.PhaseSucceeded: - log.Info("Expand root partition to root succeeded", "hostAgentJob", agentJob.Name) + log.Info("HostJob succeeded", "hostAgentJob", agentJob.Name) case agentv1.PhaseFailed: - conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingRootPartitionFailedReason, clusterv1.ConditionSeverityWarning, agentJob.Status.FailureMessage) - log.Info("Expand root partition failed, will try again after three minutes", "hostAgentJob", agentJob.Name, "failureMessage", agentJob.Status.FailureMessage) + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, failReason, clusterv1.ConditionSeverityWarning, agentJob.Status.FailureMessage) + log.Info("HostJob failed, will try again after three minutes", "hostAgentJob", agentJob.Name, "failureMessage", agentJob.Status.FailureMessage) lastExecutionTime := agentJob.Status.LastExecutionTime if lastExecutionTime == nil { @@ -201,13 +271,13 @@ func (r *ElfMachineReconciler) expandVMRootPartition(ctx goctx.Context, machineC // Three minutes after the job fails, delete the job and try again. if time.Now().After(lastExecutionTime.Add(3 * time.Minute)) { if err := kubeClient.Delete(ctx, agentJob); err != nil { - return false, errors.Wrapf(err, "failed to delete expand root partition job %s/%s for retry", agentJob.Namespace, agentJob.Name) + return false, errors.Wrapf(err, "failed to delete hostJob %s/%s for retry", agentJob.Namespace, agentJob.Name) } } return false, nil default: - log.Info("Waiting for expanding root partition job done", "hostAgentJob", agentJob.Name, "jobStatus", agentJob.Status.Phase) + log.Info("Waiting for HostJob done", "hostAgentJob", agentJob.Name, "jobStatus", agentJob.Status.Phase) return false, nil } diff --git a/controllers/elfmachine_controller_resources_test.go b/controllers/elfmachine_controller_resources_test.go index 6b502c09..fa04b589 100644 --- a/controllers/elfmachine_controller_resources_test.go +++ b/controllers/elfmachine_controller_resources_test.go @@ -26,6 +26,7 @@ import ( agentv1 "github.com/smartxworks/host-config-agent-api/api/v1alpha1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/klog/v2" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -55,6 +56,9 @@ var _ = Describe("ElfMachineReconciler", func() { mockNewVMService service.NewVMServiceFunc ) + _, err := testEnv.CreateNamespace(goctx.Background(), "sks-system") + Expect(err).NotTo(HaveOccurred()) + BeforeEach(func() { logBuffer = new(bytes.Buffer) klog.SetOutput(logBuffer) @@ -74,37 +78,6 @@ var _ = Describe("ElfMachineReconciler", func() { }) Context("reconcileVMResources", func() { - It("should reconcile when WaitingForResourcesHotUpdateReason is not empty", func() { - conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "xx") - ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) - vm := fake.NewTowerVMFromElfMachine(elfMachine) - reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} - ok, err := reconciler.reconcileVMResources(ctx, machineContext, vm) - Expect(ok).To(BeFalse()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("Waiting for hot updating resources")) - }) - - It("should wait for node exists", func() { - conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") - ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) - vmVolume := fake.NewVMVolume(elfMachine) - vmDisk := fake.NewVMDisk(vmVolume) - vm := fake.NewTowerVMFromElfMachine(elfMachine) - vm.VMDisks = []*models.NestedVMDisk{{ID: vmDisk.ID}} - mockVMService.EXPECT().GetVMDisks([]string{*vmDisk.ID}).Return([]*models.VMDisk{vmDisk}, nil) - mockVMService.EXPECT().GetVMVolume(*vmVolume.ID).Return(vmVolume, nil) - reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} - ok, err := reconciler.reconcileVMResources(ctx, machineContext, vm) - Expect(ok).To(BeFalse()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("Waiting for node exists for host agent expand vm root partition")) - }) - It("should mark ResourcesHotUpdatedCondition to true", func() { agentJob := newExpandRootPartitionJob(elfMachine) Expect(testEnv.CreateAndWait(ctx, agentJob)).NotTo(HaveOccurred()) @@ -127,9 +100,9 @@ var _ = Describe("ElfMachineReconciler", func() { mockVMService.EXPECT().GetVMVolume(*vmVolume.ID).Return(vmVolume, nil) reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} ok, err := reconciler.reconcileVMResources(ctx, machineContext, vm) - Expect(ok).To(BeTrue()) - Expect(err).NotTo(HaveOccurred()) expectConditions(elfMachine, []conditionAssertion{{conditionType: infrav1.ResourcesHotUpdatedCondition, status: corev1.ConditionTrue}}) + Expect(err).NotTo(HaveOccurred()) + Expect(ok).To(BeTrue()) }) }) @@ -219,17 +192,29 @@ var _ = Describe("ElfMachineReconciler", func() { fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) vmVolume := fake.NewVMVolume(elfMachine) - mockVMService.EXPECT().ResizeVMVolume(*vmVolume.ID, int64(10)).Return(nil, unexpectedError) + task := fake.NewTowerTask("") + withTaskVMVolume := fake.NewWithTaskVMVolume(vmVolume, task) + mockVMService.EXPECT().ResizeVMVolume(*vmVolume.ID, int64(10)).Return(withTaskVMVolume, nil) reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} err := reconciler.resizeVMVolume(ctx, machineContext, vmVolume, 10, infrav1.ResourcesHotUpdatedCondition) + Expect(err).ToNot(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for the vm volume")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingVMDiskReason}}) + + mockVMService.EXPECT().ResizeVMVolume(*vmVolume.ID, int64(10)).Return(nil, unexpectedError) + ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext = newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler = &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + err = reconciler.resizeVMVolume(ctx, machineContext, vmVolume, 10, infrav1.ResourcesHotUpdatedCondition) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("failed to trigger expand size from")) + Expect(elfMachine.Status.TaskRef).To(Equal(*withTaskVMVolume.TaskID)) expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingVMDiskFailedReason}}) - task := fake.NewTowerTask("") - withTaskVMVolume := fake.NewWithTaskVMVolume(vmVolume, task) + task = fake.NewTowerTask("") + withTaskVMVolume = fake.NewWithTaskVMVolume(vmVolume, task) mockVMService.EXPECT().ResizeVMVolume(*vmVolume.ID, int64(10)).Return(withTaskVMVolume, nil) - conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) machineContext = newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) @@ -238,7 +223,7 @@ var _ = Describe("ElfMachineReconciler", func() { Expect(err).NotTo(HaveOccurred()) Expect(logBuffer.String()).To(ContainSubstring("Waiting for the vm volume")) Expect(elfMachine.Status.TaskRef).To(Equal(*withTaskVMVolume.TaskID)) - expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingVMDiskReason}}) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingVMDiskFailedReason}}) }) }) @@ -261,7 +246,22 @@ var _ = Describe("ElfMachineReconciler", func() { expectConditions(elfMachine, []conditionAssertion{}) }) + It("should wait for node exists", func() { + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.expandVMRootPartition(ctx, machineContext) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for node exists for host agent job")) + Expect(logBuffer.String()).To(ContainSubstring("expand-root-partition")) + }) + It("should create agent job to expand root partition", func() { + machine.Status.NodeInfo = &corev1.NodeSystemInfo{} conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) @@ -271,7 +271,8 @@ var _ = Describe("ElfMachineReconciler", func() { ok, err := reconciler.expandVMRootPartition(ctx, machineContext) Expect(ok).To(BeFalse()) Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("Waiting for expanding root partition")) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for job to complete")) + Expect(logBuffer.String()).To(ContainSubstring("expand-root-partition")) expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingRootPartitionReason}}) var agentJob *agentv1.HostOperationJob Eventually(func() error { @@ -283,6 +284,7 @@ var _ = Describe("ElfMachineReconciler", func() { }) It("should retry when job failed", func() { + machine.Status.NodeInfo = &corev1.NodeSystemInfo{} conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") agentJob := newExpandRootPartitionJob(elfMachine) Expect(testEnv.CreateAndWait(ctx, agentJob)).NotTo(HaveOccurred()) @@ -293,7 +295,7 @@ var _ = Describe("ElfMachineReconciler", func() { ok, err := reconciler.expandVMRootPartition(ctx, machineContext) Expect(ok).To(BeFalse()) Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("Waiting for expanding root partition job done")) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for HostJob done")) expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingRootPartitionReason}}) logBuffer.Reset() @@ -304,7 +306,7 @@ var _ = Describe("ElfMachineReconciler", func() { ok, err = reconciler.expandVMRootPartition(ctx, machineContext) Expect(ok).To(BeFalse()) Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("Expand root partition failed, will try again")) + Expect(logBuffer.String()).To(ContainSubstring("HostJob failed, will try again")) expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingRootPartitionFailedReason}}) Expect(testEnv.Get(ctx, client.ObjectKey{Namespace: agentJob.Namespace, Name: agentJob.Name}, agentJob)).NotTo(HaveOccurred()) @@ -322,6 +324,7 @@ var _ = Describe("ElfMachineReconciler", func() { }) It("should record job succeeded", func() { + machine.Status.NodeInfo = &corev1.NodeSystemInfo{} conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") agentJob := newExpandRootPartitionJob(elfMachine) Expect(testEnv.CreateAndWait(ctx, agentJob)).NotTo(HaveOccurred()) @@ -336,10 +339,199 @@ var _ = Describe("ElfMachineReconciler", func() { ok, err := reconciler.expandVMRootPartition(ctx, machineContext) Expect(ok).To(BeTrue()) Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("Expand root partition to root succeeded")) + Expect(logBuffer.String()).To(ContainSubstring("HostJob succeeded")) expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingRootPartitionReason}}) }) }) + + Context("restartKubelet", func() { + BeforeEach(func() { + var err error + kubeConfigSecret, err = helpers.NewKubeConfigSecret(testEnv, cluster.Namespace, cluster.Name) + Expect(err).ShouldNot(HaveOccurred()) + }) + + It("should not restart kubelet without restartKubelet", func() { + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.restartKubelet(ctx, machineContext) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) + expectConditions(elfMachine, []conditionAssertion{}) + }) + + It("should wait for node exists", func() { + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMComputeResourcesReason, clusterv1.ConditionSeverityInfo, "") + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.restartKubelet(ctx, machineContext) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for node exists for host agent job")) + Expect(logBuffer.String()).To(ContainSubstring("restart-kubelet")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.RestartingKubeletReason}}) + }) + + It("should create agent job to restart kubelet", func() { + machine.Status.NodeInfo = &corev1.NodeSystemInfo{} + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMComputeResourcesReason, clusterv1.ConditionSeverityInfo, "") + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.restartKubelet(ctx, machineContext) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for job to complete")) + Expect(logBuffer.String()).To(ContainSubstring("restart-kubelet")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.RestartingKubeletReason}}) + var agentJob *agentv1.HostOperationJob + Eventually(func() error { + var err error + agentJob, err = hostagent.GetHostJob(ctx, testEnv.Client, elfMachine.Namespace, hostagent.GetRestartKubeletJobName(elfMachine)) + return err + }, timeout).Should(BeNil()) + Expect(agentJob.Name).To(Equal(hostagent.GetRestartKubeletJobName(elfMachine))) + }) + + It("should retry when job failed", func() { + machine.Status.NodeInfo = &corev1.NodeSystemInfo{} + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMComputeResourcesReason, clusterv1.ConditionSeverityInfo, "") + agentJob := newRestartKubelet(elfMachine) + Expect(testEnv.CreateAndWait(ctx, agentJob)).NotTo(HaveOccurred()) + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.restartKubelet(ctx, machineContext) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for HostJob done")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.RestartingKubeletReason}}) + + logBuffer.Reset() + Expect(testEnv.Get(ctx, client.ObjectKey{Namespace: agentJob.Namespace, Name: agentJob.Name}, agentJob)).NotTo(HaveOccurred()) + agentJobPatchSource := agentJob.DeepCopy() + agentJob.Status.Phase = agentv1.PhaseFailed + Expect(testEnv.PatchAndWait(ctx, agentJob, agentJobPatchSource)).To(Succeed()) + ok, err = reconciler.restartKubelet(ctx, machineContext) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("HostJob failed, will try again after three minutes")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.RestartingKubeletFailedReason}}) + + Expect(testEnv.Get(ctx, client.ObjectKey{Namespace: agentJob.Namespace, Name: agentJob.Name}, agentJob)).NotTo(HaveOccurred()) + agentJobPatchSource = agentJob.DeepCopy() + agentJob.Status.LastExecutionTime = &metav1.Time{Time: time.Now().Add(-3 * time.Minute).UTC()} + Expect(testEnv.PatchAndWait(ctx, agentJob, agentJobPatchSource)).To(Succeed()) + ok, err = reconciler.restartKubelet(ctx, machineContext) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + + Eventually(func() bool { + err := testEnv.Get(ctx, client.ObjectKey{Namespace: agentJob.Namespace, Name: agentJob.Name}, agentJob) + return apierrors.IsNotFound(err) + }, timeout).Should(BeTrue()) + }) + + It("should record job succeeded", func() { + machine.Status.NodeInfo = &corev1.NodeSystemInfo{} + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMComputeResourcesReason, clusterv1.ConditionSeverityInfo, "") + agentJob := newRestartKubelet(elfMachine) + Expect(testEnv.CreateAndWait(ctx, agentJob)).NotTo(HaveOccurred()) + Expect(testEnv.Get(ctx, client.ObjectKey{Namespace: agentJob.Namespace, Name: agentJob.Name}, agentJob)).NotTo(HaveOccurred()) + agentJobPatchSource := agentJob.DeepCopy() + agentJob.Status.Phase = agentv1.PhaseSucceeded + Expect(testEnv.PatchAndWait(ctx, agentJob, agentJobPatchSource)).To(Succeed()) + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.restartKubelet(ctx, machineContext) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("HostJob succeeded")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.RestartingKubeletReason}}) + }) + }) + + Context("reconcileVMCPUAndMemory", func() { + It("should not reconcile when numCPUs or memory is excepted", func() { + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineCtx := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + vm := fake.NewTowerVMFromElfMachine(elfMachine) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.reconcileVMCPUAndMemory(ctx, machineCtx, vm) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) + Expect(elfMachine.Status.Resources.CPUCores).To(Equal(*vm.Vcpu)) + specMemory := *resource.NewQuantity(service.ByteToMiB(*vm.Memory)*1024*1024, resource.BinarySI) + Expect(elfMachine.Status.Resources.Memory.Equal(specMemory)).To(BeTrue()) + }) + + It("should save the conditionType first", func() { + vm := fake.NewTowerVMFromElfMachine(elfMachine) + elfMachine.Spec.NumCPUs += 1 + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineCtx := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.reconcileVMCPUAndMemory(ctx, machineCtx, vm) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingVMComputeResourcesReason}}) + }) + + It("should wait task done", func() { + vm := fake.NewTowerVMFromElfMachine(elfMachine) + elfMachine.Spec.MemoryMiB += 1 + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMComputeResourcesReason, clusterv1.ConditionSeverityInfo, "") + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineCtx := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + task := fake.NewTowerTask("") + withTaskVM := fake.NewWithTaskVM(vm, task) + mockVMService.EXPECT().UpdateVM(vm, elfMachine).Return(withTaskVM, nil) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.reconcileVMCPUAndMemory(ctx, machineCtx, vm) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for the VM to be updated CPU and memory")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingVMComputeResourcesReason}}) + + logBuffer.Reset() + inMemoryCache.Flush() + mockVMService.EXPECT().UpdateVM(vm, elfMachine).Return(nil, unexpectedError) + reconciler = &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err = reconciler.reconcileVMCPUAndMemory(ctx, machineCtx, vm) + Expect(ok).To(BeFalse()) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to trigger update CPU and memory for VM")) + Expect(elfMachine.Status.TaskRef).To(Equal(*task.ID)) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingVMComputeResourcesFailedReason}}) + + logBuffer.Reset() + inMemoryCache.Flush() + task = fake.NewTowerTask("") + withTaskVM = fake.NewWithTaskVM(vm, task) + mockVMService.EXPECT().UpdateVM(vm, elfMachine).Return(withTaskVM, nil) + reconciler = &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err = reconciler.reconcileVMCPUAndMemory(ctx, machineCtx, vm) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for the VM to be updated CPU and memory")) + Expect(elfMachine.Status.TaskRef).To(Equal(*task.ID)) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingVMComputeResourcesFailedReason}}) + }) + }) }) func newExpandRootPartitionJob(elfMachine *infrav1.ElfMachine) *agentv1.HostOperationJob { @@ -360,3 +552,22 @@ func newExpandRootPartitionJob(elfMachine *infrav1.ElfMachine) *agentv1.HostOper }, } } + +func newRestartKubelet(elfMachine *infrav1.ElfMachine) *agentv1.HostOperationJob { + return &agentv1.HostOperationJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: hostagent.GetRestartKubeletJobName(elfMachine), + Namespace: "default", + }, + Spec: agentv1.HostOperationJobSpec{ + NodeName: elfMachine.Name, + Operation: agentv1.Operation{ + Ansible: &agentv1.Ansible{ + LocalPlaybookText: &agentv1.YAMLText{ + Inline: tasks.RestartKubeletTask, + }, + }, + }, + }, + } +} diff --git a/controllers/elfmachine_controller_test.go b/controllers/elfmachine_controller_test.go index 1a9d49ae..8915d31a 100644 --- a/controllers/elfmachine_controller_test.go +++ b/controllers/elfmachine_controller_test.go @@ -3418,11 +3418,30 @@ var _ = Describe("ElfMachineReconciler", func() { ok, err = reconciler.reconcileVMTask(ctx, machineContext, nil) Expect(ok).Should(BeTrue()) Expect(err).ShouldNot(HaveOccurred()) - expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingVMDiskFailedReason}}) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingVMDiskFailedReason}}) + + // Edit VM CPU/Memory + task.Status = models.NewTaskStatus(models.TaskStatusFAILED) + task.Description = service.TowerString("Edit VM") + task.ErrorMessage = service.TowerString(service.VMDuplicateError) + elfMachine.Status.TaskRef = *task.ID + ok, err = reconciler.reconcileVMTask(ctx, machineContext, nil) + Expect(ok).Should(BeTrue()) + Expect(err).ShouldNot(HaveOccurred()) + expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.TaskFailureReason}}) + + elfMachine.Status.TaskRef = *task.ID + elfMachine.Status.Conditions = nil + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMComputeResourcesReason, clusterv1.ConditionSeverityInfo, "") + ok, err = reconciler.reconcileVMTask(ctx, machineContext, nil) + Expect(ok).Should(BeTrue()) + Expect(err).ShouldNot(HaveOccurred()) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingVMComputeResourcesFailedReason}}) // GPU gpuDeviceInfo := &service.GPUDeviceInfo{ID: "gpu", AllocatedCount: 0, AvailableCount: 1} gpuDeviceInfos := []*service.GPUDeviceInfo{gpuDeviceInfo} + elfMachine.Status.Conditions = nil tests := []struct { description string @@ -3447,7 +3466,7 @@ var _ = Describe("ElfMachineReconciler", func() { ok, err = reconciler.reconcileVMTask(ctx, machineContext, nil) Expect(ok).Should(BeTrue()) Expect(err).ShouldNot(HaveOccurred()) - Expect(getGPUDevicesLockedByVM(elfCluster.Spec.Cluster, elfMachine.Name)).To(BeNil()) + Expect(getGPUDevicesLockedByVM(elfCluster.Spec.Cluster, elfMachine.Name)).To(BeNil(), tc.description) } }) }) diff --git a/controllers/elfmachinetemplate_controller.go b/controllers/elfmachinetemplate_controller.go deleted file mode 100644 index 0723baf2..00000000 --- a/controllers/elfmachinetemplate_controller.go +++ /dev/null @@ -1,542 +0,0 @@ -/* -Copyright 2024. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package controllers - -import ( - goctx "context" - "fmt" - - "github.com/pkg/errors" - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - apitypes "k8s.io/apimachinery/pkg/types" - kerrors "k8s.io/apimachinery/pkg/util/errors" - "k8s.io/klog/v2" - clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" - controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" - capiutil "sigs.k8s.io/cluster-api/util" - "sigs.k8s.io/cluster-api/util/annotations" - "sigs.k8s.io/cluster-api/util/collections" - "sigs.k8s.io/cluster-api/util/conditions" - "sigs.k8s.io/cluster-api/util/patch" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller" - ctrlmgr "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/reconcile" - - infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" - "github.com/smartxworks/cluster-api-provider-elf/pkg/config" - "github.com/smartxworks/cluster-api-provider-elf/pkg/context" - kcputil "github.com/smartxworks/cluster-api-provider-elf/pkg/util/kcp" - machineutil "github.com/smartxworks/cluster-api-provider-elf/pkg/util/machine" - mdutil "github.com/smartxworks/cluster-api-provider-elf/pkg/util/md" -) - -const ( - anotherMachineHotUpdateInProgressMessage = "another machine resources hot updating is in progress" -) - -// ElfMachineTemplateReconciler reconciles a ElfMachineTemplate object. -type ElfMachineTemplateReconciler struct { - *context.ControllerManagerContext -} - -//+kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=elfmachinetemplates,verbs=get;list;watch;create;update;patch;delete -//+kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=elfmachinetemplates/status,verbs=get;update;patch -//+kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=elfmachinetemplates/finalizers,verbs=update - -// AddMachineTemplateControllerToManager adds the ElfMachineTemplate controller to the provided -// manager. -func AddMachineTemplateControllerToManager(ctx goctx.Context, ctrlMgrCtx *context.ControllerManagerContext, mgr ctrlmgr.Manager, options controller.Options) error { - var ( - controlledType = &infrav1.ElfMachineTemplate{} - ) - - reconciler := &ElfMachineTemplateReconciler{ - ControllerManagerContext: ctrlMgrCtx, - } - - return ctrl.NewControllerManagedBy(mgr). - // Watch the controlled, infrastructure resource. - For(controlledType). - WithOptions(options). - // WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(ctrl.LoggerFrom(ctx), emtCtx.WatchFilterValue)). - Complete(reconciler) -} - -func (r *ElfMachineTemplateReconciler) Reconcile(ctx goctx.Context, req ctrl.Request) (_ ctrl.Result, reterr error) { - log := ctrl.LoggerFrom(ctx) - - // Get the ElfMachineTemplate resource for this request. - var elfMachineTemplate infrav1.ElfMachineTemplate - if err := r.Client.Get(ctx, req.NamespacedName, &elfMachineTemplate); err != nil { - if apierrors.IsNotFound(err) { - log.Info("ElfMachineTemplate not found, won't reconcile", "key", req.NamespacedName) - - return reconcile.Result{}, nil - } - - return reconcile.Result{}, err - } - - // Fetch the CAPI Cluster. - cluster, err := capiutil.GetOwnerCluster(ctx, r.Client, elfMachineTemplate.ObjectMeta) - if err != nil { - return reconcile.Result{}, err - } - if cluster == nil { - log.Info("Waiting for Cluster Controller to set OwnerRef on ElfMachineTemplate") - - return reconcile.Result{}, nil - } - log = log.WithValues("Cluster", klog.KObj(cluster)) - ctx = ctrl.LoggerInto(ctx, log) - - if annotations.IsPaused(cluster, &elfMachineTemplate) { - log.V(4).Info("ElfMachineTemplate linked to a cluster that is paused") - - return reconcile.Result{}, nil - } - - // Fetch the ElfCluster - var elfCluster infrav1.ElfCluster - if err := r.Client.Get(ctx, client.ObjectKey{ - Namespace: cluster.Namespace, - Name: cluster.Spec.InfrastructureRef.Name, - }, &elfCluster); err != nil { - if apierrors.IsNotFound(err) { - log.Info("ElfMachineTemplate Waiting for ElfCluster") - return reconcile.Result{}, nil - } - - return reconcile.Result{}, err - } - log = log.WithValues("ElfCluster", klog.KObj(cluster)) - ctx = ctrl.LoggerInto(ctx, log) - - // Create the machine context for this request. - emtCtx := &context.MachineTemplateContext{ - Cluster: cluster, - ElfCluster: &elfCluster, - ElfMachineTemplate: &elfMachineTemplate, - } - - // Handle deleted machines - if !elfMachineTemplate.ObjectMeta.DeletionTimestamp.IsZero() { - return ctrl.Result{}, nil - } - - // Handle non-deleted machines - return r.reconcileMachineResources(ctx, emtCtx) -} - -// reconcileMachineResources ensures that the resources(disk capacity) of the -// virtual machines are the same as expected by ElfMachine. -// TODO: CPU and memory will be supported in the future. -func (r *ElfMachineTemplateReconciler) reconcileMachineResources(ctx goctx.Context, emtCtx *context.MachineTemplateContext) (reconcile.Result, error) { - // The disk size is 0, it means the disk size is the same as the virtual machine template. - // So if the capacity is 0, it means that the disk size has not changed and returns directly. - if emtCtx.ElfMachineTemplate.Spec.Template.Spec.DiskGiB == 0 { - return reconcile.Result{}, nil - } - - if ok, err := r.reconcileCPResources(ctx, emtCtx); err != nil { - return reconcile.Result{}, err - } else if !ok { - return reconcile.Result{RequeueAfter: config.DefaultRequeueTimeout}, nil - } - - if ok, err := r.reconcileWorkerResources(ctx, emtCtx); err != nil { - return reconcile.Result{}, err - } else if !ok { - return reconcile.Result{RequeueAfter: config.DefaultRequeueTimeout}, nil - } - - return reconcile.Result{}, nil -} - -// reconcileCPResources ensures that the resources(disk capacity) of the -// control plane virtual machines are the same as expected by ElfMachine. -func (r *ElfMachineTemplateReconciler) reconcileCPResources(ctx goctx.Context, emtCtx *context.MachineTemplateContext) (bool, error) { - log := ctrl.LoggerFrom(ctx) - - var kcp controlplanev1.KubeadmControlPlane - if err := r.Client.Get(ctx, apitypes.NamespacedName{ - Namespace: emtCtx.Cluster.Spec.ControlPlaneRef.Namespace, - Name: emtCtx.Cluster.Spec.ControlPlaneRef.Name, - }, &kcp); err != nil { - return false, err - } - - if kcp.Spec.MachineTemplate.InfrastructureRef.Namespace != emtCtx.ElfMachineTemplate.Namespace || - kcp.Spec.MachineTemplate.InfrastructureRef.Name != emtCtx.ElfMachineTemplate.Name { - return true, nil - } - - elfMachines, err := machineutil.GetControlPlaneElfMachinesInCluster(ctx, r.Client, emtCtx.Cluster.Namespace, emtCtx.Cluster.Name) - if err != nil { - return false, err - } - - updatingResourcesElfMachines, needUpdatedResourcesElfMachines, err := r.selectResourcesNotUpToDateElfMachines(ctx, emtCtx.ElfMachineTemplate, elfMachines) - if err != nil { - return false, err - } else if len(updatingResourcesElfMachines) == 0 && len(needUpdatedResourcesElfMachines) == 0 { - log.V(4).Info(fmt.Sprintf("ElfMachines resources of kcp %s are up to date", klog.KObj(&kcp))) - - return true, nil - } - - // Only one CP ElfMachine is allowed to update resources at the same time. - if len(updatingResourcesElfMachines) > 0 { - log.V(1).Info("Waiting for control plane ElfMachines to be updated resources", "updatingCount", len(updatingResourcesElfMachines), "needUpdatedCount", len(needUpdatedResourcesElfMachines)) - - if err := r.markElfMachinesResourcesNotUpToDate(ctx, emtCtx.ElfMachineTemplate, needUpdatedResourcesElfMachines); err != nil { - return false, err - } - - return false, nil - } - - checksPassed, err := r.preflightChecksForCP(ctx, emtCtx, &kcp) - if err != nil { - return false, err - } - - var toBeUpdatedElfMachine *infrav1.ElfMachine - if checksPassed { - toBeUpdatedElfMachine = needUpdatedResourcesElfMachines[0] - needUpdatedResourcesElfMachines = needUpdatedResourcesElfMachines[1:] - } - - if err := r.markElfMachinesResourcesNotUpToDate(ctx, emtCtx.ElfMachineTemplate, needUpdatedResourcesElfMachines); err != nil { - return false, err - } - - updatingCount := 0 - if toBeUpdatedElfMachine != nil { - updatingCount = 1 - if err := r.markElfMachinesToBeUpdatedResources(ctx, emtCtx.ElfMachineTemplate, []*infrav1.ElfMachine{toBeUpdatedElfMachine}); err != nil { - return false, err - } - } - - log.V(1).Info("Waiting for control plane ElfMachines to be updated resources", "updatingCount", updatingCount, "needUpdatedCount", len(needUpdatedResourcesElfMachines)) - - return false, err -} - -// preflightChecksForCP checks if the control plane is stable before proceeding with a updating resources operation, -// where stable means that: -// - KCP not in rolling update. -// - There are no machine deletion in progress. -// - All the health conditions on KCP are true. -// - All the health conditions on the control plane machines are true. -// If the control plane is not passing preflight checks, it requeue. -func (r *ElfMachineTemplateReconciler) preflightChecksForCP(ctx goctx.Context, emtCtx *context.MachineTemplateContext, kcp *controlplanev1.KubeadmControlPlane) (bool, error) { - log := ctrl.LoggerFrom(ctx) - // During the rolling update process, it is impossible to determine which - // machines are new and which are old machines. Complete the rolling update - // first and then update the resources to avoid updating resources for old - // machines that are about to be deleted. - if kcputil.IsKCPInRollingUpdate(kcp) { - log.Info("KCP rolling update in progress, skip updating resources") - - return false, nil - } - - cpMachines, err := machineutil.GetControlPlaneMachinesForCluster(ctx, r.Client, emtCtx.Cluster) - if err != nil { - return false, err - } - - machines := collections.FromMachines(cpMachines...) - deletingMachines := machines.Filter(collections.HasDeletionTimestamp) - if len(deletingMachines) > 0 { - log.Info("Waiting for machines to be deleted", "machines", deletingMachines.Names()) - - return false, nil - } - - allMachineHealthConditions := []clusterv1.ConditionType{ - controlplanev1.MachineAPIServerPodHealthyCondition, - controlplanev1.MachineControllerManagerPodHealthyCondition, - controlplanev1.MachineSchedulerPodHealthyCondition, - controlplanev1.MachineEtcdPodHealthyCondition, - controlplanev1.MachineEtcdMemberHealthyCondition, - } - machineErrors := []error{} - for _, machine := range machines { - if machine.Status.NodeRef == nil { - // The conditions will only ever be set on a Machine if we're able to correlate a Machine to a Node. - // Correlating Machines to Nodes requires the nodeRef to be set. - // Instead of confusing users with errors about that the conditions are not set, let's point them - // towards the unset nodeRef (which is the root cause of the conditions not being there). - machineErrors = append(machineErrors, errors.Errorf("Machine %s does not have a corresponding Node yet (Machine.status.nodeRef not set)", machine.Name)) - } else { - for _, condition := range allMachineHealthConditions { - if err := preflightCheckCondition("Machine", machine, condition); err != nil { - machineErrors = append(machineErrors, err) - } - } - } - } - - if len(machineErrors) > 0 { - aggregatedError := kerrors.NewAggregate(machineErrors) - log.Info("Waiting for control plane to pass preflight checks", "failures", aggregatedError.Error()) - - return false, nil - } - - return true, nil -} - -func preflightCheckCondition(kind string, obj conditions.Getter, condition clusterv1.ConditionType) error { - c := conditions.Get(obj, condition) - if c == nil { - return errors.Errorf("%s %s does not have %s condition", kind, obj.GetName(), condition) - } - if c.Status == corev1.ConditionFalse { - return errors.Errorf("%s %s reports %s condition is false (%s, %s)", kind, obj.GetName(), condition, c.Severity, c.Message) - } - if c.Status == corev1.ConditionUnknown { - return errors.Errorf("%s %s reports %s condition is unknown (%s)", kind, obj.GetName(), condition, c.Message) - } - return nil -} - -// reconcileWorkerResources ensures that the resources(disk capacity) of the -// worker virtual machines are the same as expected by ElfMachine. -func (r *ElfMachineTemplateReconciler) reconcileWorkerResources(ctx goctx.Context, emtCtx *context.MachineTemplateContext) (bool, error) { - mds, err := machineutil.GetMDsForCluster(ctx, r.Client, emtCtx.Cluster.Namespace, emtCtx.Cluster.Name) - if err != nil { - return false, err - } - - allElfMachinesUpToDate := true - for i := range len(mds) { - if emtCtx.ElfMachineTemplate.Name != mds[i].Spec.Template.Spec.InfrastructureRef.Name { - continue - } - - if ok, err := r.reconcileWorkerResourcesForMD(ctx, emtCtx, mds[i]); err != nil { - return false, err - } else if !ok { - allElfMachinesUpToDate = false - } - } - - return allElfMachinesUpToDate, nil -} - -// reconcileWorkerResourcesForMD ensures that the resources(disk capacity) of the -// worker virtual machines managed by the md are the same as expected by ElfMachine. -func (r *ElfMachineTemplateReconciler) reconcileWorkerResourcesForMD(ctx goctx.Context, emtCtx *context.MachineTemplateContext, md *clusterv1.MachineDeployment) (bool, error) { - log := ctrl.LoggerFrom(ctx) - - elfMachines, err := machineutil.GetElfMachinesForMD(ctx, r.Client, emtCtx.Cluster, md) - if err != nil { - return false, err - } - - updatingResourcesElfMachines, needUpdatedResourcesElfMachines, err := r.selectResourcesNotUpToDateElfMachines(ctx, emtCtx.ElfMachineTemplate, elfMachines) - if err != nil { - return false, err - } else if len(updatingResourcesElfMachines) == 0 && len(needUpdatedResourcesElfMachines) == 0 { - log.V(4).Info(fmt.Sprintf("ElfMachines resources of md %s are up to date", klog.KObj(md))) - - return true, nil - } - - maxSurge := getMaxSurge(md) - checksPassed := r.preflightChecksForWorker(ctx, md, updatingResourcesElfMachines) - - toBeUpdatedElfMachines, needUpdatedResourcesElfMachines := selectToBeUpdatedAndNeedUpdatedElfMachines(checksPassed, maxSurge, updatingResourcesElfMachines, needUpdatedResourcesElfMachines) - - if err := r.markElfMachinesResourcesNotUpToDate(ctx, emtCtx.ElfMachineTemplate, needUpdatedResourcesElfMachines); err != nil { - return false, err - } - - if err := r.markElfMachinesToBeUpdatedResources(ctx, emtCtx.ElfMachineTemplate, toBeUpdatedElfMachines); err != nil { - return false, err - } - - log.V(1).Info("Waiting for worker ElfMachines to be updated resources", "md", md.Name, "updatingCount", len(updatingResourcesElfMachines)+len(toBeUpdatedElfMachines), "needUpdatedCount", len(needUpdatedResourcesElfMachines), "maxSurge", maxSurge) - - return false, nil -} - -func getMaxSurge(md *clusterv1.MachineDeployment) int { - maxSurge := mdutil.MaxSurge(*md) - if maxSurge <= 0 { - return 1 - } - - return int(maxSurge) -} - -func selectToBeUpdatedAndNeedUpdatedElfMachines( - checksPassed bool, maxSurge int, - updatingResourcesElfMachines, needUpdatedResourcesElfMachines []*infrav1.ElfMachine, -) ([]*infrav1.ElfMachine, []*infrav1.ElfMachine) { - var toBeUpdatedElfMachines, needUpdatedElfMachines []*infrav1.ElfMachine - if checksPassed { - toBeUpdatedCount := maxSurge - len(updatingResourcesElfMachines) - if toBeUpdatedCount > 0 { - if toBeUpdatedCount >= len(needUpdatedResourcesElfMachines) { - toBeUpdatedElfMachines = needUpdatedResourcesElfMachines - needUpdatedElfMachines = nil - } else { - toBeUpdatedElfMachines = needUpdatedResourcesElfMachines[:toBeUpdatedCount] - needUpdatedElfMachines = needUpdatedResourcesElfMachines[toBeUpdatedCount:] - } - } else { - needUpdatedElfMachines = needUpdatedResourcesElfMachines - } - } else { - needUpdatedElfMachines = needUpdatedResourcesElfMachines - } - - return toBeUpdatedElfMachines, needUpdatedElfMachines -} - -// preflightChecksForWorker checks if the worker is stable before proceeding with a updating resources operation, -// where stable means that: -// - MD not in rolling update. -// - The number of machines updating resources is not greater than maxSurge. -// - The number of unavailable machines is no greater than maxUnavailable. -// If the worker is not passing preflight checks, it requeue. -func (r *ElfMachineTemplateReconciler) preflightChecksForWorker(ctx goctx.Context, md *clusterv1.MachineDeployment, updatingResourcesElfMachines []*infrav1.ElfMachine) bool { - log := ctrl.LoggerFrom(ctx) - - if mdutil.IsMDInRollingUpdate(md) { - log.Info("MD rolling update in progress, skip updating resources", "md", md.Name) - - return false - } - - // Use maxSurge of rolling update to control the maximum number of concurrent - // update resources to avoid updating too many machines at the same time. - // If an exception occurs during the resource update process, all machines will - // not be affected. - if maxSurge := getMaxSurge(md); len(updatingResourcesElfMachines) >= maxSurge { - log.V(1).Info("Hot updated worker ElfMachine has reached the max number of concurrencies, so waiting for worker ElfMachines to be updated resources", "md", md.Name, "maxSurge", maxSurge, "updatingCount", len(updatingResourcesElfMachines)) - - return false - } - - maxUnavailable := mdutil.MaxUnavailable(*md) - if md.Status.UnavailableReplicas > maxUnavailable { - log.Info(fmt.Sprintf("MD unavailable replicas %d is greater than expected %d, skip updating resources", md.Status.UnavailableReplicas, maxUnavailable), "md", md.Name) - - return false - } - - return true -} - -// selectResourcesNotUpToDateElfMachines returns elfMachines whose resources are -// not as expected. -func (r *ElfMachineTemplateReconciler) selectResourcesNotUpToDateElfMachines(ctx goctx.Context, elfMachineTemplate *infrav1.ElfMachineTemplate, elfMachines []*infrav1.ElfMachine) ([]*infrav1.ElfMachine, []*infrav1.ElfMachine, error) { - var updatingResourcesElfMachines []*infrav1.ElfMachine - var needUpdatedResourcesElfMachines []*infrav1.ElfMachine - for i := range len(elfMachines) { - elfMachine := elfMachines[i] - - machine, err := capiutil.GetOwnerMachine(ctx, r.Client, elfMachine.ObjectMeta) - if err != nil { - return nil, nil, err - } - - // No need to update the resources of deleted and failed machines. - if machine == nil || - !machine.DeletionTimestamp.IsZero() || - clusterv1.MachinePhase(machine.Status.Phase) == clusterv1.MachinePhaseFailed { - continue - } - - if machineutil.IsUpdatingElfMachineResources(elfMachine) && - machineutil.IsResourcesUpToDate(elfMachineTemplate, elfMachine) { - updatingResourcesElfMachines = append(updatingResourcesElfMachines, elfMachine) - } else if machineutil.NeedUpdateElfMachineResources(elfMachineTemplate, elfMachine) { - needUpdatedResourcesElfMachines = append(needUpdatedResourcesElfMachines, elfMachine) - } - } - - return updatingResourcesElfMachines, needUpdatedResourcesElfMachines, nil -} - -// markElfMachinesToBeUpdatedResources synchronizes the expected resource values -// from the ElfMachineTemplate and marks the machines to be updated resources. -func (r *ElfMachineTemplateReconciler) markElfMachinesToBeUpdatedResources(ctx goctx.Context, elfMachineTemplate *infrav1.ElfMachineTemplate, elfMachines []*infrav1.ElfMachine) error { - log := ctrl.LoggerFrom(ctx) - - for i := range len(elfMachines) { - elfMachine := elfMachines[i] - - patchHelper, err := patch.NewHelper(elfMachine, r.Client) - if err != nil { - return err - } - - // Ensure resources are up to date. - orignalDiskGiB := elfMachine.Spec.DiskGiB - elfMachine.Spec.DiskGiB = elfMachineTemplate.Spec.Template.Spec.DiskGiB - conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "") - - log.Info(fmt.Sprintf("Resources of ElfMachine is not up to date, marking for updating resources(disk: %d -> %d)", orignalDiskGiB, elfMachine.Spec.DiskGiB), "elfMachine", elfMachine.Name) - - if err := patchHelper.Patch(ctx, elfMachine); err != nil { - return errors.Wrapf(err, "failed to patch ElfMachine %s to mark for updating resources", elfMachine.Name) - } - } - - return nil -} - -// markElfMachinesResourcesNotUpToDate synchronizes the expected resource values -// from the ElfMachineTemplate and marks the machines waiting for updated resources. -func (r *ElfMachineTemplateReconciler) markElfMachinesResourcesNotUpToDate(ctx goctx.Context, elfMachineTemplate *infrav1.ElfMachineTemplate, elfMachines []*infrav1.ElfMachine) error { - log := ctrl.LoggerFrom(ctx) - - for i := range len(elfMachines) { - elfMachine := elfMachines[i] - if machineutil.IsResourcesUpToDate(elfMachineTemplate, elfMachine) { - continue - } - - patchHelper, err := patch.NewHelper(elfMachine, r.Client) - if err != nil { - return err - } - - // Ensure resources are up to date. - orignalDiskGiB := elfMachine.Spec.DiskGiB - elfMachine.Spec.DiskGiB = elfMachineTemplate.Spec.Template.Spec.DiskGiB - conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, anotherMachineHotUpdateInProgressMessage) - - log.Info(fmt.Sprintf("Resources of ElfMachine is not up to date, marking for resources not up to date and waiting for hot updating resources(disk: %d -> %d)", orignalDiskGiB, elfMachine.Spec.DiskGiB), "elfMachine", elfMachine.Name) - - if err := patchHelper.Patch(ctx, elfMachine); err != nil { - return errors.Wrapf(err, "failed to patch ElfMachine %s to mark for resources not up to date", elfMachine.Name) - } - } - - return nil -} diff --git a/controllers/elfmachinetemplate_controller_test.go b/controllers/elfmachinetemplate_controller_test.go deleted file mode 100644 index 76e842a6..00000000 --- a/controllers/elfmachinetemplate_controller_test.go +++ /dev/null @@ -1,507 +0,0 @@ -/* -Copyright 2024. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package controllers - -import ( - "bytes" - "fmt" - "time" - - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/klog/v2" - "k8s.io/utils/ptr" - clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" - controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" - capiutil "sigs.k8s.io/cluster-api/util" - "sigs.k8s.io/cluster-api/util/conditions" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - ctrlutil "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - - infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" - "github.com/smartxworks/cluster-api-provider-elf/test/fake" -) - -var _ = Describe("ElfMachineTemplateReconciler", func() { - var ( - elfCluster *infrav1.ElfCluster - cluster *clusterv1.Cluster - elfMachine *infrav1.ElfMachine - machine *clusterv1.Machine - secret *corev1.Secret - logBuffer *bytes.Buffer - ) - - BeforeEach(func() { - logBuffer = new(bytes.Buffer) - klog.SetOutput(logBuffer) - - elfCluster, cluster, elfMachine, machine, secret = fake.NewClusterAndMachineObjects() - }) - - AfterEach(func() { - }) - - Context("Reconcile a ElfMachineTemplate", func() { - It("Reconcile", func() { - emt := fake.NewElfMachineTemplate() - emt.OwnerReferences = append(emt.OwnerReferences, metav1.OwnerReference{Kind: fake.ClusterKind, APIVersion: clusterv1.GroupVersion.String(), Name: cluster.Name, UID: "blah"}) - kcp := fake.NewKCP() - kcp.Spec.MachineTemplate = controlplanev1.KubeadmControlPlaneMachineTemplate{ - InfrastructureRef: corev1.ObjectReference{Namespace: emt.Namespace, Name: emt.Name}, - } - md := fake.NewMD() - md.Labels = map[string]string{clusterv1.ClusterNameLabel: cluster.Name} - md.Spec.Template = clusterv1.MachineTemplateSpec{ - Spec: clusterv1.MachineSpec{ - InfrastructureRef: corev1.ObjectReference{Namespace: emt.Namespace, Name: emt.Name}, - }, - } - cluster.Spec.ControlPlaneRef = &corev1.ObjectReference{Namespace: kcp.Namespace, Name: kcp.Name} - ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, emt, kcp, md) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - emtKey := capiutil.ObjectKey(emt) - reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - result, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: emtKey}) - Expect(result).To(BeZero()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring(fmt.Sprintf("ElfMachines resources of kcp %s are up to date", klog.KObj(kcp)))) - Expect(logBuffer.String()).To(ContainSubstring(fmt.Sprintf("ElfMachines resources of md %s are up to date", klog.KObj(md)))) - - emt.Spec.Template.Spec.DiskGiB = 0 - ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, emt, kcp, md) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - emtKey = capiutil.ObjectKey(emt) - reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: emtKey}) - Expect(result).To(BeZero()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring(fmt.Sprintf("ElfMachines resources of kcp %s are up to date", klog.KObj(kcp)))) - Expect(logBuffer.String()).To(ContainSubstring(fmt.Sprintf("ElfMachines resources of md %s are up to date", klog.KObj(md)))) - }) - - It("should not error and not requeue the request without elfmachinetemplate", func() { - emt := fake.NewElfMachineTemplate() - ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) - reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - result, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: capiutil.ObjectKey(emt)}) - Expect(result).To(BeZero()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("ElfMachineTemplate not found, won't reconcile")) - - emt.OwnerReferences = append(emt.OwnerReferences, metav1.OwnerReference{Kind: fake.ClusterKind, APIVersion: clusterv1.GroupVersion.String(), Name: cluster.Name, UID: "blah"}) - ctrlMgrCtx = fake.NewControllerManagerContext(cluster, elfMachine, machine, secret, emt) - reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: capiutil.ObjectKey(emt)}) - Expect(result).To(BeZero()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("ElfMachineTemplate Waiting for ElfCluster")) - }) - - It("should not error and not requeue the request when Cluster is paused", func() { - emt := fake.NewElfMachineTemplate() - emt.OwnerReferences = append(emt.OwnerReferences, metav1.OwnerReference{Kind: fake.ClusterKind, APIVersion: clusterv1.GroupVersion.String(), Name: cluster.Name, UID: "blah"}) - cluster.Spec.Paused = true - ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, emt) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - emtKey := capiutil.ObjectKey(emt) - reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - result, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: emtKey}) - Expect(result).To(BeZero()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("ElfMachineTemplate linked to a cluster that is paused")) - }) - }) - - Context("reconcileWorkerResources", func() { - It("reconcileWorkerResources", func() { - emt := fake.NewElfMachineTemplate() - md := fake.NewMD() - md.Labels = map[string]string{clusterv1.ClusterNameLabel: cluster.Name} - md.Spec.Replicas = ptr.To[int32](3) - md.Spec.Template = clusterv1.MachineTemplateSpec{ - Spec: clusterv1.MachineSpec{ - InfrastructureRef: corev1.ObjectReference{Namespace: emt.Namespace, Name: emt.Name}, - }, - } - md.Spec.Strategy = &clusterv1.MachineDeploymentStrategy{ - RollingUpdate: &clusterv1.MachineRollingUpdateDeployment{ - MaxSurge: intOrStrPtr(1), - MaxUnavailable: intOrStrPtr(1), - }, - } - fake.ToWorkerMachine(elfMachine, md) - fake.ToWorkerMachine(machine, md) - fake.SetElfMachineTemplateForElfMachine(elfMachine, emt) - ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, md) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - mtCtx := newMachineTemplateContext(elfCluster, cluster, emt) - reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - ok, err := reconciler.reconcileWorkerResources(ctx, mtCtx) - Expect(ok).To(BeTrue()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring(fmt.Sprintf("ElfMachines resources of md %s are up to date", klog.KObj(md)))) - - logBuffer.Reset() - elfMachine.Spec.DiskGiB -= 1 - ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, md) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - mtCtx = newMachineTemplateContext(elfCluster, cluster, emt) - reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - ok, err = reconciler.reconcileWorkerResources(ctx, mtCtx) - Expect(ok).To(BeFalse()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).NotTo(ContainSubstring("Resources of ElfMachine is not up to date, marking for resources not up to date and waiting for hot updating resources")) - Expect(logBuffer.String()).To(ContainSubstring("Resources of ElfMachine is not up to date, marking for updating resources")) - Expect(logBuffer.String()).To(ContainSubstring("Waiting for worker ElfMachines to be updated resources")) - - // logBuffer.Reset() - // elfMachine.Spec.DiskGiB -= 1 - // updatingElfMachine, updatingMachine := fake.NewMachineObjects(elfCluster, cluster) - // fake.ToWorkerMachine(updatingElfMachine, md) - // fake.ToWorkerMachine(updatingMachine, md) - // fake.SetElfMachineTemplateForElfMachine(updatingElfMachine, emt) - // ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, md, updatingElfMachine, updatingMachine) - // fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - // fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, updatingElfMachine, updatingMachine) - // mtCtx = newMachineTemplateContext(elfCluster, cluster, emt) - // reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - // ok, err = reconciler.reconcileWorkerResources(ctx, mtCtx) - // Expect(logBuffer.String()).To(ContainSubstring("Resources of ElfMachine is not up to date, marking for updating resources")) - }) - - It("selectToBeUpdatedAndNeedUpdatedElfMachines", func() { - elfMachine1, _ := fake.NewMachineObjects(elfCluster, cluster) - elfMachine2, _ := fake.NewMachineObjects(elfCluster, cluster) - - toBeUpdated, needUpdated := selectToBeUpdatedAndNeedUpdatedElfMachines(false, 1, []*infrav1.ElfMachine{}, []*infrav1.ElfMachine{elfMachine1, elfMachine2}) - Expect(toBeUpdated).To(BeEmpty()) - Expect(needUpdated).To(Equal([]*infrav1.ElfMachine{elfMachine1, elfMachine2})) - - toBeUpdated, needUpdated = selectToBeUpdatedAndNeedUpdatedElfMachines(true, 1, []*infrav1.ElfMachine{elfMachine1}, []*infrav1.ElfMachine{elfMachine2}) - Expect(toBeUpdated).To(BeEmpty()) - Expect(needUpdated).To(Equal([]*infrav1.ElfMachine{elfMachine2})) - - toBeUpdated, needUpdated = selectToBeUpdatedAndNeedUpdatedElfMachines(true, 2, []*infrav1.ElfMachine{elfMachine1}, []*infrav1.ElfMachine{elfMachine2}) - Expect(toBeUpdated).To(Equal([]*infrav1.ElfMachine{elfMachine2})) - Expect(needUpdated).To(BeEmpty()) - - toBeUpdated, needUpdated = selectToBeUpdatedAndNeedUpdatedElfMachines(true, 1, []*infrav1.ElfMachine{}, []*infrav1.ElfMachine{elfMachine1, elfMachine2}) - Expect(toBeUpdated).To(Equal([]*infrav1.ElfMachine{elfMachine1})) - Expect(needUpdated).To(Equal([]*infrav1.ElfMachine{elfMachine2})) - }) - }) - - Context("reconcileCPResources", func() { - It("reconcileCPResources", func() { - emt := fake.NewElfMachineTemplate() - kcp := fake.NewKCP() - kcp.Spec.MachineTemplate = controlplanev1.KubeadmControlPlaneMachineTemplate{ - InfrastructureRef: corev1.ObjectReference{Namespace: emt.Namespace, Name: "notfoud"}, - } - cluster.Spec.ControlPlaneRef = &corev1.ObjectReference{Namespace: kcp.Namespace, Name: kcp.Name} - elfMachine.Spec.DiskGiB -= 1 - ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kcp) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - mtCtx := newMachineTemplateContext(elfCluster, cluster, emt) - reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - ok, err := reconciler.reconcileCPResources(ctx, mtCtx) - Expect(ok).To(BeTrue()) - Expect(err).NotTo(HaveOccurred()) - - kcp.Spec.MachineTemplate = controlplanev1.KubeadmControlPlaneMachineTemplate{ - InfrastructureRef: corev1.ObjectReference{Namespace: emt.Namespace, Name: emt.Name}, - } - ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kcp) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - mtCtx = newMachineTemplateContext(elfCluster, cluster, emt) - reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - ok, err = reconciler.reconcileCPResources(ctx, mtCtx) - Expect(ok).To(BeTrue()) - Expect(err).NotTo(HaveOccurred()) - - logBuffer.Reset() - updatingElfMachine, updatingMachine := fake.NewMachineObjects(elfCluster, cluster) - fake.ToCPMachine(updatingElfMachine, kcp) - fake.ToCPMachine(updatingMachine, kcp) - fake.SetElfMachineTemplateForElfMachine(updatingElfMachine, emt) - conditions.MarkFalse(updatingElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "") - ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kcp, - updatingElfMachine, updatingMachine, - ) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, updatingElfMachine, updatingMachine) - mtCtx = newMachineTemplateContext(elfCluster, cluster, emt) - reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - ok, err = reconciler.reconcileCPResources(ctx, mtCtx) - Expect(ok).To(BeFalse()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("Waiting for control plane ElfMachines to be updated resources")) - - logBuffer.Reset() - kcp.Spec.Replicas = ptr.To[int32](3) - kcp.Status.Replicas = 3 - kcp.Status.UpdatedReplicas = 2 - fake.ToCPMachine(elfMachine, kcp) - fake.ToCPMachine(machine, kcp) - elfMachine.Spec.DiskGiB -= 1 - machine.Status.NodeRef = &corev1.ObjectReference{} - conditions.MarkTrue(machine, controlplanev1.MachineAPIServerPodHealthyCondition) - conditions.MarkTrue(machine, controlplanev1.MachineControllerManagerPodHealthyCondition) - conditions.MarkTrue(machine, controlplanev1.MachineSchedulerPodHealthyCondition) - conditions.MarkTrue(machine, controlplanev1.MachineEtcdPodHealthyCondition) - conditions.MarkTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition) - ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kcp) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - mtCtx = newMachineTemplateContext(elfCluster, cluster, emt) - reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - ok, err = reconciler.reconcileCPResources(ctx, mtCtx) - Expect(ok).To(BeFalse()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("KCP rolling update in progress, skip updating resources")) - Expect(logBuffer.String()).NotTo(ContainSubstring("Resources of ElfMachine is not up to date, marking for updating resources")) - - logBuffer.Reset() - kcp.Status.UpdatedReplicas = 3 - ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kcp) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - mtCtx = newMachineTemplateContext(elfCluster, cluster, emt) - reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - ok, err = reconciler.reconcileCPResources(ctx, mtCtx) - Expect(ok).To(BeFalse()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("Resources of ElfMachine is not up to date, marking for updating resources")) - Expect(logBuffer.String()).To(ContainSubstring("Waiting for control plane ElfMachines to be updated resources")) - }) - }) - - Context("preflightChecksForCP", func() { - It("should return false if KCP rolling update in progress", func() { - emt := fake.NewElfMachineTemplate() - kcp := fake.NewKCP() - kcp.Spec.Replicas = ptr.To[int32](3) - kcp.Status.Replicas = 3 - kcp.Status.UpdatedReplicas = 2 - ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - mtCtx := newMachineTemplateContext(elfCluster, cluster, emt) - reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - ok, err := reconciler.preflightChecksForCP(ctx, mtCtx, kcp) - Expect(ok).To(BeFalse()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("KCP rolling update in progress, skip updating resources")) - }) - - It("should return false if has deleting or failed machine", func() { - emt := fake.NewElfMachineTemplate() - kcp := fake.NewKCP() - kcp.Spec.Replicas = ptr.To[int32](3) - kcp.Status.Replicas = 3 - kcp.Status.UpdatedReplicas = 3 - fake.ToCPMachine(elfMachine, kcp) - fake.ToCPMachine(machine, kcp) - ctrlutil.AddFinalizer(machine, infrav1.MachineFinalizer) - machine.DeletionTimestamp = &metav1.Time{Time: time.Now().UTC()} - ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - mtCtx := newMachineTemplateContext(elfCluster, cluster, emt) - reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - ok, err := reconciler.preflightChecksForCP(ctx, mtCtx, kcp) - Expect(ok).To(BeFalse()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("Waiting for machines to be deleted")) - - logBuffer.Reset() - machine.DeletionTimestamp = nil - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) - reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - ok, err = reconciler.preflightChecksForCP(ctx, mtCtx, kcp) - Expect(ok).To(BeFalse()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("Waiting for control plane to pass preflight checks")) - - logBuffer.Reset() - machine.Status.NodeRef = &corev1.ObjectReference{} - conditions.MarkFalse(machine, controlplanev1.MachineEtcdPodHealthyCondition, controlplanev1.PodInspectionFailedReason, clusterv1.ConditionSeverityInfo, "error") - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) - reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - ok, err = reconciler.preflightChecksForCP(ctx, mtCtx, kcp) - Expect(ok).To(BeFalse()) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("Waiting for control plane to pass preflight checks")) - }) - - It("should return true", func() { - emt := fake.NewElfMachineTemplate() - kcp := fake.NewKCP() - kcp.Spec.Replicas = ptr.To[int32](3) - kcp.Status.Replicas = 3 - kcp.Status.UpdatedReplicas = 3 - fake.ToCPMachine(elfMachine, kcp) - fake.ToCPMachine(machine, kcp) - machine.Status.NodeRef = &corev1.ObjectReference{} - conditions.MarkTrue(machine, controlplanev1.MachineAPIServerPodHealthyCondition) - conditions.MarkTrue(machine, controlplanev1.MachineControllerManagerPodHealthyCondition) - conditions.MarkTrue(machine, controlplanev1.MachineSchedulerPodHealthyCondition) - conditions.MarkTrue(machine, controlplanev1.MachineEtcdPodHealthyCondition) - conditions.MarkTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition) - ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - mtCtx := newMachineTemplateContext(elfCluster, cluster, emt) - reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - ok, err := reconciler.preflightChecksForCP(ctx, mtCtx, kcp) - Expect(ok).To(BeTrue()) - Expect(err).NotTo(HaveOccurred()) - }) - }) - - Context("preflightChecksForWorker", func() { - It("should return false if MD rolling update in progress", func() { - md := fake.NewMD() - fake.ToWorkerMachine(elfMachine, md) - fake.ToWorkerMachine(machine, md) - md.Spec.Replicas = ptr.To[int32](3) - md.Status.Replicas = 3 - md.Status.UpdatedReplicas = 2 - ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - ok := reconciler.preflightChecksForWorker(ctx, md, nil) - Expect(ok).To(BeFalse()) - Expect(logBuffer.String()).To(ContainSubstring("MD rolling update in progress, skip updating resources")) - }) - - It("should check maxSurge", func() { - md := fake.NewMD() - fake.ToWorkerMachine(elfMachine, md) - fake.ToWorkerMachine(machine, md) - md.Spec.Strategy = &clusterv1.MachineDeploymentStrategy{ - RollingUpdate: &clusterv1.MachineRollingUpdateDeployment{MaxSurge: intOrStrPtr(1)}, - } - md.Spec.Replicas = ptr.To[int32](3) - md.Status.Replicas = 3 - md.Status.UpdatedReplicas = 3 - ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - ok := reconciler.preflightChecksForWorker(ctx, md, []*infrav1.ElfMachine{elfMachine}) - Expect(ok).To(BeFalse()) - Expect(logBuffer.String()).To(ContainSubstring("Hot updated worker ElfMachine has reached the max number of concurrencies, so waiting for worker ElfMachines to be updated resources")) - - logBuffer.Reset() - md.Status.UnavailableReplicas = 3 - ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - ok = reconciler.preflightChecksForWorker(ctx, md, []*infrav1.ElfMachine{}) - Expect(ok).To(BeFalse()) - Expect(logBuffer.String()).To(ContainSubstring("MD unavailable replicas")) - - md.Status.UnavailableReplicas = 0 - ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - ok = reconciler.preflightChecksForWorker(ctx, md, []*infrav1.ElfMachine{}) - Expect(ok).To(BeTrue()) - }) - }) - - Context("selectResourcesNotUpToDateElfMachines", func() { - It("should return updating/needUpdated resources elfMachines", func() { - emt := fake.NewElfMachineTemplate() - upToDateElfMachine, upToDateMachine := fake.NewMachineObjects(elfCluster, cluster) - fake.SetElfMachineTemplateForElfMachine(upToDateElfMachine, emt) - noUpToDateElfMachine, noUpToDateMachine := fake.NewMachineObjects(elfCluster, cluster) - fake.SetElfMachineTemplateForElfMachine(noUpToDateElfMachine, emt) - noUpToDateElfMachine.Spec.DiskGiB -= 1 - updatingElfMachine, updatingMachine := fake.NewMachineObjects(elfCluster, cluster) - fake.SetElfMachineTemplateForElfMachine(updatingElfMachine, emt) - conditions.MarkFalse(updatingElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "") - failedElfMachine, failedMachine := fake.NewMachineObjects(elfCluster, cluster) - fake.SetElfMachineTemplateForElfMachine(failedElfMachine, emt) - failedElfMachine.Spec.DiskGiB -= 1 - failedMachine.Status.Phase = string(clusterv1.MachinePhaseFailed) - ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, - upToDateElfMachine, upToDateMachine, - noUpToDateElfMachine, noUpToDateMachine, - updatingElfMachine, updatingMachine, - failedElfMachine, failedMachine, - ) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, upToDateElfMachine, upToDateMachine) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, noUpToDateElfMachine, noUpToDateMachine) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, updatingElfMachine, updatingMachine) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, failedElfMachine, failedMachine) - reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - elfMachines := []*infrav1.ElfMachine{upToDateElfMachine, noUpToDateElfMachine, updatingElfMachine, failedElfMachine} - updatingResourcesElfMachines, needUpdatedResourcesElfMachines, err := reconciler.selectResourcesNotUpToDateElfMachines(ctx, emt, elfMachines) - Expect(err).NotTo(HaveOccurred()) - Expect(updatingResourcesElfMachines).To(Equal([]*infrav1.ElfMachine{updatingElfMachine})) - Expect(needUpdatedResourcesElfMachines).To(Equal([]*infrav1.ElfMachine{noUpToDateElfMachine})) - }) - }) - - Context("markElfMachinesToBeUpdatedResources", func() { - It("should mark resources to be updated", func() { - emt := fake.NewElfMachineTemplate() - fake.SetElfMachineTemplateForElfMachine(elfMachine, emt) - elfMachine.Spec.DiskGiB -= 1 - ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - err := reconciler.markElfMachinesToBeUpdatedResources(ctx, emt, []*infrav1.ElfMachine{elfMachine}) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("Resources of ElfMachine is not up to date, marking for updating resources")) - elfMachineKey := client.ObjectKey{Namespace: elfMachine.Namespace, Name: elfMachine.Name} - Eventually(func() bool { - _ = reconciler.Client.Get(ctx, elfMachineKey, elfMachine) - return elfMachine.Spec.DiskGiB == emt.Spec.Template.Spec.DiskGiB - }, timeout).Should(BeTrue()) - expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.WaitingForResourcesHotUpdateReason}}) - }) - }) - - Context("markElfMachinesResourcesNotUpToDate", func() { - It("should mark resources not up to date", func() { - ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) - emt := fake.NewElfMachineTemplate() - fake.SetElfMachineTemplateForElfMachine(elfMachine, emt) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - err := reconciler.markElfMachinesResourcesNotUpToDate(ctx, emt, []*infrav1.ElfMachine{elfMachine}) - Expect(err).NotTo(HaveOccurred()) - expectConditions(elfMachine, []conditionAssertion{}) - - logBuffer.Reset() - elfMachine.Spec.DiskGiB -= 1 - ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) - fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) - reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} - err = reconciler.markElfMachinesResourcesNotUpToDate(ctx, emt, []*infrav1.ElfMachine{elfMachine}) - Expect(err).NotTo(HaveOccurred()) - Expect(logBuffer.String()).To(ContainSubstring("Resources of ElfMachine is not up to date, marking for resources not up to date and waiting for hot updating resources")) - elfMachineKey := client.ObjectKey{Namespace: elfMachine.Namespace, Name: elfMachine.Name} - Eventually(func() bool { - _ = reconciler.Client.Get(ctx, elfMachineKey, elfMachine) - return elfMachine.Spec.DiskGiB == emt.Spec.Template.Spec.DiskGiB - }, timeout).Should(BeTrue()) - expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.WaitingForResourcesHotUpdateReason}}) - Expect(conditions.GetMessage(elfMachine, infrav1.ResourcesHotUpdatedCondition)).To(Equal(anotherMachineHotUpdateInProgressMessage)) - }) - }) -}) diff --git a/controllers/suite_test.go b/controllers/suite_test.go index 6f873642..67f0f38c 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -28,7 +28,6 @@ import ( "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/intstr" utilruntime "k8s.io/apimachinery/pkg/util/runtime" cgscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/klog/v2" @@ -141,16 +140,6 @@ func newMachineContext( } } -func newMachineTemplateContext( - elfCluster *infrav1.ElfCluster, cluster *clusterv1.Cluster, - emt *infrav1.ElfMachineTemplate) *context.MachineTemplateContext { - return &context.MachineTemplateContext{ - Cluster: cluster, - ElfCluster: elfCluster, - ElfMachineTemplate: emt, - } -} - type conditionAssertion struct { conditionType clusterv1.ConditionType status corev1.ConditionStatus @@ -169,8 +158,3 @@ func expectConditions(getter conditions.Getter, expected []conditionAssertion) { Expect(actual.Reason).To(Equal(c.reason)) } } - -func intOrStrPtr(i int32) *intstr.IntOrString { - res := intstr.FromInt(int(i)) - return &res -} diff --git a/main.go b/main.go index ff9add35..c6037c0f 100644 --- a/main.go +++ b/main.go @@ -235,10 +235,6 @@ func main() { return err } - if err := controllers.AddMachineTemplateControllerToManager(ctx, ctrlMgrCtx, mgr, controller.Options{MaxConcurrentReconciles: elfMachineTemplateConcurrency}); err != nil { - return err - } - return nil } diff --git a/pkg/hostagent/service.go b/pkg/hostagent/service.go index 2518f230..3f16ed9b 100644 --- a/pkg/hostagent/service.go +++ b/pkg/hostagent/service.go @@ -27,7 +27,16 @@ import ( "github.com/smartxworks/cluster-api-provider-elf/pkg/hostagent/tasks" ) -const defaultTimeout = 1 * time.Minute +type HostAgentJobType string + +const ( + defaultTimeout = 1 * time.Minute + + // HostAgentJobTypeExpandRootPartition is the job type for expanding the root partition. + HostAgentJobTypeExpandRootPartition HostAgentJobType = "expand-root-partition" + // HostAgentJobTypeRestartKubelet is the job type for restarting the kubelet. + HostAgentJobTypeRestartKubelet HostAgentJobType = "restart-kubelet" +) func GetHostJob(ctx goctx.Context, c client.Client, namespace, name string) (*agentv1.HostOperationJob, error) { var restartKubeletJob agentv1.HostOperationJob @@ -47,8 +56,23 @@ func GetExpandRootPartitionJobName(elfMachine *infrav1.ElfMachine) string { return fmt.Sprintf("cape-expand-root-partition-%s-%d", elfMachine.Name, elfMachine.Spec.DiskGiB) } -func ExpandRootPartition(ctx goctx.Context, c client.Client, elfMachine *infrav1.ElfMachine) (*agentv1.HostOperationJob, error) { - agentJob := &agentv1.HostOperationJob{ +func GetRestartKubeletJobName(elfMachine *infrav1.ElfMachine) string { + return fmt.Sprintf("cape-restart-kubelet-%s-%d-%d-%d", elfMachine.Name, elfMachine.Spec.NumCPUs, elfMachine.Spec.NumCoresPerSocket, elfMachine.Spec.MemoryMiB) +} + +func GetJobName(elfMachine *infrav1.ElfMachine, jobType HostAgentJobType) string { + switch jobType { + case HostAgentJobTypeExpandRootPartition: + return GetExpandRootPartitionJobName(elfMachine) + case HostAgentJobTypeRestartKubelet: + return GetRestartKubeletJobName(elfMachine) + default: + return "" + } +} + +func GenerateExpandRootPartitionJob(elfMachine *infrav1.ElfMachine) *agentv1.HostOperationJob { + return &agentv1.HostOperationJob{ ObjectMeta: metav1.ObjectMeta{ Name: GetExpandRootPartitionJobName(elfMachine), Namespace: "default", @@ -65,10 +89,35 @@ func ExpandRootPartition(ctx goctx.Context, c client.Client, elfMachine *infrav1 }, }, } +} - if err := c.Create(ctx, agentJob); err != nil { - return nil, err +func GenerateRestartKubeletJob(elfMachine *infrav1.ElfMachine) *agentv1.HostOperationJob { + return &agentv1.HostOperationJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: GetRestartKubeletJobName(elfMachine), + Namespace: "default", + }, + Spec: agentv1.HostOperationJobSpec{ + NodeName: elfMachine.Name, + Operation: agentv1.Operation{ + Ansible: &agentv1.Ansible{ + LocalPlaybookText: &agentv1.YAMLText{ + Inline: tasks.RestartKubeletTask, + }, + }, + Timeout: metav1.Duration{Duration: defaultTimeout}, + }, + }, } +} - return agentJob, nil +func GenerateJob(elfMachine *infrav1.ElfMachine, jobType HostAgentJobType) *agentv1.HostOperationJob { + switch jobType { + case HostAgentJobTypeExpandRootPartition: + return GenerateExpandRootPartitionJob(elfMachine) + case HostAgentJobTypeRestartKubelet: + return GenerateRestartKubeletJob(elfMachine) + default: + return nil + } } diff --git a/pkg/hostagent/tasks/restart_kubelet.yaml b/pkg/hostagent/tasks/restart_kubelet.yaml new file mode 100644 index 00000000..d2705247 --- /dev/null +++ b/pkg/hostagent/tasks/restart_kubelet.yaml @@ -0,0 +1,9 @@ +--- +- name: Kubelet | restart kubelet + hosts: all + become: true + gather_facts: false + tasks: + - ansible.builtin.service: + name: kubelet + state: restarted \ No newline at end of file diff --git a/pkg/hostagent/tasks/tasks.go b/pkg/hostagent/tasks/tasks.go index 8a38d3df..7cffa99d 100644 --- a/pkg/hostagent/tasks/tasks.go +++ b/pkg/hostagent/tasks/tasks.go @@ -21,3 +21,8 @@ import ( // //go:embed expand_root_partition.yaml var ExpandRootPartitionTask string + +// RestartKubeletTask is the task to restart kubelet. +// +//go:embed restart_kubelet.yaml +var RestartKubeletTask string diff --git a/pkg/service/util.go b/pkg/service/util.go index 96519991..d47254b2 100644 --- a/pkg/service/util.go +++ b/pkg/service/util.go @@ -34,14 +34,14 @@ import ( func GetUpdatedVMRestrictedFields(vm *models.VM, elfMachine *infrav1.ElfMachine) map[string]string { fieldMap := make(map[string]string) vCPU := TowerVCPU(elfMachine.Spec.NumCPUs) - cpuCores := TowerCPUCores(*vCPU, elfMachine.Spec.NumCoresPerSocket) - cpuSockets := TowerCPUSockets(*vCPU, *cpuCores) + cpuSocketCores := TowerCPUSocketCores(elfMachine.Spec.NumCoresPerSocket, *vCPU) + cpuSockets := TowerCPUSockets(*vCPU, *cpuSocketCores) if *vm.Vcpu > *vCPU { fieldMap["vcpu"] = fmt.Sprintf("actual: %d, expected: %d", *vm.Vcpu, *vCPU) } - if *vm.CPU.Cores > *cpuCores { - fieldMap["cpuCores"] = fmt.Sprintf("actual: %d, expected: %d", *vm.CPU.Cores, *cpuCores) + if *vm.CPU.Cores > *cpuSocketCores { + fieldMap["cpuCores"] = fmt.Sprintf("actual: %d, expected: %d", *vm.CPU.Cores, *cpuSocketCores) } if *vm.CPU.Sockets > *cpuSockets { fieldMap["cpuSockets"] = fmt.Sprintf("actual: %d, expected: %d", *vm.CPU.Sockets, *cpuSockets) @@ -136,16 +136,16 @@ func TowerVCPU(vCPU int32) *int32 { return &vCPU } -func TowerCPUCores(cpuCores, vCPU int32) *int32 { - if cpuCores <= 0 { - cpuCores = vCPU +func TowerCPUSocketCores(cpuSocketCores, vCPU int32) *int32 { + if cpuSocketCores <= 0 { + cpuSocketCores = vCPU } - return &cpuCores + return &cpuSocketCores } -func TowerCPUSockets(vCPU, cpuCores int32) *int32 { - cpuSockets := vCPU / cpuCores +func TowerCPUSockets(vCPU, cpuSocketCores int32) *int32 { + cpuSockets := vCPU / cpuSocketCores return &cpuSockets } @@ -154,6 +154,10 @@ func ByteToGiB(bytes int64) int32 { return int32(bytes / 1024 / 1024 / 1024) } +func ByteToMiB(bytes int64) int64 { + return bytes / 1024 / 1024 +} + func IsVMInRecycleBin(vm *models.VM) bool { return vm.InRecycleBin != nil && *vm.InRecycleBin } @@ -203,7 +207,8 @@ func IsUpdateVMTask(task *models.Task) bool { } func IsUpdateVMDiskTask(task *models.Task, vmName string) bool { - return GetTowerString(task.Description) == fmt.Sprintf("Edit VM %s disk", vmName) + return GetTowerString(task.Description) == fmt.Sprintf("Edit VM %s disk", vmName) || + strings.Contains(GetTowerString(task.Description), "Update virtual volume") } func IsVMColdMigrationTask(task *models.Task) bool { diff --git a/pkg/service/vm.go b/pkg/service/vm.go index 454c4e73..be325983 100644 --- a/pkg/service/vm.go +++ b/pkg/service/vm.go @@ -102,15 +102,17 @@ type TowerVMService struct { func (svr *TowerVMService) UpdateVM(vm *models.VM, elfMachine *infrav1.ElfMachine) (*models.WithTaskVM, error) { vCPU := TowerVCPU(elfMachine.Spec.NumCPUs) - cpuCores := TowerCPUCores(*vCPU, elfMachine.Spec.NumCoresPerSocket) - cpuSockets := TowerCPUSockets(*vCPU, *cpuCores) + cpuSocketCores := TowerCPUSocketCores(elfMachine.Spec.NumCoresPerSocket, *vCPU) + cpuSockets := TowerCPUSockets(*vCPU, *cpuSocketCores) + memory := TowerMemory(elfMachine.Spec.MemoryMiB) updateVMParams := clientvm.NewUpdateVMParams() updateVMParams.RequestBody = &models.VMUpdateParams{ Data: &models.VMUpdateParamsData{ Vcpu: vCPU, - CPUCores: cpuCores, + CPUCores: cpuSocketCores, CPUSockets: cpuSockets, + Memory: memory, }, Where: &models.VMWhereInput{ID: TowerString(*vm.ID)}, } @@ -188,8 +190,8 @@ func (svr *TowerVMService) Clone( } vCPU := TowerVCPU(elfMachine.Spec.NumCPUs) - cpuCores := TowerCPUCores(*vCPU, elfMachine.Spec.NumCoresPerSocket) - cpuSockets := TowerCPUSockets(*vCPU, *cpuCores) + cpuSocketCores := TowerCPUSocketCores(elfMachine.Spec.NumCoresPerSocket, *vCPU) + cpuSockets := TowerCPUSockets(*vCPU, *cpuSocketCores) gpuDevices := make([]*models.VMGpuOperationParams, len(gpuDeviceInfos)) for i := range len(gpuDeviceInfos) { @@ -299,7 +301,7 @@ func (svr *TowerVMService) Clone( Description: TowerString(fmt.Sprintf(config.VMDescription, elfCluster.Spec.Tower.Server)), Owner: owner, Vcpu: vCPU, - CPUCores: cpuCores, + CPUCores: cpuSocketCores, CPUSockets: cpuSockets, Memory: TowerMemory(elfMachine.Spec.MemoryMiB), GpuDevices: gpuDevices, diff --git a/pkg/util/machine/machine.go b/pkg/util/machine/machine.go index 5011390a..b0ca87bf 100644 --- a/pkg/util/machine/machine.go +++ b/pkg/util/machine/machine.go @@ -22,7 +22,6 @@ import ( "strings" "github.com/pkg/errors" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util/conditions" @@ -170,40 +169,6 @@ func IsMachineFailed(machine *clusterv1.Machine) bool { return machine.Status.FailureReason != nil || machine.Status.FailureMessage != nil } -func IsUpdatingElfMachineResources(elfMachine *infrav1.ElfMachine) bool { - condition := conditions.Get(elfMachine, infrav1.ResourcesHotUpdatedCondition) - if condition != nil && - condition.Status == corev1.ConditionFalse { - if condition.Reason == infrav1.WaitingForResourcesHotUpdateReason && condition.Message != "" { - return false - } - - return true - } - - return false -} - -func IsResourcesUpToDate(elfMachineTemplate *infrav1.ElfMachineTemplate, elfMachine *infrav1.ElfMachine) bool { - return elfMachineTemplate.Spec.Template.Spec.DiskGiB <= elfMachine.Spec.DiskGiB -} - -func NeedUpdateElfMachineResources(elfMachineTemplate *infrav1.ElfMachineTemplate, elfMachine *infrav1.ElfMachine) bool { - if !IsResourcesUpToDate(elfMachineTemplate, elfMachine) { - return true - } - - condition := conditions.Get(elfMachine, infrav1.ResourcesHotUpdatedCondition) - if condition != nil && - condition.Status == corev1.ConditionFalse { - if condition.Reason == infrav1.WaitingForResourcesHotUpdateReason && condition.Message != "" { - return true - } - } - - return false -} - func ConvertProviderIDToUUID(providerID *string) string { if providerID == nil || *providerID == "" { return "" diff --git a/pkg/util/machine/machine_test.go b/pkg/util/machine/machine_test.go index 1feaab17..9be3dd93 100644 --- a/pkg/util/machine/machine_test.go +++ b/pkg/util/machine/machine_test.go @@ -21,8 +21,6 @@ import ( "testing" "github.com/onsi/gomega" - clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" - "sigs.k8s.io/cluster-api/util/conditions" infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" "github.com/smartxworks/cluster-api-provider-elf/test/fake" @@ -260,53 +258,6 @@ func TestGetControlPlaneMachinesForCluster(t *testing.T) { g.Expect(machines[0].Name).To(gomega.Equal(machine.Name)) } -func TestIsUpdatingElfMachineResources(t *testing.T) { - g := gomega.NewGomegaWithT(t) - - elfCluster, cluster := fake.NewClusterObjects() - emt := fake.NewElfMachineTemplate() - elfMachine, _ := fake.NewMachineObjects(elfCluster, cluster) - fake.SetElfMachineTemplateForElfMachine(elfMachine, emt) - g.Expect(IsUpdatingElfMachineResources(elfMachine)).To(gomega.BeFalse()) - - conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "") - g.Expect(IsUpdatingElfMachineResources(elfMachine)).To(gomega.BeTrue()) - - conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "xx") - g.Expect(IsUpdatingElfMachineResources(elfMachine)).To(gomega.BeFalse()) -} - -func TestNeedUpdateElfMachineResources(t *testing.T) { - g := gomega.NewGomegaWithT(t) - - elfCluster, cluster := fake.NewClusterObjects() - emt := fake.NewElfMachineTemplate() - elfMachine, _ := fake.NewMachineObjects(elfCluster, cluster) - fake.SetElfMachineTemplateForElfMachine(elfMachine, emt) - g.Expect(NeedUpdateElfMachineResources(emt, elfMachine)).To(gomega.BeFalse()) - - conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "") - g.Expect(NeedUpdateElfMachineResources(emt, elfMachine)).To(gomega.BeFalse()) - - conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "xx") - g.Expect(NeedUpdateElfMachineResources(emt, elfMachine)).To(gomega.BeTrue()) - - elfMachine.Spec.DiskGiB -= 1 - g.Expect(NeedUpdateElfMachineResources(emt, elfMachine)).To(gomega.BeTrue()) -} - -func TestIsResourcesUpToDate(t *testing.T) { - g := gomega.NewGomegaWithT(t) - - elfCluster, cluster := fake.NewClusterObjects() - emt := fake.NewElfMachineTemplate() - elfMachine, _ := fake.NewMachineObjects(elfCluster, cluster) - fake.SetElfMachineTemplateForElfMachine(elfMachine, emt) - g.Expect(IsResourcesUpToDate(emt, elfMachine)).To(gomega.BeTrue()) - elfMachine.Spec.DiskGiB -= 1 - g.Expect(IsResourcesUpToDate(emt, elfMachine)).To(gomega.BeFalse()) -} - func toString(s string) *string { return &s } diff --git a/test/fake/tower.go b/test/fake/tower.go index 9fb50805..8f16ba32 100644 --- a/test/fake/tower.go +++ b/test/fake/tower.go @@ -83,8 +83,8 @@ func NewTowerVMFromElfMachine(elfMachine *infrav1.ElfMachine) *models.VM { vm.Name = service.TowerString(elfMachine.Name) vm.Vcpu = service.TowerVCPU(elfMachine.Spec.NumCPUs) vm.CPU = &models.NestedCPU{ - Cores: service.TowerCPUCores(*vm.Vcpu, elfMachine.Spec.NumCoresPerSocket), - Sockets: service.TowerCPUSockets(*vm.Vcpu, *service.TowerCPUCores(*vm.Vcpu, elfMachine.Spec.NumCoresPerSocket)), + Cores: service.TowerCPUSocketCores(*vm.Vcpu, elfMachine.Spec.NumCoresPerSocket), + Sockets: service.TowerCPUSockets(*vm.Vcpu, *service.TowerCPUSocketCores(*vm.Vcpu, elfMachine.Spec.NumCoresPerSocket)), } vm.Memory = service.TowerMemory(elfMachine.Spec.MemoryMiB) vm.Ha = service.TowerBool(elfMachine.Spec.HA) diff --git a/webhooks/elfmachine_webhook_mutation.go b/webhooks/elfmachine_webhook_mutation.go index 9c53b92d..477ff26e 100644 --- a/webhooks/elfmachine_webhook_mutation.go +++ b/webhooks/elfmachine_webhook_mutation.go @@ -63,6 +63,10 @@ func (m *ElfMachineMutation) Handle(ctx goctx.Context, request admission.Request version.SetCurrentCAPEVersion(&elfMachine) } + if elfMachine.Spec.NumCoresPerSocket <= 0 { + elfMachine.Spec.NumCoresPerSocket = elfMachine.Spec.NumCPUs + } + if marshaledElfMachine, err := json.Marshal(elfMachine); err != nil { return admission.Errored(http.StatusInternalServerError, err) } else { diff --git a/webhooks/elfmachine_webhook_mutation_test.go b/webhooks/elfmachine_webhook_mutation_test.go index 8fd51214..bde2321b 100644 --- a/webhooks/elfmachine_webhook_mutation_test.go +++ b/webhooks/elfmachine_webhook_mutation_test.go @@ -50,6 +50,7 @@ func TestElfMachineMutation(t *testing.T) { elfMachine := fake.NewElfMachine(nil) elfMachine.Annotations = nil + elfMachine.Spec.NumCoresPerSocket = 0 raw, err := marshal(elfMachine) g.Expect(err).NotTo(HaveOccurred()) tests = append(tests, testCase{ @@ -62,6 +63,7 @@ func TestElfMachineMutation(t *testing.T) { expectRespAllowed: true, expectPatchs: []jsonpatch.Operation{ {Operation: "add", Path: "/metadata/annotations", Value: map[string]interface{}{infrav1.CAPEVersionAnnotation: version.CAPEVersion()}}, + {Operation: "add", Path: "/spec/numCoresPerSocket", Value: float64(elfMachine.Spec.NumCPUs)}, }, }) @@ -72,7 +74,7 @@ func TestElfMachineMutation(t *testing.T) { resp := mutation.Handle(context.Background(), tc.admissionRequest) g.Expect(resp.Allowed).Should(Equal(tc.expectRespAllowed)) - g.Expect(resp.Patches).Should(Equal(tc.expectPatchs)) + g.Expect(resp.Patches).Should(ContainElements(tc.expectPatchs)) }) } } diff --git a/webhooks/elfmachine_webhook_validation.go b/webhooks/elfmachine_webhook_validation.go index 959089da..6985cc42 100644 --- a/webhooks/elfmachine_webhook_validation.go +++ b/webhooks/elfmachine_webhook_validation.go @@ -34,7 +34,12 @@ import ( // Error messages. const ( - canOnlyModifiedThroughElfMachineTemplate = "virtual machine resources can only be modified through ElfMachineTemplate %s" + canOnlyModifiedThroughElfMachineTemplate = "virtual machine resources should be the same as ElfMachineTemplate %s" + + diskCapacityCanOnlyBeExpandedMsg = "the disk capacity can only be expanded" + vcpuCapacityCanOnlyBeExpandedMsg = "the vcpu capacity can only be expanded" + memoryCapacityCanOnlyBeExpandedMsg = "the memory capacity can only be expanded" + numCoresPerSocketCannotBeChanged = "the number of cores per socket cannot be changed" ) func (v *ElfMachineValidator) SetupWebhookWithManager(mgr ctrl.Manager) error { @@ -72,27 +77,42 @@ func (v *ElfMachineValidator) ValidateUpdate(ctx goctx.Context, oldObj, newObj r var allErrs field.ErrorList elfMachineTemplateName := annotationsutil.GetTemplateClonedFromName(elfMachine) - if elfMachineTemplateName == "" { - if elfMachine.Spec.DiskGiB < oldElfMachine.Spec.DiskGiB { - allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "diskGiB"), elfMachine.Spec.DiskGiB, diskCapacityCanOnlyBeExpanded)) + if elfMachineTemplateName != "" { + // If the ElfMachine was created using ElfMachineTemplate. ElfMachine's + // resources should be the same as this ElfMachineTemplate. + var elfMachineTemplate infrav1.ElfMachineTemplate + if err := v.Client.Get(ctx, client.ObjectKey{ + Namespace: elfMachine.Namespace, + Name: annotationsutil.GetTemplateClonedFromName(elfMachine), + }, &elfMachineTemplate); err != nil { + return nil, apierrors.NewInternalError(err) } - return nil, aggregateObjErrors(elfMachine.GroupVersionKind().GroupKind(), elfMachine.Name, allErrs) + if elfMachine.Spec.DiskGiB != elfMachineTemplate.Spec.Template.Spec.DiskGiB { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "diskGiB"), elfMachine.Spec.DiskGiB, fmt.Sprintf(canOnlyModifiedThroughElfMachineTemplate, elfMachineTemplateName))) + } + if elfMachine.Spec.NumCPUs != elfMachineTemplate.Spec.Template.Spec.NumCPUs { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "numCPUs"), elfMachine.Spec.NumCPUs, fmt.Sprintf(canOnlyModifiedThroughElfMachineTemplate, elfMachineTemplateName))) + } + if elfMachine.Spec.NumCoresPerSocket != elfMachineTemplate.Spec.Template.Spec.NumCoresPerSocket { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "numCoresPerSocket"), elfMachine.Spec.NumCoresPerSocket, fmt.Sprintf(canOnlyModifiedThroughElfMachineTemplate, elfMachineTemplateName))) + } + if elfMachine.Spec.MemoryMiB != elfMachineTemplate.Spec.Template.Spec.MemoryMiB { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "memoryMiB"), elfMachine.Spec.MemoryMiB, fmt.Sprintf(canOnlyModifiedThroughElfMachineTemplate, elfMachineTemplateName))) + } } - // If the ElfMachine was created using ElfMachineTemplate. ElfMachine's - // resources can only be modified through this ElfMachineTemplate. - - var elfMachineTemplate infrav1.ElfMachineTemplate - if err := v.Client.Get(ctx, client.ObjectKey{ - Namespace: elfMachine.Namespace, - Name: annotationsutil.GetTemplateClonedFromName(elfMachine), - }, &elfMachineTemplate); err != nil { - return nil, apierrors.NewInternalError(err) + if elfMachine.Spec.DiskGiB < oldElfMachine.Spec.DiskGiB { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "diskGiB"), elfMachine.Spec.DiskGiB, diskCapacityCanOnlyBeExpandedMsg)) } - - if elfMachine.Spec.DiskGiB != elfMachineTemplate.Spec.Template.Spec.DiskGiB { - allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "diskGiB"), elfMachine.Spec.DiskGiB, fmt.Sprintf(canOnlyModifiedThroughElfMachineTemplate, elfMachineTemplateName))) + if elfMachine.Spec.NumCPUs < oldElfMachine.Spec.NumCPUs { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "numCPUs"), elfMachine.Spec.NumCPUs, vcpuCapacityCanOnlyBeExpandedMsg)) + } + if elfMachine.Spec.MemoryMiB < oldElfMachine.Spec.MemoryMiB { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "memoryMiB"), elfMachine.Spec.MemoryMiB, memoryCapacityCanOnlyBeExpandedMsg)) + } + if oldElfMachine.Spec.NumCoresPerSocket != 0 && elfMachine.Spec.NumCoresPerSocket != oldElfMachine.Spec.NumCoresPerSocket { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "numCoresPerSocket"), elfMachine.Spec.NumCoresPerSocket, numCoresPerSocketCannotBeChanged)) } return nil, aggregateObjErrors(elfMachine.GroupVersionKind().GroupKind(), elfMachine.Name, allErrs) diff --git a/webhooks/elfmachine_webhook_validation_test.go b/webhooks/elfmachine_webhook_validation_test.go index 2f519e28..1919547a 100644 --- a/webhooks/elfmachine_webhook_validation_test.go +++ b/webhooks/elfmachine_webhook_validation_test.go @@ -44,7 +44,9 @@ func TestElfMachineValidatorValidateUpdate(t *testing.T) { Spec: infrav1.ElfMachineTemplateSpec{ Template: infrav1.ElfMachineTemplateResource{ Spec: infrav1.ElfMachineSpec{ - DiskGiB: 1, + DiskGiB: 1, + NumCPUs: 1, + MemoryMiB: 1, }, }, }, @@ -63,13 +65,81 @@ func TestElfMachineValidatorValidateUpdate(t *testing.T) { }, }, Errs: field.ErrorList{ - field.Invalid(field.NewPath("spec", "diskGiB"), 1, diskCapacityCanOnlyBeExpanded), + field.Invalid(field.NewPath("spec", "diskGiB"), 1, diskCapacityCanOnlyBeExpandedMsg), + }, + }) + tests = append(tests, elfMachineTestCase{ + Name: "Cannot reduce vcpu capacity", + OldEM: &infrav1.ElfMachine{ + Spec: infrav1.ElfMachineSpec{ + NumCPUs: 2, + }, + }, + EM: &infrav1.ElfMachine{ + Spec: infrav1.ElfMachineSpec{ + NumCPUs: 1, + }, + }, + Errs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "numCPUs"), 1, vcpuCapacityCanOnlyBeExpandedMsg), + }, + }) + tests = append(tests, elfMachineTestCase{ + Name: "Cannot reduce memory capacity", + OldEM: &infrav1.ElfMachine{ + Spec: infrav1.ElfMachineSpec{ + MemoryMiB: 2, + }, + }, + EM: &infrav1.ElfMachine{ + Spec: infrav1.ElfMachineSpec{ + MemoryMiB: 1, + }, + }, + Errs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "memoryMiB"), 1, memoryCapacityCanOnlyBeExpandedMsg), + }, + }) + tests = append(tests, elfMachineTestCase{ + Name: "Can update the default numCoresPerSocket", + OldEM: &infrav1.ElfMachine{ + Spec: infrav1.ElfMachineSpec{ + NumCoresPerSocket: 0, + }, + }, + EM: &infrav1.ElfMachine{ + Spec: infrav1.ElfMachineSpec{ + NumCoresPerSocket: 1, + }, + }, + Errs: nil, + }) + tests = append(tests, elfMachineTestCase{ + Name: "Cannot update numCoresPerSocket", + OldEM: &infrav1.ElfMachine{ + Spec: infrav1.ElfMachineSpec{ + NumCoresPerSocket: 1, + }, + }, + EM: &infrav1.ElfMachine{ + Spec: infrav1.ElfMachineSpec{ + NumCoresPerSocket: 2, + }, + }, + Errs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "numCoresPerSocket"), 2, numCoresPerSocketCannotBeChanged), }, }) tests = append(tests, elfMachineTestCase{ - Name: "Disk cannot be modified directly", - OldEM: nil, + Name: "Disk cannot be modified directly", + OldEM: &infrav1.ElfMachine{ + Spec: infrav1.ElfMachineSpec{ + DiskGiB: 1, + NumCPUs: 1, + MemoryMiB: 1, + }, + }, EM: &infrav1.ElfMachine{ ObjectMeta: metav1.ObjectMeta{ Annotations: map[string]string{ @@ -77,7 +147,9 @@ func TestElfMachineValidatorValidateUpdate(t *testing.T) { }, }, Spec: infrav1.ElfMachineSpec{ - DiskGiB: 2, + DiskGiB: 2, + NumCPUs: 1, + MemoryMiB: 1, }, }, Objs: []client.Object{elfMachineTemplate}, @@ -85,6 +157,58 @@ func TestElfMachineValidatorValidateUpdate(t *testing.T) { field.Invalid(field.NewPath("spec", "diskGiB"), 2, fmt.Sprintf(canOnlyModifiedThroughElfMachineTemplate, elfMachineTemplate.Name)), }, }) + tests = append(tests, elfMachineTestCase{ + Name: "vcpu cannot be modified directly", + OldEM: &infrav1.ElfMachine{ + Spec: infrav1.ElfMachineSpec{ + DiskGiB: 1, + NumCPUs: 1, + MemoryMiB: 1, + }, + }, + EM: &infrav1.ElfMachine{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + clusterv1.TemplateClonedFromNameAnnotation: elfMachineTemplate.Name, + }, + }, + Spec: infrav1.ElfMachineSpec{ + DiskGiB: 1, + NumCPUs: 2, + MemoryMiB: 1, + }, + }, + Objs: []client.Object{elfMachineTemplate}, + Errs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "numCPUs"), 2, fmt.Sprintf(canOnlyModifiedThroughElfMachineTemplate, elfMachineTemplate.Name)), + }, + }) + tests = append(tests, elfMachineTestCase{ + Name: "memory cannot be modified directly", + OldEM: &infrav1.ElfMachine{ + Spec: infrav1.ElfMachineSpec{ + DiskGiB: 1, + NumCPUs: 1, + MemoryMiB: 1, + }, + }, + EM: &infrav1.ElfMachine{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + clusterv1.TemplateClonedFromNameAnnotation: elfMachineTemplate.Name, + }, + }, + Spec: infrav1.ElfMachineSpec{ + DiskGiB: 1, + NumCPUs: 1, + MemoryMiB: 2, + }, + }, + Objs: []client.Object{elfMachineTemplate}, + Errs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "memoryMiB"), 2, fmt.Sprintf(canOnlyModifiedThroughElfMachineTemplate, elfMachineTemplate.Name)), + }, + }) for _, tc := range tests { t.Run(tc.Name, func(t *testing.T) { diff --git a/webhooks/elfmachinetemplate_webhook_mutation.go b/webhooks/elfmachinetemplate_webhook_mutation.go index 51c04c34..c503f23b 100644 --- a/webhooks/elfmachinetemplate_webhook_mutation.go +++ b/webhooks/elfmachinetemplate_webhook_mutation.go @@ -63,6 +63,10 @@ func (m *ElfMachineTemplateMutation) Handle(ctx goctx.Context, request admission return admission.Errored(http.StatusBadRequest, err) } + if elfMachineTemplate.Spec.Template.Spec.NumCoresPerSocket <= 0 { + elfMachineTemplate.Spec.Template.Spec.NumCoresPerSocket = elfMachineTemplate.Spec.Template.Spec.NumCPUs + } + devices := elfMachineTemplate.Spec.Template.Spec.Network.Devices for i := range len(devices) { for j := range len(devices[i].AddressesFromPools) { diff --git a/webhooks/elfmachinetemplate_webhook_mutation_test.go b/webhooks/elfmachinetemplate_webhook_mutation_test.go index 9cbb8398..6d6d3aee 100644 --- a/webhooks/elfmachinetemplate_webhook_mutation_test.go +++ b/webhooks/elfmachinetemplate_webhook_mutation_test.go @@ -42,7 +42,7 @@ func TestElfMachineMutationTemplate(t *testing.T) { }, Spec: infrav1.ElfMachineTemplateSpec{ Template: infrav1.ElfMachineTemplateResource{ - Spec: infrav1.ElfMachineSpec{}, + Spec: infrav1.ElfMachineSpec{NumCoresPerSocket: 1}, }, }, } @@ -71,6 +71,24 @@ func TestElfMachineMutationTemplate(t *testing.T) { }, }) + elfMachineTemplate.Spec.Template.Spec.Network.Devices = nil + elfMachineTemplate.Spec.Template.Spec.NumCPUs = 1 + elfMachineTemplate.Spec.Template.Spec.NumCoresPerSocket = 0 + raw, err = marshal(elfMachineTemplate) + g.Expect(err).NotTo(HaveOccurred()) + tests = append(tests, testCase{ + name: "should set default values for numCoresPerSocket", + admissionRequest: admission.Request{AdmissionRequest: admissionv1.AdmissionRequest{ + Kind: metav1.GroupVersionKind{Group: infrav1.GroupVersion.Group, Version: infrav1.GroupVersion.Version, Kind: "ElfMachine"}, + Operation: admissionv1.Create, + Object: runtime.RawExtension{Raw: raw}, + }}, + expectRespAllowed: true, + expectPatchs: []jsonpatch.Operation{ + {Operation: "add", Path: "/spec/template/spec/numCoresPerSocket", Value: float64(elfMachineTemplate.Spec.Template.Spec.NumCPUs)}, + }, + }) + for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { mutation := ElfMachineTemplateMutation{} diff --git a/webhooks/elfmachinetemplate_webhook_validation.go b/webhooks/elfmachinetemplate_webhook_validation.go index 2597b2ac..a1726a8f 100644 --- a/webhooks/elfmachinetemplate_webhook_validation.go +++ b/webhooks/elfmachinetemplate_webhook_validation.go @@ -32,8 +32,10 @@ import ( // Error messages. const ( - diskCapacityCannotLessThanZeroMsg = "the disk capacity can only greater than or equal to 0" - diskCapacityCanOnlyBeExpanded = "the disk capacity can only be expanded" + diskCapacityCannotLessThanZeroMsg = "the disk capacity can only greater than or equal to 0" + memoryCannotLessThanZeroMsg = "the memory can only greater than 0" + numCPUsCannotLessThanZeroMsg = "the umCPUs can only greater than 0" + numCoresPerSocketCannotLessThanZeroMsg = "the numCoresPerSocket can only greater than 0" ) func (v *ElfMachineTemplateValidator) SetupWebhookWithManager(mgr ctrl.Manager) error { @@ -62,28 +64,26 @@ func (v *ElfMachineTemplateValidator) ValidateCreate(ctx goctx.Context, obj runt allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "diskGiB"), elfMachineTemplate.Spec.Template.Spec.DiskGiB, diskCapacityCannotLessThanZeroMsg)) } - return nil, aggregateObjErrors(elfMachineTemplate.GroupVersionKind().GroupKind(), elfMachineTemplate.Name, allErrs) -} - -// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type. -func (v *ElfMachineTemplateValidator) ValidateUpdate(ctx goctx.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) { - oldElfMachineTemplate, ok := oldObj.(*infrav1.ElfMachineTemplate) //nolint:forcetypeassert - if !ok { - return nil, apierrors.NewBadRequest(fmt.Sprintf("expected an ElfMachineTemplate but got a %T", oldObj)) + if elfMachineTemplate.Spec.Template.Spec.MemoryMiB <= 0 { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "memoryMiB"), elfMachineTemplate.Spec.Template.Spec.MemoryMiB, memoryCannotLessThanZeroMsg)) } - elfMachineTemplate, ok := newObj.(*infrav1.ElfMachineTemplate) //nolint:forcetypeassert - if !ok { - return nil, apierrors.NewBadRequest(fmt.Sprintf("expected an ElfMachineTemplate but got a %T", newObj)) + + if elfMachineTemplate.Spec.Template.Spec.NumCPUs <= 0 { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "numCPUs"), elfMachineTemplate.Spec.Template.Spec.NumCPUs, numCPUsCannotLessThanZeroMsg)) } - var allErrs field.ErrorList - if elfMachineTemplate.Spec.Template.Spec.DiskGiB < oldElfMachineTemplate.Spec.Template.Spec.DiskGiB { - allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "diskGiB"), elfMachineTemplate.Spec.Template.Spec.DiskGiB, diskCapacityCanOnlyBeExpanded)) + if elfMachineTemplate.Spec.Template.Spec.NumCoresPerSocket <= 0 { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "numCoresPerSocket"), elfMachineTemplate.Spec.Template.Spec.NumCoresPerSocket, numCoresPerSocketCannotLessThanZeroMsg)) } return nil, aggregateObjErrors(elfMachineTemplate.GroupVersionKind().GroupKind(), elfMachineTemplate.Name, allErrs) } +// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type. +func (v *ElfMachineTemplateValidator) ValidateUpdate(ctx goctx.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) { + return nil, nil +} + // ValidateDelete implements webhook.Validator so a webhook will be registered for the type. func (v *ElfMachineTemplateValidator) ValidateDelete(ctx goctx.Context, obj runtime.Object) (admission.Warnings, error) { return nil, nil diff --git a/webhooks/elfmachinetemplate_webhook_validation_test.go b/webhooks/elfmachinetemplate_webhook_validation_test.go index 69be42e1..2d9e62d3 100644 --- a/webhooks/elfmachinetemplate_webhook_validation_test.go +++ b/webhooks/elfmachinetemplate_webhook_validation_test.go @@ -37,7 +37,10 @@ func TestElfMachineTemplateValidatorValidateCreate(t *testing.T) { EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{ Template: infrav1.ElfMachineTemplateResource{ Spec: infrav1.ElfMachineSpec{ - DiskGiB: -1, + NumCPUs: 1, + NumCoresPerSocket: 1, + MemoryMiB: 1, + DiskGiB: -1, }, }, }}, @@ -49,7 +52,10 @@ func TestElfMachineTemplateValidatorValidateCreate(t *testing.T) { EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{ Template: infrav1.ElfMachineTemplateResource{ Spec: infrav1.ElfMachineSpec{ - DiskGiB: 0, + NumCPUs: 1, + NumCoresPerSocket: 1, + MemoryMiB: 1, + DiskGiB: 0, }, }, }}, @@ -59,46 +65,58 @@ func TestElfMachineTemplateValidatorValidateCreate(t *testing.T) { EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{ Template: infrav1.ElfMachineTemplateResource{ Spec: infrav1.ElfMachineSpec{ - DiskGiB: 100, + NumCPUs: 1, + NumCoresPerSocket: 1, + MemoryMiB: 1, + DiskGiB: 100, }, }, }}, Errs: nil, - }) - - validator := &ElfMachineTemplateValidator{} - - for _, tc := range tests { - t.Run(tc.Name, func(t *testing.T) { - warnings, err := validator.ValidateCreate(goctx.Background(), tc.EMT) - g.Expect(warnings).To(BeEmpty()) - expectTestCase(g, tc, err) - }) - } -} - -func TestElfMachineTemplateValidatorValidateUpdate(t *testing.T) { - g := NewWithT(t) - - var tests []testCaseEMT - tests = append(tests, testCaseEMT{ - Name: "Cannot reduce disk capacity", - OldEMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{ + }, testCaseEMT{ + Name: "memory cannot be less than 0", + EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{ + Template: infrav1.ElfMachineTemplateResource{ + Spec: infrav1.ElfMachineSpec{ + NumCPUs: 1, + NumCoresPerSocket: 1, + DiskGiB: 100, + MemoryMiB: 0, + }, + }, + }}, + Errs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "template", "spec", "memoryMiB"), 0, memoryCannotLessThanZeroMsg), + }, + }, testCaseEMT{ + Name: "numCPUs cannot be less than 0", + EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{ Template: infrav1.ElfMachineTemplateResource{ Spec: infrav1.ElfMachineSpec{ - DiskGiB: 2, + NumCoresPerSocket: 1, + DiskGiB: 100, + MemoryMiB: 1, + NumCPUs: 0, }, }, }}, + Errs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "template", "spec", "numCPUs"), 0, numCPUsCannotLessThanZeroMsg), + }, + }, testCaseEMT{ + Name: "numCoresPerSocket cannot be less than 0", EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{ Template: infrav1.ElfMachineTemplateResource{ Spec: infrav1.ElfMachineSpec{ - DiskGiB: 1, + NumCPUs: 1, + DiskGiB: 100, + MemoryMiB: 1, + NumCoresPerSocket: 0, }, }, }}, Errs: field.ErrorList{ - field.Invalid(field.NewPath("spec", "template", "spec", "diskGiB"), 1, diskCapacityCanOnlyBeExpanded), + field.Invalid(field.NewPath("spec", "template", "spec", "numCoresPerSocket"), 0, numCoresPerSocketCannotLessThanZeroMsg), }, }) @@ -106,7 +124,7 @@ func TestElfMachineTemplateValidatorValidateUpdate(t *testing.T) { for _, tc := range tests { t.Run(tc.Name, func(t *testing.T) { - warnings, err := validator.ValidateUpdate(goctx.Background(), tc.OldEMT, tc.EMT) + warnings, err := validator.ValidateCreate(goctx.Background(), tc.EMT) g.Expect(warnings).To(BeEmpty()) expectTestCase(g, tc, err) })