From 86666424220ae2dfe6c1841f157cb0e96372f656 Mon Sep 17 00:00:00 2001
From: Mike Rostermund
Date: Thu, 26 Sep 2024 11:47:11 +0200
Subject: [PATCH 1/2] Bump default to humio/humio-core:1.153.1 and add
 EnableZoneAwareness field to HumioUpdateStrategy

EnableZoneAwareness is true by default.

When EnableZoneAwareness is true:
- pod replacements will be performed one zone at a time.
- if no zone has been decided upon yet, pick the first one from the pods that need replacing.
- as soon as no more pods are found in the current zone that need replacing, the zone marker is cleared.

This also refactors the uses of os.Getenv() to helper functions to reduce the number of places we refer to the env var names.
---
 Makefile | 3 +-
 api/v1alpha1/humiocluster_types.go | 10 +-
 api/v1alpha1/zz_generated.deepcopy.go | 7 +-
 .../crds/core.humio.com_humioclusters.yaml | 29 +++-
 .../bases/core.humio.com_humioclusters.yaml | 29 +++-
 controllers/humiocluster_controller.go | 126 +++++++++-----
 controllers/humiocluster_defaults.go | 45 ++++-
 controllers/humiocluster_pods.go | 83 ++++++++--
 controllers/humiocluster_status.go | 38 +++--
 .../clusters/humiocluster_controller_test.go | 155 ++++++++++++++++--
 controllers/suite/clusters/suite_test.go | 29 ++--
 controllers/suite/common.go | 68 ++++++--
 .../humioresources_controller_test.go | 15 +-
 controllers/suite/resources/suite_test.go | 14 +-
 controllers/versions/versions.go | 25 +--
 images/helper/go.mod | 3 -
 images/helper/go.sum | 6 -
 images/helper/main.go | 2 +-
 pkg/helpers/helpers.go | 42 ++++-
 pkg/kubernetes/nodes.go | 21 +++
 20 files changed, 569 insertions(+), 181 deletions(-)

diff --git a/Makefile b/Makefile
index 542916b9..a2d7c8b0 100644
--- a/Makefile
+++ b/Makefile
@@ -52,8 +52,7 @@ test: manifests generate fmt vet ginkgo ## Run tests.
 go install sigs.k8s.io/controller-runtime/tools/setup-envtest@latest
 $(SHELL) -c "\
 eval \$$($(GOBIN)/setup-envtest use -p env ${TEST_K8S_VERSION}); \
- export USE_CERTMANAGER=false; \
- export TEST_USE_EXISTING_CLUSTER=false; \
+ export TEST_USING_ENVTEST=true; \
 $(GINKGO) --label-filter=envtest -vv --no-color --procs 3 -output-dir=${PWD} -keep-separate-reports -race --junit-report=test-results-junit.xml --randomize-suites --randomize-all -timeout 10m ./... -covermode=count -coverprofile cover.out \
 "

diff --git a/api/v1alpha1/humiocluster_types.go b/api/v1alpha1/humiocluster_types.go
index ec2f8001..d7bbb592 100644
--- a/api/v1alpha1/humiocluster_types.go
+++ b/api/v1alpha1/humiocluster_types.go
@@ -282,8 +282,14 @@ type HumioUpdateStrategy struct {
 // +kubebuilder:validation:Enum=OnDelete;RollingUpdate;ReplaceAllOnUpdate;RollingUpdateBestEffort
 Type string `json:"type,omitempty"`

- // The minimum time in seconds that a pod must be ready before the next pod can be deleted when doing rolling update.
+ // MinReadySeconds is the minimum time in seconds that a pod must be ready before the next pod can be deleted when doing rolling update.
 MinReadySeconds int32 `json:"minReadySeconds,omitempty"`
+
+ // EnableZoneAwareness toggles zone awareness on or off during updates. When enabled, the pod replacement logic
+ // will go through all pods in a specific zone before it starts replacing pods in the next zone.
+ // If pods are failing, they bypass the zone limitation and are restarted immediately - ignoring the zone.
+ // Zone awareness is enabled by default.
+ EnableZoneAwareness *bool `json:"enableZoneAwareness,omitempty"` } type HumioNodePoolSpec struct { @@ -384,6 +390,8 @@ type HumioNodePoolStatus struct { State string `json:"state,omitempty"` // DesiredPodRevision holds the desired pod revision for pods of the given node pool. DesiredPodRevision int `json:"desiredPodRevision,omitempty"` + // ZoneUnderMaintenance holds the name of the availability zone currently under maintenance + ZoneUnderMaintenance string `json:"zoneUnderMaintenance,omitempty"` } // HumioClusterStatus defines the observed state of HumioCluster diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 4b7b833e..9daf5858 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -1460,7 +1460,7 @@ func (in *HumioNodeSpec) DeepCopyInto(out *HumioNodeSpec) { if in.UpdateStrategy != nil { in, out := &in.UpdateStrategy, &out.UpdateStrategy *out = new(HumioUpdateStrategy) - **out = **in + (*in).DeepCopyInto(*out) } in.NodePoolFeatures.DeepCopyInto(&out.NodePoolFeatures) } @@ -1895,6 +1895,11 @@ func (in *HumioTokenSecretStatus) DeepCopy() *HumioTokenSecretStatus { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *HumioUpdateStrategy) DeepCopyInto(out *HumioUpdateStrategy) { *out = *in + if in.EnableZoneAwareness != nil { + in, out := &in.EnableZoneAwareness, &out.EnableZoneAwareness + *out = new(bool) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HumioUpdateStrategy. diff --git a/charts/humio-operator/crds/core.humio.com_humioclusters.yaml b/charts/humio-operator/crds/core.humio.com_humioclusters.yaml index 6199bf43..4112c5c6 100644 --- a/charts/humio-operator/crds/core.humio.com_humioclusters.yaml +++ b/charts/humio-operator/crds/core.humio.com_humioclusters.yaml @@ -13082,10 +13082,17 @@ spec: UpdateStrategy controls how Humio pods are updated when changes are made to the HumioCluster resource that results in a change to the Humio pods properties: + enableZoneAwareness: + description: |- + EnableZoneAwareness toggles zone awareness on or off during updates. When enabled, the pod replacement logic + will go through all pods in a specific zone before it starts replacing pods in the next zone. + If pods are failing, they bypass the zone limitation and are restarted immediately - ignoring the zone. + Zone awareness is enabled by default. + type: boolean minReadySeconds: - description: The minimum time in seconds that a pod - must be ready before the next pod can be deleted when - doing rolling update. + description: MinReadySeconds is the minimum time in + seconds that a pod must be ready before the next pod + can be deleted when doing rolling update. format: int32 type: integer type: @@ -14987,9 +14994,17 @@ spec: UpdateStrategy controls how Humio pods are updated when changes are made to the HumioCluster resource that results in a change to the Humio pods properties: + enableZoneAwareness: + description: |- + EnableZoneAwareness toggles zone awareness on or off during updates. When enabled, the pod replacement logic + will go through all pods in a specific zone before it starts replacing pods in the next zone. + If pods are failing, they bypass the zone limitation and are restarted immediately - ignoring the zone. + Zone awareness is enabled by default. 
+ type: boolean minReadySeconds: - description: The minimum time in seconds that a pod must be ready - before the next pod can be deleted when doing rolling update. + description: MinReadySeconds is the minimum time in seconds that + a pod must be ready before the next pod can be deleted when + doing rolling update. format: int32 type: integer type: @@ -15062,6 +15077,10 @@ spec: From there it can be "Running", "Upgrading", "Restarting" or "Pending" type: string + zoneUnderMaintenance: + description: ZoneUnderMaintenance holds the name of the availability + zone currently under maintenance + type: string required: - name type: object diff --git a/config/crd/bases/core.humio.com_humioclusters.yaml b/config/crd/bases/core.humio.com_humioclusters.yaml index 6199bf43..4112c5c6 100644 --- a/config/crd/bases/core.humio.com_humioclusters.yaml +++ b/config/crd/bases/core.humio.com_humioclusters.yaml @@ -13082,10 +13082,17 @@ spec: UpdateStrategy controls how Humio pods are updated when changes are made to the HumioCluster resource that results in a change to the Humio pods properties: + enableZoneAwareness: + description: |- + EnableZoneAwareness toggles zone awareness on or off during updates. When enabled, the pod replacement logic + will go through all pods in a specific zone before it starts replacing pods in the next zone. + If pods are failing, they bypass the zone limitation and are restarted immediately - ignoring the zone. + Zone awareness is enabled by default. + type: boolean minReadySeconds: - description: The minimum time in seconds that a pod - must be ready before the next pod can be deleted when - doing rolling update. + description: MinReadySeconds is the minimum time in + seconds that a pod must be ready before the next pod + can be deleted when doing rolling update. format: int32 type: integer type: @@ -14987,9 +14994,17 @@ spec: UpdateStrategy controls how Humio pods are updated when changes are made to the HumioCluster resource that results in a change to the Humio pods properties: + enableZoneAwareness: + description: |- + EnableZoneAwareness toggles zone awareness on or off during updates. When enabled, the pod replacement logic + will go through all pods in a specific zone before it starts replacing pods in the next zone. + If pods are failing, they bypass the zone limitation and are restarted immediately - ignoring the zone. + Zone awareness is enabled by default. + type: boolean minReadySeconds: - description: The minimum time in seconds that a pod must be ready - before the next pod can be deleted when doing rolling update. + description: MinReadySeconds is the minimum time in seconds that + a pod must be ready before the next pod can be deleted when + doing rolling update. format: int32 type: integer type: @@ -15062,6 +15077,10 @@ spec: From there it can be "Running", "Upgrading", "Restarting" or "Pending" type: string + zoneUnderMaintenance: + description: ZoneUnderMaintenance holds the name of the availability + zone currently under maintenance + type: string required: - name type: object diff --git a/controllers/humiocluster_controller.go b/controllers/humiocluster_controller.go index 7ce6466b..7641fb5e 100644 --- a/controllers/humiocluster_controller.go +++ b/controllers/humiocluster_controller.go @@ -129,17 +129,17 @@ func (r *HumioClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request if err := r.setImageFromSource(ctx, pool); err != nil { return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). withMessage(err.Error()). 
- withNodePoolState(humiov1alpha1.HumioClusterStateConfigError, pool.GetNodePoolName(), pool.GetDesiredPodRevision())) + withNodePoolState(humiov1alpha1.HumioClusterStateConfigError, pool.GetNodePoolName(), pool.GetDesiredPodRevision(), pool.GetZoneUnderMaintenance())) } if err := r.ensureValidHumioVersion(pool); err != nil { return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). withMessage(err.Error()). - withNodePoolState(humiov1alpha1.HumioClusterStateConfigError, pool.GetNodePoolName(), pool.GetDesiredPodRevision())) + withNodePoolState(humiov1alpha1.HumioClusterStateConfigError, pool.GetNodePoolName(), pool.GetDesiredPodRevision(), pool.GetZoneUnderMaintenance())) } if err := r.ensureValidStorageConfiguration(pool); err != nil { return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). withMessage(err.Error()). - withNodePoolState(humiov1alpha1.HumioClusterStateConfigError, pool.GetNodePoolName(), pool.GetDesiredPodRevision())) + withNodePoolState(humiov1alpha1.HumioClusterStateConfigError, pool.GetNodePoolName(), pool.GetDesiredPodRevision(), pool.GetZoneUnderMaintenance())) } } @@ -198,7 +198,7 @@ func (r *HumioClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request if err := r.validateInitialPodSpec(pool); err != nil { return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). withMessage(err.Error()). - withNodePoolState(humiov1alpha1.HumioClusterStateConfigError, pool.GetNodePoolName(), pool.GetDesiredPodRevision())) + withNodePoolState(humiov1alpha1.HumioClusterStateConfigError, pool.GetNodePoolName(), pool.GetDesiredPodRevision(), pool.GetZoneUnderMaintenance())) } } @@ -252,7 +252,7 @@ func (r *HumioClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request desiredPodRevision++ } _, err = r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). - withNodePoolState(hc.Status.State, pool.GetNodePoolName(), desiredPodRevision)) + withNodePoolState(hc.Status.State, pool.GetNodePoolName(), desiredPodRevision, "")) return reconcile.Result{Requeue: true}, err } } @@ -261,7 +261,7 @@ func (r *HumioClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request if err := r.ensurePersistentVolumeClaimsExist(ctx, hc, pool); err != nil { opts := statusOptions() if hc.Status.State != humiov1alpha1.HumioClusterStateRestarting && hc.Status.State != humiov1alpha1.HumioClusterStateUpgrading { - opts.withNodePoolState(humiov1alpha1.HumioClusterStatePending, pool.GetNodePoolName(), pool.GetDesiredPodRevision()) + opts.withNodePoolState(humiov1alpha1.HumioClusterStatePending, pool.GetNodePoolName(), pool.GetDesiredPodRevision(), pool.GetZoneUnderMaintenance()) } return r.updateStatus(ctx, r.Client.Status(), hc, opts. withMessage(err.Error())) @@ -1833,17 +1833,17 @@ func (r *HumioClusterReconciler) ensureHumioServiceAccountAnnotations(ctx contex // The behavior of this depends on what, if anything, was changed in the pod. If there are changes that fall under a // rolling update, then the pod restart policy is set to PodRestartPolicyRolling and the reconciliation will continue if // there are any pods not in a ready state. This is so replacement pods may be created. -// If there are changes that fall under a recreate update, the the pod restart policy is set to PodRestartPolicyRecreate +// If there are changes that fall under a recreate update, then the pod restart policy is set to PodRestartPolicyRecreate // and the reconciliation will requeue and the deletions will continue to be executed until all the pods have been // removed. 
func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Context, hc *humiov1alpha1.HumioCluster, hnp *HumioNodePool) (reconcile.Result, error) { - foundPodList, err := kubernetes.ListPods(ctx, r, hnp.GetNamespace(), hnp.GetNodePoolLabels()) + foundPodListForNodePool, err := kubernetes.ListPods(ctx, r, hnp.GetNamespace(), hnp.GetNodePoolLabels()) if err != nil { return reconcile.Result{}, r.logErrorAndReturn(err, "failed to list pods") } // if we do not have any pods running we have nothing to delete - if len(foundPodList) == 0 { + if len(foundPodListForNodePool) == 0 { return reconcile.Result{}, nil } @@ -1854,11 +1854,6 @@ func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Cont attachments.dataVolumeSource = hnp.GetDataVolumePersistentVolumeClaimSpecTemplate("") } - podsStatus, err := r.getPodsStatus(ctx, hc, hnp, foundPodList) - if err != nil { - return reconcile.Result{}, r.logErrorAndReturn(err, "failed to get pod status") - } - envVarSourceData, err := r.getEnvVarSource(ctx, hnp) if err != nil { result, _ := r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). @@ -1885,17 +1880,9 @@ func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Cont } } - // prioritize deleting the pods with errors - var podList []corev1.Pod - if podsStatus.havePodsWithErrors() { - r.Log.Info(fmt.Sprintf("found %d humio pods with errors", len(podsStatus.podErrors))) - podList = podsStatus.podErrors - } else { - podList = foundPodList - } - desiredLifecycleState, err := r.getPodDesiredLifecycleState(hnp, podList, attachments) + podsStatus, err := r.getPodsStatus(ctx, hc, hnp, foundPodListForNodePool) if err != nil { - return reconcile.Result{}, r.logErrorAndReturn(err, "got error when getting pod desired lifecycle") + return reconcile.Result{}, r.logErrorAndReturn(err, "failed to get pod status") } if podsStatus.havePodsRequiringDeletion() { @@ -1908,6 +1895,67 @@ func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Cont return reconcile.Result{RequeueAfter: time.Second + 1}, nil } + // prioritize deleting the pods with errors + var podListConsideredForDeletion []corev1.Pod + podsWithErrors := false + if podsStatus.havePodsWithErrors() { + r.Log.Info(fmt.Sprintf("found %d humio pods with errors", len(podsStatus.podErrors))) + podListConsideredForDeletion = podsStatus.podErrors + podsWithErrors = true + } else { + podListConsideredForDeletion = foundPodListForNodePool + + // if zone awareness is enabled, pick a zone if we haven't picked one yet + if !helpers.UseEnvtest() { + if hnp.GetState() == humiov1alpha1.HumioClusterStateUpgrading || hnp.GetState() == humiov1alpha1.HumioClusterStateRestarting { + if *hnp.GetUpdateStrategy().EnableZoneAwareness { + // If we still have pods without the desired revision and have not picked a zone yet, pick one. 
+ if hnp.GetZoneUnderMaintenance() == "" && len(podListConsideredForDeletion) > 0 { + // Filter out any pods that already have the right pod revision + podListForCurrentZoneWithWrongPodRevision := FilterPodsExcludePodsWithPodRevision(foundPodListForNodePool, hnp.GetDesiredPodRevision()) + r.Log.Info(fmt.Sprintf("zone awareness enabled, len(podListForCurrentZoneWithWrongPodRevision)=%d", len(podListForCurrentZoneWithWrongPodRevision))) + + if len(podListForCurrentZoneWithWrongPodRevision) > 0 { + newZoneUnderMaintenance, err := kubernetes.GetZoneForNodeName(ctx, r, podListForCurrentZoneWithWrongPodRevision[0].Spec.NodeName) + if err != nil { + return reconcile.Result{}, r.logErrorAndReturn(err, "unable to fetch zone") + } + + return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). + withNodePoolState(hnp.GetState(), hnp.GetNodePoolName(), hnp.GetDesiredPodRevision(), newZoneUnderMaintenance)) + } + } + + // If there's a zone marked as under maintenance, we clear the zone-under-maintenance marker if no more work is left in that zone + if hnp.GetZoneUnderMaintenance() != "" { + r.Log.Info(fmt.Sprintf("zone awareness enabled, len(podListConsideredForDeletion)=%d", len(podListConsideredForDeletion))) + + // We have pods left and need to filter them by the zone marked as under maintenance + podListForCurrentZone, err := FilterPodsByZoneName(ctx, r, podListConsideredForDeletion, hnp.GetZoneUnderMaintenance()) + if err != nil { + return reconcile.Result{}, r.logErrorAndReturn(err, "got error filtering pods by zone name") + } + r.Log.Info(fmt.Sprintf("zone awareness enabled, len(podListForCurrentZone)=%d", len(podListForCurrentZone))) + + // Filter out any pods that already have the right pod revision, and clear the zone-under-maintenance marker if no pods are left after filtering + podListForCurrentZoneWithWrongPodRevision := FilterPodsExcludePodsWithPodRevision(podListForCurrentZone, hnp.GetDesiredPodRevision()) + r.Log.Info(fmt.Sprintf("zone awareness enabled, len(podListForCurrentZoneWithWrongPodRevision)=%d", len(podListForCurrentZoneWithWrongPodRevision))) + if len(podListForCurrentZoneWithWrongPodRevision) == 0 { + r.Log.Info(fmt.Sprintf("zone awareness enabled, no more pods for nodePool=%s in zone=%s", hnp.GetNodePoolName(), hnp.GetZoneUnderMaintenance())) + return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). + withNodePoolState(hnp.GetState(), hnp.GetNodePoolName(), hnp.GetDesiredPodRevision(), "")) + } + } + } + } + } + } + + desiredLifecycleState, err := r.getPodDesiredLifecycleState(ctx, hnp, podListConsideredForDeletion, attachments, podsWithErrors) + if err != nil { + return reconcile.Result{}, r.logErrorAndReturn(err, "got error when getting pod desired lifecycle") + } + // If we are currently deleting pods, then check if the cluster state is Running or in a ConfigError state. If it // is, then change to an appropriate state depending on the restart policy. // If the cluster state is set as per the restart policy: @@ -1919,7 +1967,7 @@ func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Cont if desiredLifecycleState.WantsUpgrade() { r.Log.Info(fmt.Sprintf("changing cluster state from %s to %s with pod revision %d for node pool %s", hc.Status.State, humiov1alpha1.HumioClusterStateUpgrading, podRevision, hnp.GetNodePoolName())) if result, err := r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). 
- withNodePoolState(humiov1alpha1.HumioClusterStateUpgrading, hnp.GetNodePoolName(), podRevision)); err != nil {
+ withNodePoolState(humiov1alpha1.HumioClusterStateUpgrading, hnp.GetNodePoolName(), podRevision, "")); err != nil {
 return result, err
 }
 return reconcile.Result{Requeue: true}, nil
@@ -1927,27 +1975,21 @@ func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Cont
 if !desiredLifecycleState.WantsUpgrade() && desiredLifecycleState.WantsRestart() {
 r.Log.Info(fmt.Sprintf("changing cluster state from %s to %s with pod revision %d for node pool %s", hc.Status.State, humiov1alpha1.HumioClusterStateRestarting, podRevision, hnp.GetNodePoolName()))
 if result, err := r.updateStatus(ctx, r.Client.Status(), hc, statusOptions().
- withNodePoolState(humiov1alpha1.HumioClusterStateRestarting, hnp.GetNodePoolName(), podRevision)); err != nil {
+ withNodePoolState(humiov1alpha1.HumioClusterStateRestarting, hnp.GetNodePoolName(), podRevision, "")); err != nil {
 return result, err
 }
 return reconcile.Result{Requeue: true}, nil
 }
 }
 if desiredLifecycleState.ShouldDeletePod() {
- if hc.Status.State == humiov1alpha1.HumioClusterStateRestarting && podsStatus.waitingOnPods() && desiredLifecycleState.ShouldRollingRestart() {
- r.Log.Info(fmt.Sprintf("pod %s should be deleted, but waiting because not all other pods are "+
- "ready. waitingOnPods=%v, clusterState=%s", desiredLifecycleState.pod.Name,
- podsStatus.waitingOnPods(), hc.Status.State))
- return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions().
- withMessage("waiting for pods to become ready"))
- }
-
- if hc.Status.State == humiov1alpha1.HumioClusterStateUpgrading && podsStatus.waitingOnPods() && desiredLifecycleState.ShouldRollingRestart() {
- r.Log.Info(fmt.Sprintf("pod %s should be deleted, but waiting because not all other pods are "+
- "ready. waitingOnPods=%v, clusterState=%s", desiredLifecycleState.pod.Name,
- podsStatus.waitingOnPods(), hc.Status.State))
- return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions().
- withMessage("waiting for pods to become ready"))
+ if hc.Status.State == humiov1alpha1.HumioClusterStateRestarting || hc.Status.State == humiov1alpha1.HumioClusterStateUpgrading {
+ if podsStatus.waitingOnPods() && desiredLifecycleState.ShouldRollingRestart() {
+ r.Log.Info(fmt.Sprintf("pod %s should be deleted, but waiting because not all other pods are "+
+ "ready. waitingOnPods=%v, clusterState=%s", desiredLifecycleState.pod.Name,
+ podsStatus.waitingOnPods(), hc.Status.State))
+ return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions().
+ withMessage("waiting for pods to become ready"))
+ }
 }

 var remainingMinReadyWaitTime = desiredLifecycleState.RemainingMinReadyWaitTime(podsStatus.podsReady)
@@ -1984,12 +2026,12 @@ func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Cont
 // If we are no longer waiting on or deleting pods, and all the revisions are in sync, then we know the upgrade or
 // restart is complete and we can set the cluster state back to HumioClusterStateRunning.
 // It's possible we entered a ConfigError state during an upgrade or restart, and in this case, we should reset the
- state to Running if the the pods are healthy but we're in a ConfigError state.
+ state to Running if the pods are healthy but we're in a ConfigError state.
if !podsStatus.waitingOnPods() && !desiredLifecycleState.WantsUpgrade() && !desiredLifecycleState.WantsRestart() && podsStatus.podRevisionsInSync() { if hc.Status.State == humiov1alpha1.HumioClusterStateRestarting || hc.Status.State == humiov1alpha1.HumioClusterStateUpgrading || hc.Status.State == humiov1alpha1.HumioClusterStateConfigError { r.Log.Info(fmt.Sprintf("no longer deleting pods. changing cluster state from %s to %s", hc.Status.State, humiov1alpha1.HumioClusterStateRunning)) if result, err := r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). - withNodePoolState(humiov1alpha1.HumioClusterStateRunning, hnp.GetNodePoolName(), hnp.GetDesiredPodRevision())); err != nil { + withNodePoolState(humiov1alpha1.HumioClusterStateRunning, hnp.GetNodePoolName(), hnp.GetDesiredPodRevision(), "")); err != nil { return result, err } } diff --git a/controllers/humiocluster_defaults.go b/controllers/humiocluster_defaults.go index 5b4561e8..ef5658f3 100644 --- a/controllers/humiocluster_defaults.go +++ b/controllers/humiocluster_defaults.go @@ -18,7 +18,6 @@ package controllers import ( "fmt" - "os" "reflect" "strconv" "strings" @@ -82,13 +81,19 @@ type HumioNodePool struct { ingress humiov1alpha1.HumioClusterIngressSpec clusterAnnotations map[string]string desiredPodRevision int + state string + zoneUnderMaintenance string } func NewHumioNodeManagerFromHumioCluster(hc *humiov1alpha1.HumioCluster) *HumioNodePool { desiredPodRevision := 0 + zoneUnderMaintenance := "" + state := "" for _, status := range hc.Status.NodePoolStatus { if status.Name == hc.Name { desiredPodRevision = status.DesiredPodRevision + zoneUnderMaintenance = status.ZoneUnderMaintenance + state = status.State break } } @@ -151,14 +156,21 @@ func NewHumioNodeManagerFromHumioCluster(hc *humiov1alpha1.HumioCluster) *HumioN ingress: hc.Spec.Ingress, clusterAnnotations: hc.Annotations, desiredPodRevision: desiredPodRevision, + zoneUnderMaintenance: zoneUnderMaintenance, + state: state, } } func NewHumioNodeManagerFromHumioNodePool(hc *humiov1alpha1.HumioCluster, hnp *humiov1alpha1.HumioNodePoolSpec) *HumioNodePool { desiredPodRevision := 0 + zoneUnderMaintenance := "" + state := "" + for _, status := range hc.Status.NodePoolStatus { if status.Name == strings.Join([]string{hc.Name, hnp.Name}, "-") { desiredPodRevision = status.DesiredPodRevision + zoneUnderMaintenance = status.ZoneUnderMaintenance + state = status.State break } } @@ -221,6 +233,8 @@ func NewHumioNodeManagerFromHumioNodePool(hc *humiov1alpha1.HumioCluster, hnp *h ingress: hc.Spec.Ingress, clusterAnnotations: hc.Annotations, desiredPodRevision: desiredPodRevision, + zoneUnderMaintenance: zoneUnderMaintenance, + state: state, } } @@ -252,8 +266,8 @@ func (hnp *HumioNodePool) GetImage() string { return hnp.humioNodeSpec.Image } - if os.Getenv("HUMIO_OPERATOR_DEFAULT_HUMIO_CORE_IMAGE") != "" { - return os.Getenv("HUMIO_OPERATOR_DEFAULT_HUMIO_CORE_IMAGE") + if defaultImageFromEnvVar := helpers.GetDefaultHumioCoreImageFromEnvVar(); defaultImageFromEnvVar != "" { + return defaultImageFromEnvVar } return versions.DefaultHumioImageVersion() @@ -268,8 +282,8 @@ func (hnp *HumioNodePool) GetHelperImage() string { return hnp.humioNodeSpec.HelperImage } - if os.Getenv("HUMIO_OPERATOR_DEFAULT_HUMIO_HELPER_IMAGE") != "" { - return os.Getenv("HUMIO_OPERATOR_DEFAULT_HUMIO_HELPER_IMAGE") + if defaultHelperImageFromEnvVar := helpers.GetDefaultHumioHelperImageFromEnvVar(); defaultHelperImageFromEnvVar != "" { + return defaultHelperImageFromEnvVar } return 
versions.DefaultHelperImageVersion() @@ -308,6 +322,14 @@ func (hnp *HumioNodePool) GetDesiredPodRevision() int { return hnp.desiredPodRevision } +func (hnp *HumioNodePool) GetZoneUnderMaintenance() string { + return hnp.zoneUnderMaintenance +} + +func (hnp *HumioNodePool) GetState() string { + return hnp.state +} + func (hnp *HumioNodePool) GetIngress() humiov1alpha1.HumioClusterIngressSpec { return hnp.ingress } @@ -575,7 +597,7 @@ func (hnp *HumioNodePool) GetContainerReadinessProbe() *corev1.Probe { SuccessThreshold: 1, FailureThreshold: 10, } - if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" { + if helpers.UseDummyImage() { probe.InitialDelaySeconds = 0 } return probe @@ -817,13 +839,20 @@ func (hnp *HumioNodePool) GetProbeScheme() corev1.URIScheme { } func (hnp *HumioNodePool) GetUpdateStrategy() *humiov1alpha1.HumioUpdateStrategy { + defaultZoneAwareness := true + if hnp.humioNodeSpec.UpdateStrategy != nil { + if hnp.humioNodeSpec.UpdateStrategy.EnableZoneAwareness == nil { + hnp.humioNodeSpec.UpdateStrategy.EnableZoneAwareness = &defaultZoneAwareness + } + return hnp.humioNodeSpec.UpdateStrategy } return &humiov1alpha1.HumioUpdateStrategy{ - Type: humiov1alpha1.HumioClusterUpdateStrategyReplaceAllOnUpdate, - MinReadySeconds: 0, + Type: humiov1alpha1.HumioClusterUpdateStrategyReplaceAllOnUpdate, + MinReadySeconds: 0, + EnableZoneAwareness: &defaultZoneAwareness, } } diff --git a/controllers/humiocluster_pods.go b/controllers/humiocluster_pods.go index 34e1d29c..db06068c 100644 --- a/controllers/humiocluster_pods.go +++ b/controllers/humiocluster_pods.go @@ -23,6 +23,7 @@ import ( "fmt" "reflect" "sort" + "strconv" "strings" "time" @@ -718,12 +719,12 @@ func (r *HumioClusterReconciler) waitForNewPods(ctx context.Context, hnp *HumioN return fmt.Errorf("timed out waiting to validate new pods was created") } -func (r *HumioClusterReconciler) podsMatch(hnp *HumioNodePool, pod corev1.Pod, desiredPod corev1.Pod) (bool, error) { +func (r *HumioClusterReconciler) podsMatch(hnp *HumioNodePool, pod corev1.Pod, desiredPod corev1.Pod) bool { if _, ok := pod.Annotations[PodHashAnnotation]; !ok { - return false, fmt.Errorf("did not find annotation with pod hash") + return false } if _, ok := pod.Annotations[PodRevisionAnnotation]; !ok { - return false, fmt.Errorf("did not find annotation with pod revision") + return false } var specMatches bool @@ -779,28 +780,35 @@ func (r *HumioClusterReconciler) podsMatch(hnp *HumioNodePool, pod corev1.Pod, d podSpecDiff := cmp.Diff(sanitizedCurrentPod.Spec, sanitizedDesiredPod.Spec) if !specMatches { r.Log.Info(fmt.Sprintf("pod annotation %s does not match desired pod: got %+v, expected %+v", PodHashAnnotation, pod.Annotations[PodHashAnnotation], desiredPodHash), "podSpecDiff", podSpecDiff) - return false, nil + return false } if !revisionMatches { r.Log.Info(fmt.Sprintf("pod annotation %s does not match desired pod: got %+v, expected %+v", PodRevisionAnnotation, pod.Annotations[PodRevisionAnnotation], desiredPod.Annotations[PodRevisionAnnotation]), "podSpecDiff", podSpecDiff) - return false, nil + return false } if !envVarSourceMatches { r.Log.Info(fmt.Sprintf("pod annotation %s does not match desired pod: got %+v, expected %+v", envVarSourceHashAnnotation, pod.Annotations[envVarSourceHashAnnotation], desiredPod.Annotations[envVarSourceHashAnnotation]), "podSpecDiff", podSpecDiff) - return false, nil + return false } if !certHasAnnotationMatches { r.Log.Info(fmt.Sprintf("pod annotation %s does not match desired pod: got %+v, expected %+v", certHashAnnotation, 
pod.Annotations[certHashAnnotation], desiredPod.Annotations[certHashAnnotation]), "podSpecDiff", podSpecDiff)
- return false, nil
+ return false
 }
 if !bootstrapTokenAnnotationMatches {
 r.Log.Info(fmt.Sprintf("pod annotation %s bootstrapTokenAnnotationMatches not match desired pod: got %+v, expected %+v", bootstrapTokenHashAnnotation, pod.Annotations[bootstrapTokenHashAnnotation], desiredPod.Annotations[bootstrapTokenHashAnnotation]), "podSpecDiff", podSpecDiff)
- return false, nil
+ return false
 }
- return true, nil
+ return true
 }

-func (r *HumioClusterReconciler) getPodDesiredLifecycleState(hnp *HumioNodePool, foundPodList []corev1.Pod, attachments *podAttachments) (podLifecycleState, error) {
+// getPodDesiredLifecycleState goes through the list of pods and decides what action to take for the pods.
+// It compares pods it is given with a newly-constructed pod. If they do not match, we know we have
+// "at least" a configuration difference and require a rolling replacement of the pods.
+// If the container image differs, it will indicate that a version difference is present.
+// For very specific configuration differences it may indicate that all pods in the node pool should be
+// replaced simultaneously.
+// The value of podLifecycleState.pod indicates what pod should be replaced next.
+func (r *HumioClusterReconciler) getPodDesiredLifecycleState(ctx context.Context, hnp *HumioNodePool, foundPodList []corev1.Pod, attachments *podAttachments, podsWithErrors bool) (podLifecycleState, error) {
 for _, pod := range foundPodList {
 podLifecycleStateValue := NewPodLifecycleState(*hnp, pod)
@@ -822,10 +830,7 @@ func (r *HumioClusterReconciler) getPodDesiredLifecycleState(hnp *HumioNodePool,
 desiredPod.Annotations[bootstrapTokenHashAnnotation] = attachments.bootstrapTokenSecretReference.hash
 }

- podsMatch, err := r.podsMatch(hnp, pod, *desiredPod)
- if err != nil {
- r.Log.Error(err, "failed to check if pods match")
- }
+ podsMatch := r.podsMatch(hnp, pod, *desiredPod)

 // ignore pod if it matches the desired pod
 if podsMatch {
@@ -855,6 +860,29 @@ func (r *HumioClusterReconciler) getPodDesiredLifecycleState(hnp *HumioNodePool,
 podLifecycleStateValue.configurationDifference.requiresSimultaneousRestart = true
 }

+ // if we run with envtest, we won't have zone information available
+ if !helpers.UseEnvtest() {
+ // if zone awareness is enabled, ignore pod if zone is incorrect
+ if !podsWithErrors && *hnp.GetUpdateStrategy().EnableZoneAwareness {
+ r.Log.Info(fmt.Sprintf("zone awareness enabled, looking up zone for pod=%s", pod.Name))
+ if pod.Spec.NodeName == "" {
+ // current pod does not have a nodeName set
+ r.Log.Info(fmt.Sprintf("pod=%s does not have a nodeName set, ignoring", pod.Name))
+ continue
+ }
+
+ // fetch zone for node name and ignore pod if zone is not the one that is marked as under maintenance
+ zoneForNodeName, err := kubernetes.GetZoneForNodeName(ctx, r, pod.Spec.NodeName)
+ if err != nil {
+ return podLifecycleState{}, r.logErrorAndReturn(err, "could not get zone name for node")
+ }
+ if hnp.GetZoneUnderMaintenance() != "" && zoneForNodeName != hnp.GetZoneUnderMaintenance() {
+ r.Log.Info(fmt.Sprintf("ignoring pod=%s as zoneUnderMaintenance=%s but pod has nodeName=%s where zone=%s", pod.Name, hnp.GetZoneUnderMaintenance(), pod.Spec.NodeName, zoneForNodeName))
+ continue
+ }
+ }
+ }
+
 return *podLifecycleStateValue, nil
 }
 return podLifecycleState{}, nil
@@ -1061,3 +1089,30 @@ func findPodForPvc(podList []corev1.Pod, pvc corev1.PersistentVolumeClaim) (core
 return corev1.Pod{},
fmt.Errorf("could not find a pod for pvc %s", pvc.Name) } +func FilterPodsByZoneName(ctx context.Context, c client.Client, podList []corev1.Pod, zoneName string) ([]corev1.Pod, error) { + filteredPodList := []corev1.Pod{} + for _, pod := range podList { + zoneForNodeName, err := kubernetes.GetZoneForNodeName(ctx, c, pod.Spec.NodeName) + if err != nil { + return nil, err + } + if zoneForNodeName == zoneName { + filteredPodList = append(filteredPodList, pod) + } + } + return filteredPodList, nil +} + +func FilterPodsExcludePodsWithPodRevision(podList []corev1.Pod, podRevisionToExclude int) []corev1.Pod { + filteredPodList := []corev1.Pod{} + for _, pod := range podList { + podRevision, found := pod.Annotations[PodRevisionAnnotation] + if found { + if strconv.Itoa(podRevisionToExclude) == podRevision { + continue + } + } + filteredPodList = append(filteredPodList, pod) + } + return filteredPodList +} diff --git a/controllers/humiocluster_status.go b/controllers/humiocluster_status.go index 8024552c..77e84619 100644 --- a/controllers/humiocluster_status.go +++ b/controllers/humiocluster_status.go @@ -48,9 +48,10 @@ type messageOption struct { } type stateOption struct { - state string - nodePoolName string - desiredPodRevision int + state string + nodePoolName string + desiredPodRevision int + zoneUnderMaintenance string } type stateOptionList struct { @@ -101,11 +102,12 @@ func (o *optionBuilder) withState(state string) *optionBuilder { return o } -func (o *optionBuilder) withNodePoolState(state string, nodePoolName string, podRevision int) *optionBuilder { +func (o *optionBuilder) withNodePoolState(state string, nodePoolName string, podRevision int, zoneName string) *optionBuilder { o.options = append(o.options, stateOption{ - state: state, - nodePoolName: nodePoolName, - desiredPodRevision: podRevision, + state: state, + nodePoolName: nodePoolName, + desiredPodRevision: podRevision, + zoneUnderMaintenance: zoneName, }) return o } @@ -113,7 +115,12 @@ func (o *optionBuilder) withNodePoolState(state string, nodePoolName string, pod func (o *optionBuilder) withNodePoolStatusList(humioNodePoolStatusList humiov1alpha1.HumioNodePoolStatusList) *optionBuilder { var statesList []stateOption for _, poolStatus := range humioNodePoolStatusList { - statesList = append(statesList, stateOption{nodePoolName: poolStatus.Name, state: poolStatus.State, desiredPodRevision: poolStatus.DesiredPodRevision}) + statesList = append(statesList, stateOption{ + nodePoolName: poolStatus.Name, + state: poolStatus.State, + desiredPodRevision: poolStatus.DesiredPodRevision, + zoneUnderMaintenance: poolStatus.ZoneUnderMaintenance, + }) } o.options = append(o.options, stateOptionList{ statesList: statesList, @@ -174,15 +181,17 @@ func (s stateOption) Apply(hc *humiov1alpha1.HumioCluster) { if nodePoolStatus.Name == s.nodePoolName { nodePoolStatus.State = s.state nodePoolStatus.DesiredPodRevision = s.desiredPodRevision + nodePoolStatus.ZoneUnderMaintenance = s.zoneUnderMaintenance hc.Status.NodePoolStatus[idx] = nodePoolStatus return } } hc.Status.NodePoolStatus = append(hc.Status.NodePoolStatus, humiov1alpha1.HumioNodePoolStatus{ - Name: s.nodePoolName, - State: s.state, - DesiredPodRevision: s.desiredPodRevision, + Name: s.nodePoolName, + State: s.state, + DesiredPodRevision: s.desiredPodRevision, + ZoneUnderMaintenance: s.zoneUnderMaintenance, }) } } @@ -202,9 +211,10 @@ func (s stateOptionList) Apply(hc *humiov1alpha1.HumioCluster) { hc.Status.NodePoolStatus = humiov1alpha1.HumioNodePoolStatusList{} for _, poolStatus := 
range s.statesList { hc.Status.NodePoolStatus = append(hc.Status.NodePoolStatus, humiov1alpha1.HumioNodePoolStatus{ - Name: poolStatus.nodePoolName, - State: poolStatus.state, - DesiredPodRevision: poolStatus.desiredPodRevision, + Name: poolStatus.nodePoolName, + State: poolStatus.state, + DesiredPodRevision: poolStatus.desiredPodRevision, + ZoneUnderMaintenance: poolStatus.zoneUnderMaintenance, }) } } diff --git a/controllers/suite/clusters/humiocluster_controller_test.go b/controllers/suite/clusters/humiocluster_controller_test.go index 8998e023..872e30c1 100644 --- a/controllers/suite/clusters/humiocluster_controller_test.go +++ b/controllers/suite/clusters/humiocluster_controller_test.go @@ -19,9 +19,9 @@ package clusters import ( "context" "fmt" - "os" "reflect" "strings" + "time" cmapi "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1" humiov1alpha1 "github.com/humio/humio-operator/api/v1alpha1" @@ -40,6 +40,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" + "sigs.k8s.io/controller-runtime/pkg/client" ) var _ = Describe("HumioCluster Controller", func() { @@ -533,7 +534,7 @@ var _ = Describe("HumioCluster Controller", func() { Eventually(func() []corev1.Pod { var clusterPods []corev1.Pod clusterPods, _ = kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster).GetPodLabels()) - _ = suite.MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = suite.MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) return clusterPods }, testTimeout, suite.TestInterval).Should(HaveLen(toCreate.Spec.NodeCount)) @@ -1129,7 +1130,7 @@ var _ = Describe("HumioCluster Controller", func() { suite.UsingClusterBy(key.Name, "Simulating mock pods to be scheduled") clusterPods, _ = kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(toCreate).GetPodLabels()) - _ = suite.MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = suite.MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) suite.UsingClusterBy(key.Name, "Waiting for humio cluster state to be Running") Eventually(func() string { @@ -1205,7 +1206,7 @@ var _ = Describe("HumioCluster Controller", func() { suite.UsingClusterBy(key.Name, "Validating pod uses default helper image as init container") Eventually(func() string { clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(toCreate).GetPodLabels()) - _ = suite.MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = suite.MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) for _, pod := range clusterPods { initIdx, _ := kubernetes.GetInitContainerIndexByName(pod, controllers.InitContainerName) @@ -1269,7 +1270,7 @@ var _ = Describe("HumioCluster Controller", func() { suite.UsingClusterBy(key.Name, "Validating pod bootstrap token annotation hash") Eventually(func() string { clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(toCreate).GetPodLabels()) - _ = suite.MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = suite.MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) if len(clusterPods) > 0 { return clusterPods[0].Annotations["humio.com/bootstrap-token-hash"] @@ -1304,7 +1305,7 @@ var _ = Describe("HumioCluster Controller", func() { suite.UsingClusterBy(key.Name, 
"Validating pod is recreated with the new bootstrap token hash annotation") Eventually(func() string { clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(toCreate).GetPodLabels()) - _ = suite.MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = suite.MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) if len(clusterPods) > 0 { return clusterPods[0].Annotations["humio.com/bootstrap-token-hash"] @@ -3336,15 +3337,15 @@ var _ = Describe("HumioCluster Controller", func() { suite.CreateAndBootstrapCluster(ctx, k8sClient, testHumioClient, toCreate, true, humiov1alpha1.HumioClusterStateRunning, testTimeout) defer suite.CleanupCluster(ctx, k8sClient, toCreate) - initialExpectedVolumesCount := 5 - initialExpectedHumioContainerVolumeMountsCount := 4 + initialExpectedVolumesCount := 5 // shared, tmp, humio-data, extra-kafka-configs, init-service-account-secret + initialExpectedHumioContainerVolumeMountsCount := 4 // shared, tmp, humio-data, extra-kafka-configs - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" { + if !helpers.UseEnvtest() { // k8s will automatically inject a service account token initialExpectedVolumesCount += 1 // kube-api-access- initialExpectedHumioContainerVolumeMountsCount += 1 // kube-api-access- - if helpers.UseCertManager() { + if helpers.TLSEnabled(toCreate) { initialExpectedVolumesCount += 1 // tls-cert initialExpectedHumioContainerVolumeMountsCount += 1 // tls-cert } @@ -4525,7 +4526,7 @@ var _ = Describe("HumioCluster Controller", func() { suite.UsingClusterBy(key.Name, "Validating pod is created with the default grace period") Eventually(func() int64 { clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(toCreate).GetPodLabels()) - _ = suite.MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = suite.MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) for _, pod := range clusterPods { if pod.Spec.TerminationGracePeriodSeconds != nil { @@ -4985,4 +4986,136 @@ var _ = Describe("HumioCluster Controller", func() { }, testTimeout, suite.TestInterval).Should(HaveKeyWithValue(kubernetes.NodePoolLabelName, key.Name)) }) }) + + Context("test rolling update with max unavailable absolute value", Label("envtest", "dummy"), func() { + It("Update should correctly replace pods to use new image in a rolling fashion", func() { + key := types.NamespacedName{ + Name: "hc-update-absolute-maxunavail", + Namespace: testProcessNamespace, + } + maxUnavailable := intstr.FromInt32(1) + toCreate := suite.ConstructBasicSingleNodeHumioCluster(key, true) + toCreate.Spec.Image = versions.OldSupportedHumioVersion() + toCreate.Spec.NodeCount = 9 + toCreate.Spec.UpdateStrategy = &humiov1alpha1.HumioUpdateStrategy{ + Type: humiov1alpha1.HumioClusterUpdateStrategyRollingUpdate, + } + + suite.UsingClusterBy(key.Name, "Creating the cluster successfully") + ctx := context.Background() + suite.CreateAndBootstrapCluster(ctx, k8sClient, testHumioClient, toCreate, true, humiov1alpha1.HumioClusterStateRunning, testTimeout) + defer suite.CleanupCluster(ctx, k8sClient, toCreate) + + var updatedHumioCluster humiov1alpha1.HumioCluster + clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(toCreate).GetPodLabels()) + for _, pod := range clusterPods { + humioIndex, _ := kubernetes.GetContainerIndexByName(pod, controllers.HumioContainerName) + 
Expect(pod.Spec.Containers[humioIndex].Image).To(BeIdenticalTo(toCreate.Spec.Image)) + Expect(pod.Annotations).To(HaveKeyWithValue(controllers.PodRevisionAnnotation, "1")) + } + updatedHumioCluster = humiov1alpha1.HumioCluster{} + Expect(k8sClient.Get(ctx, key, &updatedHumioCluster)).Should(Succeed()) + Expect(controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster).GetDesiredPodRevision()).To(BeEquivalentTo(1)) + + mostSeenUnavailable := 0 + forever := make(chan struct{}) + ctx2, cancel := context.WithCancel(context.Background()) + + // TODO: Consider refactoring goroutine to a "watcher". https://book-v1.book.kubebuilder.io/beyond_basics/controller_watches + // Using a for-loop executing ListPods will only see snapshots in time and we could easily miss + // a point in time where we have too many pods that are not ready. + go func(ctx2 context.Context, k8sClient client.Client, toCreate humiov1alpha1.HumioCluster) { + hnp := controllers.NewHumioNodeManagerFromHumioCluster(&toCreate) + for { + select { + case <-ctx2.Done(): // if cancel() execute + forever <- struct{}{} + return + default: + // Assume all is unavailable, and decrement number each time we see one that is working + unavailableThisRound := hnp.GetNodeCount() + + pods, _ := kubernetes.ListPods(ctx2, k8sClient, hnp.GetNamespace(), hnp.GetPodLabels()) + suite.UsingClusterBy(key.Name, fmt.Sprintf("goroutine looking for unavailable pods: len(pods)=%d", len(pods))) + for _, pod := range pods { + suite.UsingClusterBy(key.Name, fmt.Sprintf("goroutine looking for unavailable pods: pod.Status.Phase=%s", pod.Status.Phase)) + + if pod.Status.Phase == corev1.PodFailed { + suite.UsingClusterBy(key.Name, fmt.Sprintf("goroutine looking for unavailable pods, full pod dump of failing pod: %+v", pod)) + var eventList corev1.EventList + _ = k8sClient.List(ctx2, &eventList) + for _, event := range eventList.Items { + if event.InvolvedObject.UID == pod.UID { + suite.UsingClusterBy(key.Name, fmt.Sprintf("Found event for failing pod: involvedObject=%+v, reason=%s, message=%s, source=%+v", event.InvolvedObject, event.Reason, event.Message, event.Source)) + } + } + } + + if pod.Status.Phase == corev1.PodRunning { + for idx, containerStatus := range pod.Status.ContainerStatuses { + suite.UsingClusterBy(key.Name, fmt.Sprintf("goroutine looking for unavailable pods: pod.Status.ContainerStatuses[%d]=%+v", idx, containerStatus)) + if containerStatus.Ready { + unavailableThisRound-- + } + } + } + } + // Save the number of unavailable pods in this round + mostSeenUnavailable = max(mostSeenUnavailable, unavailableThisRound) + } + time.Sleep(1 * time.Second) + } + }(ctx2, k8sClient, *toCreate) + + suite.UsingClusterBy(key.Name, "Updating the cluster image successfully") + updatedImage := versions.DefaultHumioImageVersion() + Eventually(func() error { + updatedHumioCluster = humiov1alpha1.HumioCluster{} + err := k8sClient.Get(ctx, key, &updatedHumioCluster) + if err != nil { + return err + } + updatedHumioCluster.Spec.Image = updatedImage + return k8sClient.Update(ctx, &updatedHumioCluster) + }, testTimeout, suite.TestInterval).Should(Succeed()) + + Eventually(func() string { + updatedHumioCluster = humiov1alpha1.HumioCluster{} + Expect(k8sClient.Get(ctx, key, &updatedHumioCluster)).Should(Succeed()) + return updatedHumioCluster.Status.State + }, testTimeout, suite.TestInterval).Should(BeIdenticalTo(humiov1alpha1.HumioClusterStateUpgrading)) + + ensurePodsRollingRestart(ctx, controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster), 2) + 
+ Eventually(func() string { + updatedHumioCluster = humiov1alpha1.HumioCluster{} + Expect(k8sClient.Get(ctx, key, &updatedHumioCluster)).Should(Succeed()) + return updatedHumioCluster.Status.State + }, testTimeout, suite.TestInterval).Should(BeIdenticalTo(humiov1alpha1.HumioClusterStateRunning)) + + suite.UsingClusterBy(key.Name, "Confirming pod revision is the same for all pods and the cluster itself") + updatedHumioCluster = humiov1alpha1.HumioCluster{} + Expect(k8sClient.Get(ctx, key, &updatedHumioCluster)).Should(Succeed()) + Expect(controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster).GetDesiredPodRevision()).To(BeEquivalentTo(2)) + + updatedClusterPods, _ := kubernetes.ListPods(ctx, k8sClient, updatedHumioCluster.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster).GetPodLabels()) + Expect(updatedClusterPods).To(HaveLen(toCreate.Spec.NodeCount)) + for _, pod := range updatedClusterPods { + humioIndex, _ := kubernetes.GetContainerIndexByName(pod, controllers.HumioContainerName) + Expect(pod.Spec.Containers[humioIndex].Image).To(BeIdenticalTo(updatedImage)) + Expect(pod.Annotations).To(HaveKeyWithValue(controllers.PodRevisionAnnotation, "2")) + } + + cancel() + <-forever + + if helpers.TLSEnabled(&updatedHumioCluster) { + suite.UsingClusterBy(key.Name, "Ensuring pod names are not changed") + Expect(podNames(clusterPods)).To(Equal(podNames(updatedClusterPods))) + } + + suite.UsingClusterBy(key.Name, fmt.Sprintf("Verifying we do not have too many unavailable pods during pod replacements, mostSeenUnavailable(%d) <= maxUnavailable(%d)", mostSeenUnavailable, maxUnavailable.IntValue())) + Expect(mostSeenUnavailable).To(BeNumerically("<=", maxUnavailable.IntValue())) + }) + }) }) diff --git a/controllers/suite/clusters/suite_test.go b/controllers/suite/clusters/suite_test.go index e75f3f7a..3f192126 100644 --- a/controllers/suite/clusters/suite_test.go +++ b/controllers/suite/clusters/suite_test.go @@ -20,7 +20,6 @@ import ( "context" "encoding/json" "fmt" - "os" "path/filepath" "sort" "strconv" @@ -87,17 +86,17 @@ var _ = BeforeSuite(func() { By("bootstrapping test environment") useExistingCluster := true testProcessNamespace = fmt.Sprintf("e2e-clusters-%d", GinkgoParallelProcess()) - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" { + if !helpers.UseEnvtest() { testTimeout = time.Second * 900 testEnv = &envtest.Environment{ UseExistingCluster: &useExistingCluster, } - if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" { + if helpers.UseDummyImage() { testHumioClient = humio.NewMockClient() } else { testHumioClient = humio.NewClient(log, "") By("Verifying we have a valid license, as tests will require starting up real LogScale containers") - Expect(os.Getenv("HUMIO_E2E_LICENSE")).NotTo(BeEmpty()) + Expect(helpers.GetE2ELicenseFromEnvVar()).NotTo(BeEmpty()) } } else { testTimeout = time.Second * 30 @@ -337,8 +336,8 @@ func constructBasicMultiNodePoolHumioCluster(key types.NamespacedName, useAutoCr return toCreate } -func markPodAsPending(ctx context.Context, client client.Client, nodeID int, pod corev1.Pod, clusterName string) error { - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" { +func markPodAsPendingIfUsingEnvtest(ctx context.Context, client client.Client, nodeID int, pod corev1.Pod, clusterName string) error { + if !helpers.UseEnvtest() { return nil } @@ -356,8 +355,8 @@ func markPodAsPending(ctx context.Context, client client.Client, nodeID int, pod return client.Status().Update(ctx, &pod) } -func markPodsWithRevisionAsReady(ctx 
context.Context, hnp *controllers.HumioNodePool, podRevision int, desiredReadyPodCount int) { - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" { +func markPodsWithRevisionAsReadyIfUsingEnvTest(ctx context.Context, hnp *controllers.HumioNodePool, podRevision int, desiredReadyPodCount int) { + if !helpers.UseEnvtest() { return } foundPodList, _ := kubernetes.ListPods(ctx, k8sClient, hnp.GetNamespace(), hnp.GetNodePoolLabels()) @@ -390,7 +389,7 @@ func markPodsWithRevisionAsReady(ctx context.Context, hnp *controllers.HumioNode for i := range podListWithRevision { suite.UsingClusterBy(hnp.GetClusterName(), fmt.Sprintf("Considering pod %s with podIP %s", podListWithRevision[i].Name, podListWithRevision[i].Status.PodIP)) if podListWithRevision[i].Status.PodIP == "" { - err := suite.MarkPodAsRunning(ctx, k8sClient, podListWithRevision[i], hnp.GetClusterName()) + err := suite.MarkPodAsRunningIfUsingEnvtest(ctx, k8sClient, podListWithRevision[i], hnp.GetClusterName()) if err != nil { suite.UsingClusterBy(hnp.GetClusterName(), fmt.Sprintf("Got error while marking pod %s as running: %v", podListWithRevision[i].Name, err)) } @@ -444,7 +443,7 @@ func podPendingCountByRevision(ctx context.Context, hnp *controllers.HumioNodePo clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, hnp.GetNamespace(), hnp.GetNodePoolLabels()) for nodeID, pod := range clusterPods { revision, _ := strconv.Atoi(pod.Annotations[controllers.PodRevisionAnnotation]) - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" { + if !helpers.UseEnvtest() { if pod.DeletionTimestamp == nil { for _, condition := range pod.Status.Conditions { if condition.Type == corev1.PodScheduled { @@ -456,7 +455,7 @@ func podPendingCountByRevision(ctx context.Context, hnp *controllers.HumioNodePo } } else { if nodeID+1 <= expectedPendingCount { - _ = markPodAsPending(ctx, k8sClient, nodeID, pod, hnp.GetClusterName()) + _ = markPodAsPendingIfUsingEnvtest(ctx, k8sClient, nodeID, pod, hnp.GetClusterName()) revisionToPendingCount[revision]++ } } @@ -484,7 +483,7 @@ func ensurePodsRollingRestart(ctx context.Context, hnp *controllers.HumioNodePoo for expectedReadyCount := 1; expectedReadyCount < hnp.GetNodeCount()+1; expectedReadyCount++ { Eventually(func() map[int]int { suite.UsingClusterBy(hnp.GetClusterName(), fmt.Sprintf("ensurePodsRollingRestart Ensuring replacement pods are ready one at a time expectedReadyCount=%d", expectedReadyCount)) - markPodsWithRevisionAsReady(ctx, hnp, expectedPodRevision, expectedReadyCount) + markPodsWithRevisionAsReadyIfUsingEnvTest(ctx, hnp, expectedPodRevision, expectedReadyCount) return podReadyCountByRevision(ctx, hnp, expectedPodRevision) }, testTimeout, suite.TestInterval).Should(HaveKeyWithValue(expectedPodRevision, expectedReadyCount)) } @@ -502,7 +501,7 @@ func ensurePodsGoPending(ctx context.Context, hnp *controllers.HumioNodePool, ex func ensurePodsTerminate(ctx context.Context, hnp *controllers.HumioNodePool, expectedPodRevision int) { suite.UsingClusterBy(hnp.GetClusterName(), "ensurePodsTerminate Ensuring all existing pods are terminated at the same time") Eventually(func() map[int]int { - markPodsWithRevisionAsReady(ctx, hnp, expectedPodRevision, 0) + markPodsWithRevisionAsReadyIfUsingEnvTest(ctx, hnp, expectedPodRevision, 0) numPodsReadyByRevision := podReadyCountByRevision(ctx, hnp, expectedPodRevision) suite.UsingClusterBy(hnp.GetClusterName(), fmt.Sprintf("podsReadyCountByRevision() = %#+v", numPodsReadyByRevision)) return numPodsReadyByRevision @@ -510,7 +509,7 @@ func ensurePodsTerminate(ctx 
context.Context, hnp *controllers.HumioNodePool, ex suite.UsingClusterBy(hnp.GetClusterName(), "ensurePodsTerminate Ensuring replacement pods are not ready at the same time") Eventually(func() map[int]int { - markPodsWithRevisionAsReady(ctx, hnp, expectedPodRevision, 0) + markPodsWithRevisionAsReadyIfUsingEnvTest(ctx, hnp, expectedPodRevision, 0) numPodsReadyByRevision := podReadyCountByRevision(ctx, hnp, expectedPodRevision) suite.UsingClusterBy(hnp.GetClusterName(), fmt.Sprintf("ensurePodsTerminate podsReadyCountByRevision() = %#+v", numPodsReadyByRevision)) return numPodsReadyByRevision @@ -523,7 +522,7 @@ func ensurePodsSimultaneousRestart(ctx context.Context, hnp *controllers.HumioNo suite.UsingClusterBy(hnp.GetClusterName(), "ensurePodsSimultaneousRestart Ensuring all pods come back up after terminating") Eventually(func() map[int]int { - markPodsWithRevisionAsReady(ctx, hnp, expectedPodRevision, hnp.GetNodeCount()) + markPodsWithRevisionAsReadyIfUsingEnvTest(ctx, hnp, expectedPodRevision, hnp.GetNodeCount()) numPodsReadyByRevision := podReadyCountByRevision(ctx, hnp, expectedPodRevision) suite.UsingClusterBy(hnp.GetClusterName(), fmt.Sprintf("ensurePodsSimultaneousRestart podsReadyCountByRevision() = %#+v", numPodsReadyByRevision)) return numPodsReadyByRevision diff --git a/controllers/suite/common.go b/controllers/suite/common.go index 991f0f1b..743eb3a6 100644 --- a/controllers/suite/common.go +++ b/controllers/suite/common.go @@ -53,14 +53,14 @@ func UsingClusterBy(cluster, text string, callbacks ...func()) { } } -func MarkPodsAsRunning(ctx context.Context, client client.Client, pods []corev1.Pod, clusterName string) error { - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" { +func MarkPodsAsRunningIfUsingEnvtest(ctx context.Context, client client.Client, pods []corev1.Pod, clusterName string) error { + if !helpers.UseEnvtest() { return nil } UsingClusterBy(clusterName, "Simulating Humio container starts up and is marked Ready") for _, pod := range pods { - err := MarkPodAsRunning(ctx, client, pod, clusterName) + err := MarkPodAsRunningIfUsingEnvtest(ctx, client, pod, clusterName) if err != nil { return err } @@ -68,8 +68,8 @@ func MarkPodsAsRunning(ctx context.Context, client client.Client, pods []corev1. 
return nil } -func MarkPodAsRunning(ctx context.Context, k8sClient client.Client, pod corev1.Pod, clusterName string) error { - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" { +func MarkPodAsRunningIfUsingEnvtest(ctx context.Context, k8sClient client.Client, pod corev1.Pod, clusterName string) error { + if !helpers.UseEnvtest() { return nil } @@ -81,6 +81,18 @@ func MarkPodAsRunning(ctx context.Context, k8sClient client.Client, pod corev1.P Status: corev1.ConditionTrue, }, } + pod.Status.InitContainerStatuses = []corev1.ContainerStatus{ + { + Name: controllers.InitContainerName, + Ready: true, + }, + } + pod.Status.ContainerStatuses = []corev1.ContainerStatus{ + { + Name: controllers.HumioContainerName, + Ready: true, + }, + } pod.Status.Phase = corev1.PodRunning return k8sClient.Status().Update(ctx, &pod) } @@ -226,7 +238,7 @@ func ConstructBasicNodeSpecForHumioCluster(key types.NamespacedName) humiov1alph }, } - if os.Getenv("DUMMY_LOGSCALE_IMAGE") != "true" { + if !helpers.UseDummyImage() { nodeSpec.SidecarContainers = []corev1.Container{ { Name: "wait-for-global-snapshot-on-disk", @@ -314,8 +326,8 @@ func CreateLicenseSecret(ctx context.Context, clusterKey types.NamespacedName, k licenseString := "eyJ0eXAiOiJKV1QiLCJhbGciOiJFUzUxMiJ9.eyJpc09lbSI6ZmFsc2UsImF1ZCI6Ikh1bWlvLWxpY2Vuc2UtY2hlY2siLCJzdWIiOiJIdW1pbyBFMkUgdGVzdHMiLCJ1aWQiOiJGUXNvWlM3Yk1PUldrbEtGIiwibWF4VXNlcnMiOjEwLCJhbGxvd1NBQVMiOnRydWUsIm1heENvcmVzIjoxLCJ2YWxpZFVudGlsIjoxNzQzMTY2ODAwLCJleHAiOjE3NzQ1OTMyOTcsImlzVHJpYWwiOmZhbHNlLCJpYXQiOjE2Nzk5ODUyOTcsIm1heEluZ2VzdEdiUGVyRGF5IjoxfQ.someinvalidsignature" // If we use a k8s that is not envtest, and we didn't specify we are using a dummy image, we require a valid license - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" && os.Getenv("DUMMY_LOGSCALE_IMAGE") != "true" { - licenseString = os.Getenv("HUMIO_E2E_LICENSE") + if !helpers.UseEnvtest() && !helpers.UseDummyImage() { + licenseString = helpers.GetE2ELicenseFromEnvVar() } licenseSecret := corev1.Secret{ @@ -363,7 +375,7 @@ func CreateAndBootstrapCluster(ctx context.Context, k8sClient client.Client, hum } } - if os.Getenv("TEST_USE_EXISTING_CLUSTER") != "true" { + if helpers.UseEnvtest() { // Simulate sidecar creating the secret which contains the admin token used to authenticate with humio secretData := map[string][]byte{"token": []byte("")} adminTokenSecretName := fmt.Sprintf("%s-%s", key.Name, kubernetes.ServiceTokenSecretNameSuffix) @@ -458,7 +470,7 @@ func CreateAndBootstrapCluster(ctx context.Context, k8sClient client.Client, hum Eventually(func() []corev1.Pod { var clusterPods []corev1.Pod clusterPods, _ = kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster).GetPodLabels()) - _ = MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) return clusterPods }, testTimeout, TestInterval).Should(HaveLen(cluster.Spec.NodeCount)) @@ -466,7 +478,7 @@ func CreateAndBootstrapCluster(ctx context.Context, k8sClient client.Client, hum Eventually(func() []corev1.Pod { var clusterPods []corev1.Pod clusterPods, _ = kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioNodePool(&updatedHumioCluster, &cluster.Spec.NodePools[idx]).GetPodLabels()) - _ = MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) return clusterPods }, testTimeout, 
TestInterval).Should(HaveLen(pool.NodeCount)) } @@ -504,11 +516,11 @@ func CreateAndBootstrapCluster(ctx context.Context, k8sClient client.Client, hum UsingClusterBy(key.Name, "Confirming cluster enters running state") Eventually(func() string { clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster).GetPodLabels()) - _ = MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) for idx := range cluster.Spec.NodePools { clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioNodePool(&updatedHumioCluster, &cluster.Spec.NodePools[idx]).GetPodLabels()) - _ = MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) } updatedHumioCluster = humiov1alpha1.HumioCluster{} @@ -539,7 +551,7 @@ func CreateAndBootstrapCluster(ctx context.Context, k8sClient client.Client, hum }, &corev1.Secret{}) }, testTimeout, TestInterval).Should(Succeed()) - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" && os.Getenv("DUMMY_LOGSCALE_IMAGE") != "true" { + if !helpers.UseEnvtest() && !helpers.UseDummyImage() { UsingClusterBy(key.Name, "Validating cluster nodes have ZONE configured correctly") if updatedHumioCluster.Spec.DisableInitContainer { Eventually(func() []string { @@ -649,6 +661,34 @@ func CreateAndBootstrapCluster(ctx context.Context, k8sClient client.Client, hum }, testTimeout, TestInterval).Should(HaveKeyWithValue(corev1.PodRunning, updatedHumioCluster.Spec.NodePools[idx].NodeCount)) } + + Eventually(func() int { + numPodsReady := 0 + clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster).GetPodLabels()) + for _, pod := range clusterPods { + for _, containerStatus := range pod.Status.ContainerStatuses { + if containerStatus.Name == controllers.HumioContainerName && containerStatus.Ready { + numPodsReady++ + } + } + } + return numPodsReady + }, testTimeout, TestInterval).Should(BeIdenticalTo(updatedHumioCluster.Spec.NodeCount)) + + for idx := range updatedHumioCluster.Spec.NodePools { + Eventually(func() int { + numPodsReady := 0 + clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioNodePool(&updatedHumioCluster, &updatedHumioCluster.Spec.NodePools[idx]).GetPodLabels()) + for _, pod := range clusterPods { + for _, containerStatus := range pod.Status.ContainerStatuses { + if containerStatus.Name == controllers.HumioContainerName && containerStatus.Ready { + numPodsReady++ + } + } + } + return numPodsReady + }, testTimeout, TestInterval).Should(BeIdenticalTo(updatedHumioCluster.Spec.NodePools[idx].NodeCount)) + } } func WaitForReconcileToSync(ctx context.Context, key types.NamespacedName, k8sClient client.Client, currentHumioCluster *humiov1alpha1.HumioCluster, testTimeout time.Duration) { diff --git a/controllers/suite/resources/humioresources_controller_test.go b/controllers/suite/resources/humioresources_controller_test.go index 9b9f2fe8..2b199a92 100644 --- a/controllers/suite/resources/humioresources_controller_test.go +++ b/controllers/suite/resources/humioresources_controller_test.go @@ -20,7 +20,6 @@ import ( "context" "fmt" "net/http" - "os" "github.com/humio/humio-operator/pkg/kubernetes" @@ -98,9 +97,7 @@ var _ = Describe("Humio Resources Controllers", func() { 
ingestTokenSecret) }, testTimeout, suite.TestInterval).Should(Succeed()) - if os.Getenv("TEST_USE_EXISTING_CLUSTER") != "true" { - Expect(string(ingestTokenSecret.Data["token"])).ToNot(BeEmpty()) - } + Expect(string(ingestTokenSecret.Data["token"])).ToNot(BeEmpty()) Expect(ingestTokenSecret.OwnerReferences).Should(HaveLen(1)) suite.UsingClusterBy(clusterKey.Name, "HumioIngestToken: Checking correct parser assigned to ingest token") @@ -152,9 +149,7 @@ var _ = Describe("Humio Resources Controllers", func() { ingestTokenSecret) }, testTimeout, suite.TestInterval).Should(Succeed()) - if os.Getenv("TEST_USE_EXISTING_CLUSTER") != "true" { - Expect(string(ingestTokenSecret.Data["token"])).ToNot(BeEmpty()) - } + Expect(string(ingestTokenSecret.Data["token"])).ToNot(BeEmpty()) suite.UsingClusterBy(clusterKey.Name, "HumioIngestToken: Successfully deleting it") Expect(k8sClient.Delete(ctx, fetchedIngestToken)).To(Succeed()) @@ -223,9 +218,7 @@ var _ = Describe("Humio Resources Controllers", func() { }, testTimeout, suite.TestInterval).Should(Succeed()) Expect(ingestTokenSecret.Labels).Should(HaveKeyWithValue("custom-label", "custom-value")) - if os.Getenv("TEST_USE_EXISTING_CLUSTER") != "true" { - Expect(string(ingestTokenSecret.Data["token"])).ToNot(BeEmpty()) - } + Expect(string(ingestTokenSecret.Data["token"])).ToNot(BeEmpty()) suite.UsingClusterBy(clusterKey.Name, "HumioIngestToken: Successfully deleting it") Expect(k8sClient.Delete(ctx, fetchedIngestToken)).To(Succeed()) @@ -662,7 +655,7 @@ var _ = Describe("Humio Resources Controllers", func() { Namespace: clusterKey.Namespace, } protocol := "http" - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" && helpers.UseCertManager() { + if !helpers.UseEnvtest() && helpers.UseCertManager() { protocol = "https" } diff --git a/controllers/suite/resources/suite_test.go b/controllers/suite/resources/suite_test.go index e7db65c1..a8e200b0 100644 --- a/controllers/suite/resources/suite_test.go +++ b/controllers/suite/resources/suite_test.go @@ -20,7 +20,6 @@ import ( "context" "encoding/json" "fmt" - "os" "path/filepath" "strings" "testing" @@ -97,17 +96,17 @@ var _ = BeforeSuite(func() { Namespace: fmt.Sprintf("e2e-resources-%d", GinkgoParallelProcess()), } - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" { + if !helpers.UseEnvtest() { testTimeout = time.Second * 300 testEnv = &envtest.Environment{ UseExistingCluster: &useExistingCluster, } - if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" { + if helpers.UseDummyImage() { humioClient = humio.NewMockClient() } else { humioClient = humio.NewClient(log, "") By("Verifying we have a valid license, as tests will require starting up real LogScale containers") - Expect(os.Getenv("HUMIO_E2E_LICENSE")).NotTo(BeEmpty()) + Expect(helpers.GetE2ELicenseFromEnvVar()).NotTo(BeEmpty()) } } else { @@ -262,11 +261,6 @@ var _ = BeforeSuite(func() { suite.UsingClusterBy(clusterKey.Name, fmt.Sprintf("HumioCluster: Creating shared test cluster in namespace %s", clusterKey.Namespace)) cluster = suite.ConstructBasicSingleNodeHumioCluster(clusterKey, true) - if os.Getenv("DUMMY_LOGSCALE_IMAGE") != "true" { - cluster.Spec.HumioNodeSpec.Image = "humio/humio-core:1.150.0" - } else { - cluster.Spec.HumioNodeSpec.Image = "humio/humio-core:dummy" - } suite.CreateAndBootstrapCluster(context.TODO(), k8sClient, humioClient, cluster, true, corev1alpha1.HumioClusterStateRunning, testTimeout) sharedCluster, err = helpers.NewCluster(context.TODO(), k8sClient, clusterKey.Name, "", clusterKey.Namespace, helpers.UseCertManager(), true, 
false) @@ -381,7 +375,7 @@ var _ = AfterSuite(func() { })).To(Succeed()) } - if testNamespace.ObjectMeta.Name != "" && os.Getenv("PRESERVE_KIND_CLUSTER") == "true" { + if testNamespace.ObjectMeta.Name != "" && !helpers.UseEnvtest() && helpers.PreserveKindCluster() { By(fmt.Sprintf("Removing test namespace: %s", clusterKey.Namespace)) err := k8sClient.Delete(context.TODO(), &testNamespace) Expect(err).ToNot(HaveOccurred()) diff --git a/controllers/versions/versions.go b/controllers/versions/versions.go index e576ac5e..5c8e328e 100644 --- a/controllers/versions/versions.go +++ b/controllers/versions/versions.go @@ -1,13 +1,14 @@ package versions import ( - "os" "strings" + + "github.com/humio/humio-operator/pkg/helpers" ) const ( defaultHelperImageVersion = "humio/humio-operator-helper:8f5ef6c7e470226e77d985f36cf39be9a100afea" - defaultHumioImageVersion = "humio/humio-core:1.142.3" + defaultHumioImageVersion = "humio/humio-core:1.153.1" oldSupportedHumioVersion = "humio/humio-core:1.118.0" upgradeJumpHumioVersion = "humio/humio-core:1.128.0" @@ -28,70 +29,70 @@ const ( func DefaultHelperImageVersion() string { version := []string{defaultHelperImageVersion} - if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" { + if helpers.UseDummyImage() { version = append(version, dummyImageSuffix) } return strings.Join(version, "") } func DefaultHumioImageVersion() string { version := []string{defaultHumioImageVersion} - if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" { + if helpers.UseDummyImage() { version = append(version, dummyImageSuffix) } return strings.Join(version, "") } func OldSupportedHumioVersion() string { version := []string{oldSupportedHumioVersion} - if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" { + if helpers.UseDummyImage() { version = append(version, dummyImageSuffix) } return strings.Join(version, "") } func UpgradeJumpHumioVersion() string { version := []string{upgradeJumpHumioVersion} - if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" { + if helpers.UseDummyImage() { version = append(version, dummyImageSuffix) } return strings.Join(version, "") } func OldUnsupportedHumioVersion() string { version := []string{oldUnsupportedHumioVersion} - if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" { + if helpers.UseDummyImage() { version = append(version, dummyImageSuffix) } return strings.Join(version, "") } func UpgradeHelperImageVersion() string { version := []string{upgradeHelperImageVersion} - if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" { + if helpers.UseDummyImage() { version = append(version, dummyImageSuffix) } return strings.Join(version, "") } func UpgradePatchBestEffortOldVersion() string { version := []string{upgradePatchBestEffortOldVersion} - if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" { + if helpers.UseDummyImage() { version = append(version, dummyImageSuffix) } return strings.Join(version, "") } func UpgradePatchBestEffortNewVersion() string { version := []string{upgradePatchBestEffortNewVersion} - if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" { + if helpers.UseDummyImage() { version = append(version, dummyImageSuffix) } return strings.Join(version, "") } func UpgradeRollingBestEffortVersionJumpOldVersion() string { version := []string{upgradeRollingBestEffortVersionJumpOldVersion} - if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" { + if helpers.UseDummyImage() { version = append(version, dummyImageSuffix) } return strings.Join(version, "") } func UpgradeRollingBestEffortVersionJumpNewVersion() string { version := []string{upgradeRollingBestEffortVersionJumpNewVersion} - if 
os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" { + if helpers.UseDummyImage() { version = append(version, dummyImageSuffix) } return strings.Join(version, "") diff --git a/images/helper/go.mod b/images/helper/go.mod index 3e3cb278..2cd62a1f 100644 --- a/images/helper/go.mod +++ b/images/helper/go.mod @@ -3,8 +3,6 @@ module github.com/humio/humio-operator/images/helper go 1.22 require ( - github.com/cli/shurcooL-graphql v0.0.4 - github.com/humio/cli v0.33.1-0.20240425153346-f278dc8465f3 k8s.io/api v0.29.5 k8s.io/apimachinery v0.29.5 k8s.io/client-go v0.29.5 @@ -30,7 +28,6 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect golang.org/x/net v0.24.0 // indirect golang.org/x/oauth2 v0.10.0 // indirect - golang.org/x/sync v0.5.0 // indirect golang.org/x/sys v0.19.0 // indirect golang.org/x/term v0.19.0 // indirect golang.org/x/text v0.14.0 // indirect diff --git a/images/helper/go.sum b/images/helper/go.sum index 36f7067a..17901a8b 100644 --- a/images/helper/go.sum +++ b/images/helper/go.sum @@ -1,5 +1,3 @@ -github.com/cli/shurcooL-graphql v0.0.4 h1:6MogPnQJLjKkaXPyGqPRXOI2qCsQdqNfUY1QSJu2GuY= -github.com/cli/shurcooL-graphql v0.0.4/go.mod h1:3waN4u02FiZivIV+p1y4d0Jo1jc6BViMA73C+sZo2fk= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= @@ -33,8 +31,6 @@ github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJY github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/humio/cli v0.33.1-0.20240425153346-f278dc8465f3 h1:9UVZdMFGt7FktPvRjJ58RQFHFSYIEfkcbCg4Xq8z9HM= -github.com/humio/cli v0.33.1-0.20240425153346-f278dc8465f3/go.mod h1:GGgOajbd4z5osw50k5+dXYrcSkj9nZssAWS4Lv77yc4= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= @@ -95,8 +91,6 @@ golang.org/x/oauth2 v0.10.0/go.mod h1:kTpgurOux7LqtuxjuyZa4Gj2gdezIt/jQtGnNFfypQ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE= -golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/images/helper/main.go b/images/helper/main.go index 791188e4..02aaf3a7 100644 --- a/images/helper/main.go +++ b/images/helper/main.go @@ -73,7 +73,7 @@ func initMode() { } else { zone, found := node.Labels[corev1.LabelZoneFailureDomainStable] if !found { - zone, _ = 
node.Labels[corev1.LabelZoneFailureDomain]
+			zone = node.Labels[corev1.LabelZoneFailureDomain]
 		}
 		err := os.WriteFile(targetFile, []byte(zone), 0644) // #nosec G306
 		if err != nil {
diff --git a/pkg/helpers/helpers.go b/pkg/helpers/helpers.go
index 79dcbcac..e9f37624 100644
--- a/pkg/helpers/helpers.go
+++ b/pkg/helpers/helpers.go
@@ -59,12 +59,6 @@ func RemoveElement(list []string, s string) []string {
 	return list
 }
 
-// UseCertManager returns whether the operator will use cert-manager
-func UseCertManager() bool {
-	certmanagerEnabled, found := os.LookupEnv("USE_CERTMANAGER")
-	return found && certmanagerEnabled == "true"
-}
-
 // TLSEnabled returns whether we a cluster should configure TLS or not
 func TLSEnabled(hc *humiov1alpha1.HumioCluster) bool {
 	if hc.Spec.TLS == nil {
@@ -141,3 +135,39 @@ func GetWatchNamespace() (string, error) {
 	}
 	return ns, nil
 }
+
+// UseCertManager returns whether the operator will use cert-manager
+func UseCertManager() bool {
+	return !UseEnvtest() && os.Getenv("USE_CERTMANAGER") == "true"
+}
+
+// GetDefaultHumioCoreImageFromEnvVar returns the user-defined default image for humio-core containers
+func GetDefaultHumioCoreImageFromEnvVar() string {
+	return os.Getenv("HUMIO_OPERATOR_DEFAULT_HUMIO_CORE_IMAGE")
+}
+
+// GetDefaultHumioHelperImageFromEnvVar returns the user-defined default image for helper containers
+func GetDefaultHumioHelperImageFromEnvVar() string {
+	return os.Getenv("HUMIO_OPERATOR_DEFAULT_HUMIO_HELPER_IMAGE")
+}
+
+// UseEnvtest returns whether the Kubernetes API is provided by envtest
+func UseEnvtest() bool {
+	return os.Getenv("TEST_USING_ENVTEST") == "true"
+}
+
+// UseDummyImage returns whether we are using a dummy image replacement instead of real container images
+func UseDummyImage() bool {
+	return os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true"
+}
+
+// GetE2ELicenseFromEnvVar returns the E2E license set as an environment variable
+func GetE2ELicenseFromEnvVar() string {
+	return os.Getenv("HUMIO_E2E_LICENSE")
+}
+
+// PreserveKindCluster returns true if the intention is not to delete the kind cluster after test execution.
+// This is to allow reruns of tests to be performed where resources can be reused.
+func PreserveKindCluster() bool {
+	return os.Getenv("PRESERVE_KIND_CLUSTER") == "true"
+}
diff --git a/pkg/kubernetes/nodes.go b/pkg/kubernetes/nodes.go
index 0decc988..57041c9c 100644
--- a/pkg/kubernetes/nodes.go
+++ b/pkg/kubernetes/nodes.go
@@ -16,3 +16,24 @@ func GetNode(ctx context.Context, c client.Client, nodeName string) (*corev1.Nod
 	}, &node)
 	return &node, err
 }
+
+var nodeNameToZoneName = map[string]string{}
+
+func GetZoneForNodeName(ctx context.Context, c client.Client, nodeName string) (string, error) {
+	zone, inCache := nodeNameToZoneName[nodeName]
+	if inCache {
+		return zone, nil
+	}
+
+	node, err := GetNode(ctx, c, nodeName)
+	if err != nil {
+		return "", err
+	}
+	zone, found := node.Labels[corev1.LabelZoneFailureDomainStable]
+	if !found {
+		zone = node.Labels[corev1.LabelZoneFailureDomain]
+	}
+
+	nodeNameToZoneName[nodeName] = zone
+	return zone, nil
+}

From b7922dadcf932d1739e9df3df0e4fb577336bbf7 Mon Sep 17 00:00:00 2001
From: Mike Rostermund
Date: Mon, 30 Sep 2024 12:00:37 +0200
Subject: [PATCH 2/2] Filter out pods without nodeName before fetching zone for
 the nodeName.
---
 controllers/humiocluster_controller.go |  8 ++++++--
 controllers/humiocluster_pods.go       | 11 +++++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/controllers/humiocluster_controller.go b/controllers/humiocluster_controller.go
index 7641fb5e..b203a851 100644
--- a/controllers/humiocluster_controller.go
+++ b/controllers/humiocluster_controller.go
@@ -1915,8 +1915,12 @@ func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Cont
 
 		podListForCurrentZoneWithWrongPodRevision := FilterPodsExcludePodsWithPodRevision(foundPodListForNodePool, hnp.GetDesiredPodRevision())
 		r.Log.Info(fmt.Sprintf("zone awareness enabled, len(podListForCurrentZoneWithWrongPodRevision)=%d", len(podListForCurrentZoneWithWrongPodRevision)))
-		if len(podListForCurrentZoneWithWrongPodRevision) > 0 {
-			newZoneUnderMaintenance, err := kubernetes.GetZoneForNodeName(ctx, r, podListForCurrentZoneWithWrongPodRevision[0].Spec.NodeName)
+		// Filter out any pods with empty nodeName fields
+		podListForCurrentZoneWithWrongPodRevisionAndNonEmptyNodeName := FilterPodsExcludePodsWithEmptyNodeName(podListForCurrentZoneWithWrongPodRevision)
+		r.Log.Info(fmt.Sprintf("zone awareness enabled, len(podListForCurrentZoneWithWrongPodRevisionAndNonEmptyNodeName)=%d", len(podListForCurrentZoneWithWrongPodRevisionAndNonEmptyNodeName)))
+
+		if len(podListForCurrentZoneWithWrongPodRevisionAndNonEmptyNodeName) > 0 {
+			newZoneUnderMaintenance, err := kubernetes.GetZoneForNodeName(ctx, r, podListForCurrentZoneWithWrongPodRevisionAndNonEmptyNodeName[0].Spec.NodeName)
 			if err != nil {
 				return reconcile.Result{}, r.logErrorAndReturn(err, "unable to fetch zone")
 			}
diff --git a/controllers/humiocluster_pods.go b/controllers/humiocluster_pods.go
index db06068c..5c292082 100644
--- a/controllers/humiocluster_pods.go
+++ b/controllers/humiocluster_pods.go
@@ -1116,3 +1116,14 @@ func FilterPodsExcludePodsWithPodRevision(podList []corev1.Pod, podRevisionToExc
 	}
 	return filteredPodList
 }
+
+func FilterPodsExcludePodsWithEmptyNodeName(podList []corev1.Pod) []corev1.Pod {
+	filteredPodList := []corev1.Pod{}
+	for _, pod := range podList {
+		if pod.Spec.NodeName == "" {
+			continue
+		}
+		filteredPodList = append(filteredPodList, pod)
+	}
+	return filteredPodList
+}
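
Reviewer note (not part of the patch): the standalone sketch below mirrors the zone-selection flow this change introduces, so it can be run and poked at while reviewing. The FilterPodsExcludePodsWithEmptyNodeName logic is copied locally rather than imported from the controllers package, the pod names, node names and node-to-zone map are made up for illustration, and in the operator the zone is resolved via kubernetes.GetZoneForNodeName from the node's zone labels. It assumes k8s.io/api and k8s.io/apimachinery are available on the module path, as they are in this repository.

// zone_selection_sketch.go (hypothetical file, illustration only)
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// filterPodsExcludePodsWithEmptyNodeName is a local copy of the helper added in
// controllers/humiocluster_pods.go: pods that have not been scheduled yet
// (empty Spec.NodeName) are skipped so no zone lookup is attempted for them.
func filterPodsExcludePodsWithEmptyNodeName(podList []corev1.Pod) []corev1.Pod {
	filtered := []corev1.Pod{}
	for _, pod := range podList {
		if pod.Spec.NodeName == "" {
			continue
		}
		filtered = append(filtered, pod)
	}
	return filtered
}

func main() {
	// Made-up node-to-zone mapping; the operator resolves this through
	// kubernetes.GetZoneForNodeName using the node's zone labels.
	nodeZones := map[string]string{
		"node-a": "eu-west-1a",
		"node-b": "eu-west-1b",
	}

	// Pods still running the wrong pod revision; one of them is unscheduled.
	podsWithWrongRevision := []corev1.Pod{
		{ObjectMeta: metav1.ObjectMeta{Name: "humio-core-0"}, Spec: corev1.PodSpec{NodeName: "node-a"}},
		{ObjectMeta: metav1.ObjectMeta{Name: "humio-core-1"}},
		{ObjectMeta: metav1.ObjectMeta{Name: "humio-core-2"}, Spec: corev1.PodSpec{NodeName: "node-b"}},
	}

	scheduled := filterPodsExcludePodsWithEmptyNodeName(podsWithWrongRevision)
	if len(scheduled) == 0 {
		// Corresponds to the controller clearing ZoneUnderMaintenance once no
		// scheduled pods with the wrong revision remain in the current zone.
		fmt.Println("no scheduled pods need replacing; zone marker would be cleared")
		return
	}

	// The controller records the zone of the first remaining pod as
	// ZoneUnderMaintenance and replaces pods in that zone before moving on.
	zone := nodeZones[scheduled[0].Spec.NodeName]
	fmt.Printf("zone under maintenance: %s (from pod %s on %s)\n",
		zone, scheduled[0].Name, scheduled[0].Spec.NodeName)
}

Running the sketch prints the zone that would be recorded as ZoneUnderMaintenance; the empty-result branch corresponds to clearing the marker once the current zone is done.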