diff --git a/Makefile b/Makefile index 542916b9..a2d7c8b0 100644 --- a/Makefile +++ b/Makefile @@ -52,8 +52,7 @@ test: manifests generate fmt vet ginkgo ## Run tests. go install sigs.k8s.io/controller-runtime/tools/setup-envtest@latest $(SHELL) -c "\ eval \$$($(GOBIN)/setup-envtest use -p env ${TEST_K8S_VERSION}); \ - export USE_CERTMANAGER=false; \ - export TEST_USE_EXISTING_CLUSTER=false; \ + export TEST_USING_ENVTEST=true; \ $(GINKGO) --label-filter=envtest -vv --no-color --procs 3 -output-dir=${PWD} -keep-separate-reports -race --junit-report=test-results-junit.xml --randomize-suites --randomize-all -timeout 10m ./... -covermode=count -coverprofile cover.out \ " diff --git a/api/v1alpha1/humiocluster_types.go b/api/v1alpha1/humiocluster_types.go index ec2f8001..d7bbb592 100644 --- a/api/v1alpha1/humiocluster_types.go +++ b/api/v1alpha1/humiocluster_types.go @@ -282,8 +282,14 @@ type HumioUpdateStrategy struct { // +kubebuilder:validation:Enum=OnDelete;RollingUpdate;ReplaceAllOnUpdate;RollingUpdateBestEffort Type string `json:"type,omitempty"` - // The minimum time in seconds that a pod must be ready before the next pod can be deleted when doing rolling update. + // MinReadySeconds is the minimum time in seconds that a pod must be ready before the next pod can be deleted when doing rolling update. MinReadySeconds int32 `json:"minReadySeconds,omitempty"` + + // EnableZoneAwareness toggles zone awareness on or off during updates. When enabled, the pod replacement logic + // will go through all pods in a specific zone before it starts replacing pods in the next zone. + // If pods are failing, they bypass the zone limitation and are restarted immediately - ignoring the zone. + // Zone awareness is enabled by default. + EnableZoneAwareness *bool `json:"enableZoneAwareness,omitempty"` } type HumioNodePoolSpec struct { @@ -384,6 +390,8 @@ type HumioNodePoolStatus struct { State string `json:"state,omitempty"` // DesiredPodRevision holds the desired pod revision for pods of the given node pool. DesiredPodRevision int `json:"desiredPodRevision,omitempty"` + // ZoneUnderMaintenance holds the name of the availability zone currently under maintenance + ZoneUnderMaintenance string `json:"zoneUnderMaintenance,omitempty"` } // HumioClusterStatus defines the observed state of HumioCluster diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 4b7b833e..9daf5858 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -1460,7 +1460,7 @@ func (in *HumioNodeSpec) DeepCopyInto(out *HumioNodeSpec) { if in.UpdateStrategy != nil { in, out := &in.UpdateStrategy, &out.UpdateStrategy *out = new(HumioUpdateStrategy) - **out = **in + (*in).DeepCopyInto(*out) } in.NodePoolFeatures.DeepCopyInto(&out.NodePoolFeatures) } @@ -1895,6 +1895,11 @@ func (in *HumioTokenSecretStatus) DeepCopy() *HumioTokenSecretStatus { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *HumioUpdateStrategy) DeepCopyInto(out *HumioUpdateStrategy) { *out = *in + if in.EnableZoneAwareness != nil { + in, out := &in.EnableZoneAwareness, &out.EnableZoneAwareness + *out = new(bool) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HumioUpdateStrategy. 
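As an illustration of the API surface added above, a minimal HumioCluster manifest using the new updateStrategy fields might look like the following sketch; the metadata name and the minReadySeconds value are placeholders rather than values taken from this change, and the image tag is only borrowed from the test suite touched later in this diff:

apiVersion: core.humio.com/v1alpha1
kind: HumioCluster
metadata:
  name: example-humiocluster          # placeholder name
spec:
  image: humio/humio-core:1.150.0     # example image tag
  updateStrategy:
    type: RollingUpdate               # one of OnDelete, RollingUpdate, ReplaceAllOnUpdate, RollingUpdateBestEffort
    minReadySeconds: 120              # placeholder value
    enableZoneAwareness: false        # defaults to true when omitted; false disables zone-by-zone pod replacement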
diff --git a/charts/humio-operator/crds/core.humio.com_humioclusters.yaml b/charts/humio-operator/crds/core.humio.com_humioclusters.yaml index 6199bf43..4112c5c6 100644 --- a/charts/humio-operator/crds/core.humio.com_humioclusters.yaml +++ b/charts/humio-operator/crds/core.humio.com_humioclusters.yaml @@ -13082,10 +13082,17 @@ spec: UpdateStrategy controls how Humio pods are updated when changes are made to the HumioCluster resource that results in a change to the Humio pods properties: + enableZoneAwareness: + description: |- + EnableZoneAwareness toggles zone awareness on or off during updates. When enabled, the pod replacement logic + will go through all pods in a specific zone before it starts replacing pods in the next zone. + If pods are failing, they bypass the zone limitation and are restarted immediately - ignoring the zone. + Zone awareness is enabled by default. + type: boolean minReadySeconds: - description: The minimum time in seconds that a pod - must be ready before the next pod can be deleted when - doing rolling update. + description: MinReadySeconds is the minimum time in + seconds that a pod must be ready before the next pod + can be deleted when doing rolling update. format: int32 type: integer type: @@ -14987,9 +14994,17 @@ spec: UpdateStrategy controls how Humio pods are updated when changes are made to the HumioCluster resource that results in a change to the Humio pods properties: + enableZoneAwareness: + description: |- + EnableZoneAwareness toggles zone awareness on or off during updates. When enabled, the pod replacement logic + will go through all pods in a specific zone before it starts replacing pods in the next zone. + If pods are failing, they bypass the zone limitation and are restarted immediately - ignoring the zone. + Zone awareness is enabled by default. + type: boolean minReadySeconds: - description: The minimum time in seconds that a pod must be ready - before the next pod can be deleted when doing rolling update. + description: MinReadySeconds is the minimum time in seconds that + a pod must be ready before the next pod can be deleted when + doing rolling update. format: int32 type: integer type: @@ -15062,6 +15077,10 @@ spec: From there it can be "Running", "Upgrading", "Restarting" or "Pending" type: string + zoneUnderMaintenance: + description: ZoneUnderMaintenance holds the name of the availability + zone currently under maintenance + type: string required: - name type: object diff --git a/config/crd/bases/core.humio.com_humioclusters.yaml b/config/crd/bases/core.humio.com_humioclusters.yaml index 6199bf43..4112c5c6 100644 --- a/config/crd/bases/core.humio.com_humioclusters.yaml +++ b/config/crd/bases/core.humio.com_humioclusters.yaml @@ -13082,10 +13082,17 @@ spec: UpdateStrategy controls how Humio pods are updated when changes are made to the HumioCluster resource that results in a change to the Humio pods properties: + enableZoneAwareness: + description: |- + EnableZoneAwareness toggles zone awareness on or off during updates. When enabled, the pod replacement logic + will go through all pods in a specific zone before it starts replacing pods in the next zone. + If pods are failing, they bypass the zone limitation and are restarted immediately - ignoring the zone. + Zone awareness is enabled by default. + type: boolean minReadySeconds: - description: The minimum time in seconds that a pod - must be ready before the next pod can be deleted when - doing rolling update. 
+ description: MinReadySeconds is the minimum time in + seconds that a pod must be ready before the next pod + can be deleted when doing rolling update. format: int32 type: integer type: @@ -14987,9 +14994,17 @@ spec: UpdateStrategy controls how Humio pods are updated when changes are made to the HumioCluster resource that results in a change to the Humio pods properties: + enableZoneAwareness: + description: |- + EnableZoneAwareness toggles zone awareness on or off during updates. When enabled, the pod replacement logic + will go through all pods in a specific zone before it starts replacing pods in the next zone. + If pods are failing, they bypass the zone limitation and are restarted immediately - ignoring the zone. + Zone awareness is enabled by default. + type: boolean minReadySeconds: - description: The minimum time in seconds that a pod must be ready - before the next pod can be deleted when doing rolling update. + description: MinReadySeconds is the minimum time in seconds that + a pod must be ready before the next pod can be deleted when + doing rolling update. format: int32 type: integer type: @@ -15062,6 +15077,10 @@ spec: From there it can be "Running", "Upgrading", "Restarting" or "Pending" type: string + zoneUnderMaintenance: + description: ZoneUnderMaintenance holds the name of the availability + zone currently under maintenance + type: string required: - name type: object diff --git a/controllers/humiocluster_controller.go b/controllers/humiocluster_controller.go index 7ce6466b..b203a851 100644 --- a/controllers/humiocluster_controller.go +++ b/controllers/humiocluster_controller.go @@ -129,17 +129,17 @@ func (r *HumioClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request if err := r.setImageFromSource(ctx, pool); err != nil { return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). withMessage(err.Error()). - withNodePoolState(humiov1alpha1.HumioClusterStateConfigError, pool.GetNodePoolName(), pool.GetDesiredPodRevision())) + withNodePoolState(humiov1alpha1.HumioClusterStateConfigError, pool.GetNodePoolName(), pool.GetDesiredPodRevision(), pool.GetZoneUnderMaintenance())) } if err := r.ensureValidHumioVersion(pool); err != nil { return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). withMessage(err.Error()). - withNodePoolState(humiov1alpha1.HumioClusterStateConfigError, pool.GetNodePoolName(), pool.GetDesiredPodRevision())) + withNodePoolState(humiov1alpha1.HumioClusterStateConfigError, pool.GetNodePoolName(), pool.GetDesiredPodRevision(), pool.GetZoneUnderMaintenance())) } if err := r.ensureValidStorageConfiguration(pool); err != nil { return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). withMessage(err.Error()). - withNodePoolState(humiov1alpha1.HumioClusterStateConfigError, pool.GetNodePoolName(), pool.GetDesiredPodRevision())) + withNodePoolState(humiov1alpha1.HumioClusterStateConfigError, pool.GetNodePoolName(), pool.GetDesiredPodRevision(), pool.GetZoneUnderMaintenance())) } } @@ -198,7 +198,7 @@ func (r *HumioClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request if err := r.validateInitialPodSpec(pool); err != nil { return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). withMessage(err.Error()). 
- withNodePoolState(humiov1alpha1.HumioClusterStateConfigError, pool.GetNodePoolName(), pool.GetDesiredPodRevision())) + withNodePoolState(humiov1alpha1.HumioClusterStateConfigError, pool.GetNodePoolName(), pool.GetDesiredPodRevision(), pool.GetZoneUnderMaintenance())) } } @@ -252,7 +252,7 @@ func (r *HumioClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request desiredPodRevision++ } _, err = r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). - withNodePoolState(hc.Status.State, pool.GetNodePoolName(), desiredPodRevision)) + withNodePoolState(hc.Status.State, pool.GetNodePoolName(), desiredPodRevision, "")) return reconcile.Result{Requeue: true}, err } } @@ -261,7 +261,7 @@ func (r *HumioClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request if err := r.ensurePersistentVolumeClaimsExist(ctx, hc, pool); err != nil { opts := statusOptions() if hc.Status.State != humiov1alpha1.HumioClusterStateRestarting && hc.Status.State != humiov1alpha1.HumioClusterStateUpgrading { - opts.withNodePoolState(humiov1alpha1.HumioClusterStatePending, pool.GetNodePoolName(), pool.GetDesiredPodRevision()) + opts.withNodePoolState(humiov1alpha1.HumioClusterStatePending, pool.GetNodePoolName(), pool.GetDesiredPodRevision(), pool.GetZoneUnderMaintenance()) } return r.updateStatus(ctx, r.Client.Status(), hc, opts. withMessage(err.Error())) @@ -1833,17 +1833,17 @@ func (r *HumioClusterReconciler) ensureHumioServiceAccountAnnotations(ctx contex // The behavior of this depends on what, if anything, was changed in the pod. If there are changes that fall under a // rolling update, then the pod restart policy is set to PodRestartPolicyRolling and the reconciliation will continue if // there are any pods not in a ready state. This is so replacement pods may be created. -// If there are changes that fall under a recreate update, the the pod restart policy is set to PodRestartPolicyRecreate +// If there are changes that fall under a recreate update, then the pod restart policy is set to PodRestartPolicyRecreate // and the reconciliation will requeue and the deletions will continue to be executed until all the pods have been // removed. func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Context, hc *humiov1alpha1.HumioCluster, hnp *HumioNodePool) (reconcile.Result, error) { - foundPodList, err := kubernetes.ListPods(ctx, r, hnp.GetNamespace(), hnp.GetNodePoolLabels()) + foundPodListForNodePool, err := kubernetes.ListPods(ctx, r, hnp.GetNamespace(), hnp.GetNodePoolLabels()) if err != nil { return reconcile.Result{}, r.logErrorAndReturn(err, "failed to list pods") } // if we do not have any pods running we have nothing to delete - if len(foundPodList) == 0 { + if len(foundPodListForNodePool) == 0 { return reconcile.Result{}, nil } @@ -1854,11 +1854,6 @@ func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Cont attachments.dataVolumeSource = hnp.GetDataVolumePersistentVolumeClaimSpecTemplate("") } - podsStatus, err := r.getPodsStatus(ctx, hc, hnp, foundPodList) - if err != nil { - return reconcile.Result{}, r.logErrorAndReturn(err, "failed to get pod status") - } - envVarSourceData, err := r.getEnvVarSource(ctx, hnp) if err != nil { result, _ := r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). 
@@ -1885,17 +1880,9 @@ func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Cont } } - // prioritize deleting the pods with errors - var podList []corev1.Pod - if podsStatus.havePodsWithErrors() { - r.Log.Info(fmt.Sprintf("found %d humio pods with errors", len(podsStatus.podErrors))) - podList = podsStatus.podErrors - } else { - podList = foundPodList - } - desiredLifecycleState, err := r.getPodDesiredLifecycleState(hnp, podList, attachments) + podsStatus, err := r.getPodsStatus(ctx, hc, hnp, foundPodListForNodePool) if err != nil { - return reconcile.Result{}, r.logErrorAndReturn(err, "got error when getting pod desired lifecycle") + return reconcile.Result{}, r.logErrorAndReturn(err, "failed to get pod status") } if podsStatus.havePodsRequiringDeletion() { @@ -1908,6 +1895,71 @@ func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Cont return reconcile.Result{RequeueAfter: time.Second + 1}, nil } + // prioritize deleting the pods with errors + var podListConsideredForDeletion []corev1.Pod + podsWithErrors := false + if podsStatus.havePodsWithErrors() { + r.Log.Info(fmt.Sprintf("found %d humio pods with errors", len(podsStatus.podErrors))) + podListConsideredForDeletion = podsStatus.podErrors + podsWithErrors = true + } else { + podListConsideredForDeletion = foundPodListForNodePool + + // if zone awareness is enabled, pick a zone if we haven't picked one yet + if !helpers.UseEnvtest() { + if hnp.GetState() == humiov1alpha1.HumioClusterStateUpgrading || hnp.GetState() == humiov1alpha1.HumioClusterStateRestarting { + if *hnp.GetUpdateStrategy().EnableZoneAwareness { + // If we still have pods without the desired revision and have not picked a zone yet, pick one. + if hnp.GetZoneUnderMaintenance() == "" && len(podListConsideredForDeletion) > 0 { + // Filter out any pods that already have the right pod revision + podListForCurrentZoneWithWrongPodRevision := FilterPodsExcludePodsWithPodRevision(foundPodListForNodePool, hnp.GetDesiredPodRevision()) + r.Log.Info(fmt.Sprintf("zone awareness enabled, len(podListForCurrentZoneWithWrongPodRevision)=%d", len(podListForCurrentZoneWithWrongPodRevision))) + + // Filter out any pods with empty nodeName fields + podListForCurrentZoneWithWrongPodRevisionAndNonEmptyNodeName := FilterPodsExcludePodsWithEmptyNodeName(podListForCurrentZoneWithWrongPodRevision) + r.Log.Info(fmt.Sprintf("zone awareness enabled, len(podListForCurrentZoneWithWrongPodRevisionAndNonEmptyNodeName)=%d", len(podListForCurrentZoneWithWrongPodRevisionAndNonEmptyNodeName))) + + if len(podListForCurrentZoneWithWrongPodRevisionAndNonEmptyNodeName) > 0 { + newZoneUnderMaintenance, err := kubernetes.GetZoneForNodeName(ctx, r, podListForCurrentZoneWithWrongPodRevisionAndNonEmptyNodeName[0].Spec.NodeName) + if err != nil { + return reconcile.Result{}, r.logErrorAndReturn(err, "unable to fetch zone") + } + + return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). 
+ withNodePoolState(hnp.GetState(), hnp.GetNodePoolName(), hnp.GetDesiredPodRevision(), newZoneUnderMaintenance)) + } + } + + // If there's a zone marked as under maintenance, we clear the zone-under-maintenance marker if no more work is left in that zone + if hnp.GetZoneUnderMaintenance() != "" { + r.Log.Info(fmt.Sprintf("zone awareness enabled, len(podListConsideredForDeletion)=%d", len(podListConsideredForDeletion))) + + // We have pods left and need to filter them by the zone marked as under maintenance + podListForCurrentZone, err := FilterPodsByZoneName(ctx, r, podListConsideredForDeletion, hnp.GetZoneUnderMaintenance()) + if err != nil { + return reconcile.Result{}, r.logErrorAndReturn(err, "got error filtering pods by zone name") + } + r.Log.Info(fmt.Sprintf("zone awareness enabled, len(podListForCurrentZone)=%d", len(podListForCurrentZone))) + + // Filter out any pods that already have the right pod revision, and clear the zone-under-maintenance marker if no pods are left after filtering + podListForCurrentZoneWithWrongPodRevision := FilterPodsExcludePodsWithPodRevision(podListForCurrentZone, hnp.GetDesiredPodRevision()) + r.Log.Info(fmt.Sprintf("zone awareness enabled, len(podListForCurrentZoneWithWrongPodRevision)=%d", len(podListForCurrentZoneWithWrongPodRevision))) + if len(podListForCurrentZoneWithWrongPodRevision) == 0 { + r.Log.Info(fmt.Sprintf("zone awareness enabled, no more pods for nodePool=%s in zone=%s", hnp.GetNodePoolName(), hnp.GetZoneUnderMaintenance())) + return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). + withNodePoolState(hnp.GetState(), hnp.GetNodePoolName(), hnp.GetDesiredPodRevision(), "")) + } + } + } + } + } + } + + desiredLifecycleState, err := r.getPodDesiredLifecycleState(ctx, hnp, podListConsideredForDeletion, attachments, podsWithErrors) + if err != nil { + return reconcile.Result{}, r.logErrorAndReturn(err, "got error when getting pod desired lifecycle") + } + // If we are currently deleting pods, then check if the cluster state is Running or in a ConfigError state. If it // is, then change to an appropriate state depending on the restart policy. // If the cluster state is set as per the restart policy: @@ -1919,7 +1971,7 @@ func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Cont if desiredLifecycleState.WantsUpgrade() { r.Log.Info(fmt.Sprintf("changing cluster state from %s to %s with pod revision %d for node pool %s", hc.Status.State, humiov1alpha1.HumioClusterStateUpgrading, podRevision, hnp.GetNodePoolName())) if result, err := r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). - withNodePoolState(humiov1alpha1.HumioClusterStateUpgrading, hnp.GetNodePoolName(), podRevision)); err != nil { + withNodePoolState(humiov1alpha1.HumioClusterStateUpgrading, hnp.GetNodePoolName(), podRevision, "")); err != nil { return result, err } return reconcile.Result{Requeue: true}, nil @@ -1927,27 +1979,21 @@ func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Cont if !desiredLifecycleState.WantsUpgrade() && desiredLifecycleState.WantsRestart() { r.Log.Info(fmt.Sprintf("changing cluster state from %s to %s with pod revision %d for node pool %s", hc.Status.State, humiov1alpha1.HumioClusterStateRestarting, podRevision, hnp.GetNodePoolName())) if result, err := r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). 
- withNodePoolState(humiov1alpha1.HumioClusterStateRestarting, hnp.GetNodePoolName(), podRevision)); err != nil { + withNodePoolState(humiov1alpha1.HumioClusterStateRestarting, hnp.GetNodePoolName(), podRevision, "")); err != nil { return result, err } return reconcile.Result{Requeue: true}, nil } } if desiredLifecycleState.ShouldDeletePod() { - if hc.Status.State == humiov1alpha1.HumioClusterStateRestarting && podsStatus.waitingOnPods() && desiredLifecycleState.ShouldRollingRestart() { - r.Log.Info(fmt.Sprintf("pod %s should be deleted, but waiting because not all other pods are "+ - "ready. waitingOnPods=%v, clusterState=%s", desiredLifecycleState.pod.Name, - podsStatus.waitingOnPods(), hc.Status.State)) - return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). - withMessage("waiting for pods to become ready")) - } - - if hc.Status.State == humiov1alpha1.HumioClusterStateUpgrading && podsStatus.waitingOnPods() && desiredLifecycleState.ShouldRollingRestart() { - r.Log.Info(fmt.Sprintf("pod %s should be deleted, but waiting because not all other pods are "+ - "ready. waitingOnPods=%v, clusterState=%s", desiredLifecycleState.pod.Name, - podsStatus.waitingOnPods(), hc.Status.State)) - return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). - withMessage("waiting for pods to become ready")) + if hc.Status.State == humiov1alpha1.HumioClusterStateRestarting || hc.Status.State == humiov1alpha1.HumioClusterStateUpgrading { + if podsStatus.waitingOnPods() && desiredLifecycleState.ShouldRollingRestart() { + r.Log.Info(fmt.Sprintf("pod %s should be deleted, but waiting because not all other pods are "+ + "ready. waitingOnPods=%v, clusterState=%s", desiredLifecycleState.pod.Name, + podsStatus.waitingOnPods(), hc.Status.State)) + return r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). + withMessage("waiting for pods to become ready")) + } } var remainingMinReadyWaitTime = desiredLifecycleState.RemainingMinReadyWaitTime(podsStatus.podsReady) @@ -1984,12 +2030,12 @@ func (r *HumioClusterReconciler) ensureMismatchedPodsAreDeleted(ctx context.Cont // If we are no longer waiting on or deleting pods, and all the revisions are in sync, then we know the upgrade or // restart is complete and we can set the cluster state back to HumioClusterStateRunning. // It's possible we entered a ConfigError state during an upgrade or restart, and in this case, we should reset the - state to Running if the the pods are healthy but we're in a ConfigError state. + state to Running if the pods are healthy but we're in a ConfigError state. if !podsStatus.waitingOnPods() && !desiredLifecycleState.WantsUpgrade() && !desiredLifecycleState.WantsRestart() && podsStatus.podRevisionsInSync() { if hc.Status.State == humiov1alpha1.HumioClusterStateRestarting || hc.Status.State == humiov1alpha1.HumioClusterStateUpgrading || hc.Status.State == humiov1alpha1.HumioClusterStateConfigError { r.Log.Info(fmt.Sprintf("no longer deleting pods. changing cluster state from %s to %s", hc.Status.State, humiov1alpha1.HumioClusterStateRunning)) if result, err := r.updateStatus(ctx, r.Client.Status(), hc, statusOptions(). 
- withNodePoolState(humiov1alpha1.HumioClusterStateRunning, hnp.GetNodePoolName(), hnp.GetDesiredPodRevision())); err != nil { + withNodePoolState(humiov1alpha1.HumioClusterStateRunning, hnp.GetNodePoolName(), hnp.GetDesiredPodRevision(), "")); err != nil { return result, err } } diff --git a/controllers/humiocluster_defaults.go b/controllers/humiocluster_defaults.go index 5b4561e8..ef5658f3 100644 --- a/controllers/humiocluster_defaults.go +++ b/controllers/humiocluster_defaults.go @@ -18,7 +18,6 @@ package controllers import ( "fmt" - "os" "reflect" "strconv" "strings" @@ -82,13 +81,19 @@ type HumioNodePool struct { ingress humiov1alpha1.HumioClusterIngressSpec clusterAnnotations map[string]string desiredPodRevision int + state string + zoneUnderMaintenance string } func NewHumioNodeManagerFromHumioCluster(hc *humiov1alpha1.HumioCluster) *HumioNodePool { desiredPodRevision := 0 + zoneUnderMaintenance := "" + state := "" for _, status := range hc.Status.NodePoolStatus { if status.Name == hc.Name { desiredPodRevision = status.DesiredPodRevision + zoneUnderMaintenance = status.ZoneUnderMaintenance + state = status.State break } } @@ -151,14 +156,21 @@ func NewHumioNodeManagerFromHumioCluster(hc *humiov1alpha1.HumioCluster) *HumioN ingress: hc.Spec.Ingress, clusterAnnotations: hc.Annotations, desiredPodRevision: desiredPodRevision, + zoneUnderMaintenance: zoneUnderMaintenance, + state: state, } } func NewHumioNodeManagerFromHumioNodePool(hc *humiov1alpha1.HumioCluster, hnp *humiov1alpha1.HumioNodePoolSpec) *HumioNodePool { desiredPodRevision := 0 + zoneUnderMaintenance := "" + state := "" + for _, status := range hc.Status.NodePoolStatus { if status.Name == strings.Join([]string{hc.Name, hnp.Name}, "-") { desiredPodRevision = status.DesiredPodRevision + zoneUnderMaintenance = status.ZoneUnderMaintenance + state = status.State break } } @@ -221,6 +233,8 @@ func NewHumioNodeManagerFromHumioNodePool(hc *humiov1alpha1.HumioCluster, hnp *h ingress: hc.Spec.Ingress, clusterAnnotations: hc.Annotations, desiredPodRevision: desiredPodRevision, + zoneUnderMaintenance: zoneUnderMaintenance, + state: state, } } @@ -252,8 +266,8 @@ func (hnp *HumioNodePool) GetImage() string { return hnp.humioNodeSpec.Image } - if os.Getenv("HUMIO_OPERATOR_DEFAULT_HUMIO_CORE_IMAGE") != "" { - return os.Getenv("HUMIO_OPERATOR_DEFAULT_HUMIO_CORE_IMAGE") + if defaultImageFromEnvVar := helpers.GetDefaultHumioCoreImageFromEnvVar(); defaultImageFromEnvVar != "" { + return defaultImageFromEnvVar } return versions.DefaultHumioImageVersion() @@ -268,8 +282,8 @@ func (hnp *HumioNodePool) GetHelperImage() string { return hnp.humioNodeSpec.HelperImage } - if os.Getenv("HUMIO_OPERATOR_DEFAULT_HUMIO_HELPER_IMAGE") != "" { - return os.Getenv("HUMIO_OPERATOR_DEFAULT_HUMIO_HELPER_IMAGE") + if defaultHelperImageFromEnvVar := helpers.GetDefaultHumioHelperImageFromEnvVar(); defaultHelperImageFromEnvVar != "" { + return defaultHelperImageFromEnvVar } return versions.DefaultHelperImageVersion() @@ -308,6 +322,14 @@ func (hnp *HumioNodePool) GetDesiredPodRevision() int { return hnp.desiredPodRevision } +func (hnp *HumioNodePool) GetZoneUnderMaintenance() string { + return hnp.zoneUnderMaintenance +} + +func (hnp *HumioNodePool) GetState() string { + return hnp.state +} + func (hnp *HumioNodePool) GetIngress() humiov1alpha1.HumioClusterIngressSpec { return hnp.ingress } @@ -575,7 +597,7 @@ func (hnp *HumioNodePool) GetContainerReadinessProbe() *corev1.Probe { SuccessThreshold: 1, FailureThreshold: 10, } - if os.Getenv("DUMMY_LOGSCALE_IMAGE") == 
"true" { + if helpers.UseDummyImage() { probe.InitialDelaySeconds = 0 } return probe @@ -817,13 +839,20 @@ func (hnp *HumioNodePool) GetProbeScheme() corev1.URIScheme { } func (hnp *HumioNodePool) GetUpdateStrategy() *humiov1alpha1.HumioUpdateStrategy { + defaultZoneAwareness := true + if hnp.humioNodeSpec.UpdateStrategy != nil { + if hnp.humioNodeSpec.UpdateStrategy.EnableZoneAwareness == nil { + hnp.humioNodeSpec.UpdateStrategy.EnableZoneAwareness = &defaultZoneAwareness + } + return hnp.humioNodeSpec.UpdateStrategy } return &humiov1alpha1.HumioUpdateStrategy{ - Type: humiov1alpha1.HumioClusterUpdateStrategyReplaceAllOnUpdate, - MinReadySeconds: 0, + Type: humiov1alpha1.HumioClusterUpdateStrategyReplaceAllOnUpdate, + MinReadySeconds: 0, + EnableZoneAwareness: &defaultZoneAwareness, } } diff --git a/controllers/humiocluster_pods.go b/controllers/humiocluster_pods.go index 34e1d29c..5c292082 100644 --- a/controllers/humiocluster_pods.go +++ b/controllers/humiocluster_pods.go @@ -23,6 +23,7 @@ import ( "fmt" "reflect" "sort" + "strconv" "strings" "time" @@ -718,12 +719,12 @@ func (r *HumioClusterReconciler) waitForNewPods(ctx context.Context, hnp *HumioN return fmt.Errorf("timed out waiting to validate new pods was created") } -func (r *HumioClusterReconciler) podsMatch(hnp *HumioNodePool, pod corev1.Pod, desiredPod corev1.Pod) (bool, error) { +func (r *HumioClusterReconciler) podsMatch(hnp *HumioNodePool, pod corev1.Pod, desiredPod corev1.Pod) bool { if _, ok := pod.Annotations[PodHashAnnotation]; !ok { - return false, fmt.Errorf("did not find annotation with pod hash") + return false } if _, ok := pod.Annotations[PodRevisionAnnotation]; !ok { - return false, fmt.Errorf("did not find annotation with pod revision") + return false } var specMatches bool @@ -779,28 +780,35 @@ func (r *HumioClusterReconciler) podsMatch(hnp *HumioNodePool, pod corev1.Pod, d podSpecDiff := cmp.Diff(sanitizedCurrentPod.Spec, sanitizedDesiredPod.Spec) if !specMatches { r.Log.Info(fmt.Sprintf("pod annotation %s does not match desired pod: got %+v, expected %+v", PodHashAnnotation, pod.Annotations[PodHashAnnotation], desiredPodHash), "podSpecDiff", podSpecDiff) - return false, nil + return false } if !revisionMatches { r.Log.Info(fmt.Sprintf("pod annotation %s does not match desired pod: got %+v, expected %+v", PodRevisionAnnotation, pod.Annotations[PodRevisionAnnotation], desiredPod.Annotations[PodRevisionAnnotation]), "podSpecDiff", podSpecDiff) - return false, nil + return false } if !envVarSourceMatches { r.Log.Info(fmt.Sprintf("pod annotation %s does not match desired pod: got %+v, expected %+v", envVarSourceHashAnnotation, pod.Annotations[envVarSourceHashAnnotation], desiredPod.Annotations[envVarSourceHashAnnotation]), "podSpecDiff", podSpecDiff) - return false, nil + return false } if !certHasAnnotationMatches { r.Log.Info(fmt.Sprintf("pod annotation %s does not match desired pod: got %+v, expected %+v", certHashAnnotation, pod.Annotations[certHashAnnotation], desiredPod.Annotations[certHashAnnotation]), "podSpecDiff", podSpecDiff) - return false, nil + return false } if !bootstrapTokenAnnotationMatches { r.Log.Info(fmt.Sprintf("pod annotation %s bootstrapTokenAnnotationMatches not match desired pod: got %+v, expected %+v", bootstrapTokenHashAnnotation, pod.Annotations[bootstrapTokenHashAnnotation], desiredPod.Annotations[bootstrapTokenHashAnnotation]), "podSpecDiff", podSpecDiff) - return false, nil + return false } - return true, nil + return true } -func (r *HumioClusterReconciler) 
getPodDesiredLifecycleState(hnp *HumioNodePool, foundPodList []corev1.Pod, attachments *podAttachments) (podLifecycleState, error) { +// getPodDesiredLifecycleState goes through the list of pods and decides what action to take for the pods. +// It compares pods it is given with a newly-constructed pod. If they do not match, we know we have +// "at least" a configuration difference and require a rolling replacement of the pods. +// If the container image differs, it will indicate that a version difference is present. +// For very specific configuration differences it may indicate that all pods in the node pool should be +// replaced simultaneously. +// The value of podLifecycleState.pod indicates what pod should be replaced next. +func (r *HumioClusterReconciler) getPodDesiredLifecycleState(ctx context.Context, hnp *HumioNodePool, foundPodList []corev1.Pod, attachments *podAttachments, podsWithErrors bool) (podLifecycleState, error) { for _, pod := range foundPodList { podLifecycleStateValue := NewPodLifecycleState(*hnp, pod) @@ -822,10 +830,7 @@ func (r *HumioClusterReconciler) getPodDesiredLifecycleState(hnp *HumioNodePool, desiredPod.Annotations[bootstrapTokenHashAnnotation] = attachments.bootstrapTokenSecretReference.hash } - podsMatch, err := r.podsMatch(hnp, pod, *desiredPod) - if err != nil { - r.Log.Error(err, "failed to check if pods match") - } + podsMatch := r.podsMatch(hnp, pod, *desiredPod) // ignore pod if it matches the desired pod if podsMatch { @@ -855,6 +860,29 @@ func (r *HumioClusterReconciler) getPodDesiredLifecycleState(hnp *HumioNodePool, podLifecycleStateValue.configurationDifference.requiresSimultaneousRestart = true } + // if we run with envtest, we won't have zone information available + if !helpers.UseEnvtest() { + // if zone awareness is enabled, ignore pod if zone is incorrect + if !podsWithErrors && *hnp.GetUpdateStrategy().EnableZoneAwareness { + r.Log.Info(fmt.Sprintf("zone awareness enabled, looking up zone for pod=%s", pod.Name)) + if pod.Spec.NodeName == "" { + // current pod does not have a nodeName set + r.Log.Info(fmt.Sprintf("pod=%s does not have a nodeName set, ignoring", pod.Name)) + continue + } + + // fetch zone for node name and ignore pod if zone is not the one that is marked as under maintenance + zoneForNodeName, err := kubernetes.GetZoneForNodeName(ctx, r, pod.Spec.NodeName) + if err != nil { + return podLifecycleState{}, r.logErrorAndReturn(err, "could not get zone name for node") + } + if hnp.GetZoneUnderMaintenance() != "" && zoneForNodeName != hnp.GetZoneUnderMaintenance() { + r.Log.Info(fmt.Sprintf("ignoring pod=%s as zoneUnderMaintenance=%s but pod has nodeName=%s where zone=%s", pod.Name, hnp.GetZoneUnderMaintenance(), pod.Spec.NodeName, zoneForNodeName)) + continue + } + } + } + return *podLifecycleStateValue, nil } return podLifecycleState{}, nil @@ -1061,3 +1089,41 @@ func findPodForPvc(podList []corev1.Pod, pvc corev1.PersistentVolumeClaim) (core return corev1.Pod{}, fmt.Errorf("could not find a pod for pvc %s", pvc.Name) } +func FilterPodsByZoneName(ctx context.Context, c client.Client, podList []corev1.Pod, zoneName string) ([]corev1.Pod, error) { + filteredPodList := []corev1.Pod{} + for _, pod := range podList { + zoneForNodeName, err := kubernetes.GetZoneForNodeName(ctx, c, pod.Spec.NodeName) + if err != nil { + return nil, err + } + if zoneForNodeName == zoneName { + filteredPodList = append(filteredPodList, pod) + } + } + return filteredPodList, nil +} + +func FilterPodsExcludePodsWithPodRevision(podList []corev1.Pod, 
podRevisionToExclude int) []corev1.Pod { + filteredPodList := []corev1.Pod{} + for _, pod := range podList { + podRevision, found := pod.Annotations[PodRevisionAnnotation] + if found { + if strconv.Itoa(podRevisionToExclude) == podRevision { + continue + } + } + filteredPodList = append(filteredPodList, pod) + } + return filteredPodList +} + +func FilterPodsExcludePodsWithEmptyNodeName(podList []corev1.Pod) []corev1.Pod { + filteredPodList := []corev1.Pod{} + for _, pod := range podList { + if pod.Spec.NodeName == "" { + continue + } + filteredPodList = append(filteredPodList, pod) + } + return filteredPodList +} diff --git a/controllers/humiocluster_status.go b/controllers/humiocluster_status.go index 8024552c..77e84619 100644 --- a/controllers/humiocluster_status.go +++ b/controllers/humiocluster_status.go @@ -48,9 +48,10 @@ type messageOption struct { } type stateOption struct { - state string - nodePoolName string - desiredPodRevision int + state string + nodePoolName string + desiredPodRevision int + zoneUnderMaintenance string } type stateOptionList struct { @@ -101,11 +102,12 @@ func (o *optionBuilder) withState(state string) *optionBuilder { return o } -func (o *optionBuilder) withNodePoolState(state string, nodePoolName string, podRevision int) *optionBuilder { +func (o *optionBuilder) withNodePoolState(state string, nodePoolName string, podRevision int, zoneName string) *optionBuilder { o.options = append(o.options, stateOption{ - state: state, - nodePoolName: nodePoolName, - desiredPodRevision: podRevision, + state: state, + nodePoolName: nodePoolName, + desiredPodRevision: podRevision, + zoneUnderMaintenance: zoneName, }) return o } @@ -113,7 +115,12 @@ func (o *optionBuilder) withNodePoolState(state string, nodePoolName string, pod func (o *optionBuilder) withNodePoolStatusList(humioNodePoolStatusList humiov1alpha1.HumioNodePoolStatusList) *optionBuilder { var statesList []stateOption for _, poolStatus := range humioNodePoolStatusList { - statesList = append(statesList, stateOption{nodePoolName: poolStatus.Name, state: poolStatus.State, desiredPodRevision: poolStatus.DesiredPodRevision}) + statesList = append(statesList, stateOption{ + nodePoolName: poolStatus.Name, + state: poolStatus.State, + desiredPodRevision: poolStatus.DesiredPodRevision, + zoneUnderMaintenance: poolStatus.ZoneUnderMaintenance, + }) } o.options = append(o.options, stateOptionList{ statesList: statesList, @@ -174,15 +181,17 @@ func (s stateOption) Apply(hc *humiov1alpha1.HumioCluster) { if nodePoolStatus.Name == s.nodePoolName { nodePoolStatus.State = s.state nodePoolStatus.DesiredPodRevision = s.desiredPodRevision + nodePoolStatus.ZoneUnderMaintenance = s.zoneUnderMaintenance hc.Status.NodePoolStatus[idx] = nodePoolStatus return } } hc.Status.NodePoolStatus = append(hc.Status.NodePoolStatus, humiov1alpha1.HumioNodePoolStatus{ - Name: s.nodePoolName, - State: s.state, - DesiredPodRevision: s.desiredPodRevision, + Name: s.nodePoolName, + State: s.state, + DesiredPodRevision: s.desiredPodRevision, + ZoneUnderMaintenance: s.zoneUnderMaintenance, }) } } @@ -202,9 +211,10 @@ func (s stateOptionList) Apply(hc *humiov1alpha1.HumioCluster) { hc.Status.NodePoolStatus = humiov1alpha1.HumioNodePoolStatusList{} for _, poolStatus := range s.statesList { hc.Status.NodePoolStatus = append(hc.Status.NodePoolStatus, humiov1alpha1.HumioNodePoolStatus{ - Name: poolStatus.nodePoolName, - State: poolStatus.state, - DesiredPodRevision: poolStatus.desiredPodRevision, + Name: poolStatus.nodePoolName, + State: poolStatus.state, + 
DesiredPodRevision: poolStatus.desiredPodRevision, + ZoneUnderMaintenance: poolStatus.zoneUnderMaintenance, }) } } diff --git a/controllers/suite/clusters/humiocluster_controller_test.go b/controllers/suite/clusters/humiocluster_controller_test.go index 8998e023..872e30c1 100644 --- a/controllers/suite/clusters/humiocluster_controller_test.go +++ b/controllers/suite/clusters/humiocluster_controller_test.go @@ -19,9 +19,9 @@ package clusters import ( "context" "fmt" - "os" "reflect" "strings" + "time" cmapi "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1" humiov1alpha1 "github.com/humio/humio-operator/api/v1alpha1" @@ -40,6 +40,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" + "sigs.k8s.io/controller-runtime/pkg/client" ) var _ = Describe("HumioCluster Controller", func() { @@ -533,7 +534,7 @@ var _ = Describe("HumioCluster Controller", func() { Eventually(func() []corev1.Pod { var clusterPods []corev1.Pod clusterPods, _ = kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster).GetPodLabels()) - _ = suite.MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = suite.MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) return clusterPods }, testTimeout, suite.TestInterval).Should(HaveLen(toCreate.Spec.NodeCount)) @@ -1129,7 +1130,7 @@ var _ = Describe("HumioCluster Controller", func() { suite.UsingClusterBy(key.Name, "Simulating mock pods to be scheduled") clusterPods, _ = kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(toCreate).GetPodLabels()) - _ = suite.MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = suite.MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) suite.UsingClusterBy(key.Name, "Waiting for humio cluster state to be Running") Eventually(func() string { @@ -1205,7 +1206,7 @@ var _ = Describe("HumioCluster Controller", func() { suite.UsingClusterBy(key.Name, "Validating pod uses default helper image as init container") Eventually(func() string { clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(toCreate).GetPodLabels()) - _ = suite.MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = suite.MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) for _, pod := range clusterPods { initIdx, _ := kubernetes.GetInitContainerIndexByName(pod, controllers.InitContainerName) @@ -1269,7 +1270,7 @@ var _ = Describe("HumioCluster Controller", func() { suite.UsingClusterBy(key.Name, "Validating pod bootstrap token annotation hash") Eventually(func() string { clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(toCreate).GetPodLabels()) - _ = suite.MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = suite.MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) if len(clusterPods) > 0 { return clusterPods[0].Annotations["humio.com/bootstrap-token-hash"] @@ -1304,7 +1305,7 @@ var _ = Describe("HumioCluster Controller", func() { suite.UsingClusterBy(key.Name, "Validating pod is recreated with the new bootstrap token hash annotation") Eventually(func() string { clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(toCreate).GetPodLabels()) - _ = suite.MarkPodsAsRunning(ctx, k8sClient, 
clusterPods, key.Name) + _ = suite.MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) if len(clusterPods) > 0 { return clusterPods[0].Annotations["humio.com/bootstrap-token-hash"] @@ -3336,15 +3337,15 @@ var _ = Describe("HumioCluster Controller", func() { suite.CreateAndBootstrapCluster(ctx, k8sClient, testHumioClient, toCreate, true, humiov1alpha1.HumioClusterStateRunning, testTimeout) defer suite.CleanupCluster(ctx, k8sClient, toCreate) - initialExpectedVolumesCount := 5 - initialExpectedHumioContainerVolumeMountsCount := 4 + initialExpectedVolumesCount := 5 // shared, tmp, humio-data, extra-kafka-configs, init-service-account-secret + initialExpectedHumioContainerVolumeMountsCount := 4 // shared, tmp, humio-data, extra-kafka-configs - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" { + if !helpers.UseEnvtest() { // k8s will automatically inject a service account token initialExpectedVolumesCount += 1 // kube-api-access- initialExpectedHumioContainerVolumeMountsCount += 1 // kube-api-access- - if helpers.UseCertManager() { + if helpers.TLSEnabled(toCreate) { initialExpectedVolumesCount += 1 // tls-cert initialExpectedHumioContainerVolumeMountsCount += 1 // tls-cert } @@ -4525,7 +4526,7 @@ var _ = Describe("HumioCluster Controller", func() { suite.UsingClusterBy(key.Name, "Validating pod is created with the default grace period") Eventually(func() int64 { clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(toCreate).GetPodLabels()) - _ = suite.MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = suite.MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) for _, pod := range clusterPods { if pod.Spec.TerminationGracePeriodSeconds != nil { @@ -4985,4 +4986,136 @@ var _ = Describe("HumioCluster Controller", func() { }, testTimeout, suite.TestInterval).Should(HaveKeyWithValue(kubernetes.NodePoolLabelName, key.Name)) }) }) + + Context("test rolling update with max unavailable absolute value", Label("envtest", "dummy"), func() { + It("Update should correctly replace pods to use new image in a rolling fashion", func() { + key := types.NamespacedName{ + Name: "hc-update-absolute-maxunavail", + Namespace: testProcessNamespace, + } + maxUnavailable := intstr.FromInt32(1) + toCreate := suite.ConstructBasicSingleNodeHumioCluster(key, true) + toCreate.Spec.Image = versions.OldSupportedHumioVersion() + toCreate.Spec.NodeCount = 9 + toCreate.Spec.UpdateStrategy = &humiov1alpha1.HumioUpdateStrategy{ + Type: humiov1alpha1.HumioClusterUpdateStrategyRollingUpdate, + } + + suite.UsingClusterBy(key.Name, "Creating the cluster successfully") + ctx := context.Background() + suite.CreateAndBootstrapCluster(ctx, k8sClient, testHumioClient, toCreate, true, humiov1alpha1.HumioClusterStateRunning, testTimeout) + defer suite.CleanupCluster(ctx, k8sClient, toCreate) + + var updatedHumioCluster humiov1alpha1.HumioCluster + clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(toCreate).GetPodLabels()) + for _, pod := range clusterPods { + humioIndex, _ := kubernetes.GetContainerIndexByName(pod, controllers.HumioContainerName) + Expect(pod.Spec.Containers[humioIndex].Image).To(BeIdenticalTo(toCreate.Spec.Image)) + Expect(pod.Annotations).To(HaveKeyWithValue(controllers.PodRevisionAnnotation, "1")) + } + updatedHumioCluster = humiov1alpha1.HumioCluster{} + Expect(k8sClient.Get(ctx, key, &updatedHumioCluster)).Should(Succeed()) + 
Expect(controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster).GetDesiredPodRevision()).To(BeEquivalentTo(1)) + + mostSeenUnavailable := 0 + forever := make(chan struct{}) + ctx2, cancel := context.WithCancel(context.Background()) + + // TODO: Consider refactoring goroutine to a "watcher". https://book-v1.book.kubebuilder.io/beyond_basics/controller_watches + // Using a for-loop executing ListPods will only see snapshots in time and we could easily miss + // a point in time where we have too many pods that are not ready. + go func(ctx2 context.Context, k8sClient client.Client, toCreate humiov1alpha1.HumioCluster) { + hnp := controllers.NewHumioNodeManagerFromHumioCluster(&toCreate) + for { + select { + case <-ctx2.Done(): // if cancel() execute + forever <- struct{}{} + return + default: + // Assume all is unavailable, and decrement number each time we see one that is working + unavailableThisRound := hnp.GetNodeCount() + + pods, _ := kubernetes.ListPods(ctx2, k8sClient, hnp.GetNamespace(), hnp.GetPodLabels()) + suite.UsingClusterBy(key.Name, fmt.Sprintf("goroutine looking for unavailable pods: len(pods)=%d", len(pods))) + for _, pod := range pods { + suite.UsingClusterBy(key.Name, fmt.Sprintf("goroutine looking for unavailable pods: pod.Status.Phase=%s", pod.Status.Phase)) + + if pod.Status.Phase == corev1.PodFailed { + suite.UsingClusterBy(key.Name, fmt.Sprintf("goroutine looking for unavailable pods, full pod dump of failing pod: %+v", pod)) + var eventList corev1.EventList + _ = k8sClient.List(ctx2, &eventList) + for _, event := range eventList.Items { + if event.InvolvedObject.UID == pod.UID { + suite.UsingClusterBy(key.Name, fmt.Sprintf("Found event for failing pod: involvedObject=%+v, reason=%s, message=%s, source=%+v", event.InvolvedObject, event.Reason, event.Message, event.Source)) + } + } + } + + if pod.Status.Phase == corev1.PodRunning { + for idx, containerStatus := range pod.Status.ContainerStatuses { + suite.UsingClusterBy(key.Name, fmt.Sprintf("goroutine looking for unavailable pods: pod.Status.ContainerStatuses[%d]=%+v", idx, containerStatus)) + if containerStatus.Ready { + unavailableThisRound-- + } + } + } + } + // Save the number of unavailable pods in this round + mostSeenUnavailable = max(mostSeenUnavailable, unavailableThisRound) + } + time.Sleep(1 * time.Second) + } + }(ctx2, k8sClient, *toCreate) + + suite.UsingClusterBy(key.Name, "Updating the cluster image successfully") + updatedImage := versions.DefaultHumioImageVersion() + Eventually(func() error { + updatedHumioCluster = humiov1alpha1.HumioCluster{} + err := k8sClient.Get(ctx, key, &updatedHumioCluster) + if err != nil { + return err + } + updatedHumioCluster.Spec.Image = updatedImage + return k8sClient.Update(ctx, &updatedHumioCluster) + }, testTimeout, suite.TestInterval).Should(Succeed()) + + Eventually(func() string { + updatedHumioCluster = humiov1alpha1.HumioCluster{} + Expect(k8sClient.Get(ctx, key, &updatedHumioCluster)).Should(Succeed()) + return updatedHumioCluster.Status.State + }, testTimeout, suite.TestInterval).Should(BeIdenticalTo(humiov1alpha1.HumioClusterStateUpgrading)) + + ensurePodsRollingRestart(ctx, controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster), 2) + + Eventually(func() string { + updatedHumioCluster = humiov1alpha1.HumioCluster{} + Expect(k8sClient.Get(ctx, key, &updatedHumioCluster)).Should(Succeed()) + return updatedHumioCluster.Status.State + }, testTimeout, suite.TestInterval).Should(BeIdenticalTo(humiov1alpha1.HumioClusterStateRunning)) + + 
suite.UsingClusterBy(key.Name, "Confirming pod revision is the same for all pods and the cluster itself") + updatedHumioCluster = humiov1alpha1.HumioCluster{} + Expect(k8sClient.Get(ctx, key, &updatedHumioCluster)).Should(Succeed()) + Expect(controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster).GetDesiredPodRevision()).To(BeEquivalentTo(2)) + + updatedClusterPods, _ := kubernetes.ListPods(ctx, k8sClient, updatedHumioCluster.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster).GetPodLabels()) + Expect(updatedClusterPods).To(HaveLen(toCreate.Spec.NodeCount)) + for _, pod := range updatedClusterPods { + humioIndex, _ := kubernetes.GetContainerIndexByName(pod, controllers.HumioContainerName) + Expect(pod.Spec.Containers[humioIndex].Image).To(BeIdenticalTo(updatedImage)) + Expect(pod.Annotations).To(HaveKeyWithValue(controllers.PodRevisionAnnotation, "2")) + } + + cancel() + <-forever + + if helpers.TLSEnabled(&updatedHumioCluster) { + suite.UsingClusterBy(key.Name, "Ensuring pod names are not changed") + Expect(podNames(clusterPods)).To(Equal(podNames(updatedClusterPods))) + } + + suite.UsingClusterBy(key.Name, fmt.Sprintf("Verifying we do not have too many unavailable pods during pod replacements, mostSeenUnavailable(%d) <= maxUnavailable(%d)", mostSeenUnavailable, maxUnavailable.IntValue())) + Expect(mostSeenUnavailable).To(BeNumerically("<=", maxUnavailable.IntValue())) + }) + }) }) diff --git a/controllers/suite/clusters/suite_test.go b/controllers/suite/clusters/suite_test.go index e75f3f7a..3f192126 100644 --- a/controllers/suite/clusters/suite_test.go +++ b/controllers/suite/clusters/suite_test.go @@ -20,7 +20,6 @@ import ( "context" "encoding/json" "fmt" - "os" "path/filepath" "sort" "strconv" @@ -87,17 +86,17 @@ var _ = BeforeSuite(func() { By("bootstrapping test environment") useExistingCluster := true testProcessNamespace = fmt.Sprintf("e2e-clusters-%d", GinkgoParallelProcess()) - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" { + if !helpers.UseEnvtest() { testTimeout = time.Second * 900 testEnv = &envtest.Environment{ UseExistingCluster: &useExistingCluster, } - if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" { + if helpers.UseDummyImage() { testHumioClient = humio.NewMockClient() } else { testHumioClient = humio.NewClient(log, "") By("Verifying we have a valid license, as tests will require starting up real LogScale containers") - Expect(os.Getenv("HUMIO_E2E_LICENSE")).NotTo(BeEmpty()) + Expect(helpers.GetE2ELicenseFromEnvVar()).NotTo(BeEmpty()) } } else { testTimeout = time.Second * 30 @@ -337,8 +336,8 @@ func constructBasicMultiNodePoolHumioCluster(key types.NamespacedName, useAutoCr return toCreate } -func markPodAsPending(ctx context.Context, client client.Client, nodeID int, pod corev1.Pod, clusterName string) error { - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" { +func markPodAsPendingIfUsingEnvtest(ctx context.Context, client client.Client, nodeID int, pod corev1.Pod, clusterName string) error { + if !helpers.UseEnvtest() { return nil } @@ -356,8 +355,8 @@ func markPodAsPending(ctx context.Context, client client.Client, nodeID int, pod return client.Status().Update(ctx, &pod) } -func markPodsWithRevisionAsReady(ctx context.Context, hnp *controllers.HumioNodePool, podRevision int, desiredReadyPodCount int) { - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" { +func markPodsWithRevisionAsReadyIfUsingEnvTest(ctx context.Context, hnp *controllers.HumioNodePool, podRevision int, desiredReadyPodCount int) { + if 
!helpers.UseEnvtest() { return } foundPodList, _ := kubernetes.ListPods(ctx, k8sClient, hnp.GetNamespace(), hnp.GetNodePoolLabels()) @@ -390,7 +389,7 @@ func markPodsWithRevisionAsReady(ctx context.Context, hnp *controllers.HumioNode for i := range podListWithRevision { suite.UsingClusterBy(hnp.GetClusterName(), fmt.Sprintf("Considering pod %s with podIP %s", podListWithRevision[i].Name, podListWithRevision[i].Status.PodIP)) if podListWithRevision[i].Status.PodIP == "" { - err := suite.MarkPodAsRunning(ctx, k8sClient, podListWithRevision[i], hnp.GetClusterName()) + err := suite.MarkPodAsRunningIfUsingEnvtest(ctx, k8sClient, podListWithRevision[i], hnp.GetClusterName()) if err != nil { suite.UsingClusterBy(hnp.GetClusterName(), fmt.Sprintf("Got error while marking pod %s as running: %v", podListWithRevision[i].Name, err)) } @@ -444,7 +443,7 @@ func podPendingCountByRevision(ctx context.Context, hnp *controllers.HumioNodePo clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, hnp.GetNamespace(), hnp.GetNodePoolLabels()) for nodeID, pod := range clusterPods { revision, _ := strconv.Atoi(pod.Annotations[controllers.PodRevisionAnnotation]) - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" { + if !helpers.UseEnvtest() { if pod.DeletionTimestamp == nil { for _, condition := range pod.Status.Conditions { if condition.Type == corev1.PodScheduled { @@ -456,7 +455,7 @@ func podPendingCountByRevision(ctx context.Context, hnp *controllers.HumioNodePo } } else { if nodeID+1 <= expectedPendingCount { - _ = markPodAsPending(ctx, k8sClient, nodeID, pod, hnp.GetClusterName()) + _ = markPodAsPendingIfUsingEnvtest(ctx, k8sClient, nodeID, pod, hnp.GetClusterName()) revisionToPendingCount[revision]++ } } @@ -484,7 +483,7 @@ func ensurePodsRollingRestart(ctx context.Context, hnp *controllers.HumioNodePoo for expectedReadyCount := 1; expectedReadyCount < hnp.GetNodeCount()+1; expectedReadyCount++ { Eventually(func() map[int]int { suite.UsingClusterBy(hnp.GetClusterName(), fmt.Sprintf("ensurePodsRollingRestart Ensuring replacement pods are ready one at a time expectedReadyCount=%d", expectedReadyCount)) - markPodsWithRevisionAsReady(ctx, hnp, expectedPodRevision, expectedReadyCount) + markPodsWithRevisionAsReadyIfUsingEnvTest(ctx, hnp, expectedPodRevision, expectedReadyCount) return podReadyCountByRevision(ctx, hnp, expectedPodRevision) }, testTimeout, suite.TestInterval).Should(HaveKeyWithValue(expectedPodRevision, expectedReadyCount)) } @@ -502,7 +501,7 @@ func ensurePodsGoPending(ctx context.Context, hnp *controllers.HumioNodePool, ex func ensurePodsTerminate(ctx context.Context, hnp *controllers.HumioNodePool, expectedPodRevision int) { suite.UsingClusterBy(hnp.GetClusterName(), "ensurePodsTerminate Ensuring all existing pods are terminated at the same time") Eventually(func() map[int]int { - markPodsWithRevisionAsReady(ctx, hnp, expectedPodRevision, 0) + markPodsWithRevisionAsReadyIfUsingEnvTest(ctx, hnp, expectedPodRevision, 0) numPodsReadyByRevision := podReadyCountByRevision(ctx, hnp, expectedPodRevision) suite.UsingClusterBy(hnp.GetClusterName(), fmt.Sprintf("podsReadyCountByRevision() = %#+v", numPodsReadyByRevision)) return numPodsReadyByRevision @@ -510,7 +509,7 @@ func ensurePodsTerminate(ctx context.Context, hnp *controllers.HumioNodePool, ex suite.UsingClusterBy(hnp.GetClusterName(), "ensurePodsTerminate Ensuring replacement pods are not ready at the same time") Eventually(func() map[int]int { - markPodsWithRevisionAsReady(ctx, hnp, expectedPodRevision, 0) + 
markPodsWithRevisionAsReadyIfUsingEnvTest(ctx, hnp, expectedPodRevision, 0) numPodsReadyByRevision := podReadyCountByRevision(ctx, hnp, expectedPodRevision) suite.UsingClusterBy(hnp.GetClusterName(), fmt.Sprintf("ensurePodsTerminate podsReadyCountByRevision() = %#+v", numPodsReadyByRevision)) return numPodsReadyByRevision @@ -523,7 +522,7 @@ func ensurePodsSimultaneousRestart(ctx context.Context, hnp *controllers.HumioNo suite.UsingClusterBy(hnp.GetClusterName(), "ensurePodsSimultaneousRestart Ensuring all pods come back up after terminating") Eventually(func() map[int]int { - markPodsWithRevisionAsReady(ctx, hnp, expectedPodRevision, hnp.GetNodeCount()) + markPodsWithRevisionAsReadyIfUsingEnvTest(ctx, hnp, expectedPodRevision, hnp.GetNodeCount()) numPodsReadyByRevision := podReadyCountByRevision(ctx, hnp, expectedPodRevision) suite.UsingClusterBy(hnp.GetClusterName(), fmt.Sprintf("ensurePodsSimultaneousRestart podsReadyCountByRevision() = %#+v", numPodsReadyByRevision)) return numPodsReadyByRevision diff --git a/controllers/suite/common.go b/controllers/suite/common.go index 991f0f1b..743eb3a6 100644 --- a/controllers/suite/common.go +++ b/controllers/suite/common.go @@ -53,14 +53,14 @@ func UsingClusterBy(cluster, text string, callbacks ...func()) { } } -func MarkPodsAsRunning(ctx context.Context, client client.Client, pods []corev1.Pod, clusterName string) error { - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" { +func MarkPodsAsRunningIfUsingEnvtest(ctx context.Context, client client.Client, pods []corev1.Pod, clusterName string) error { + if !helpers.UseEnvtest() { return nil } UsingClusterBy(clusterName, "Simulating Humio container starts up and is marked Ready") for _, pod := range pods { - err := MarkPodAsRunning(ctx, client, pod, clusterName) + err := MarkPodAsRunningIfUsingEnvtest(ctx, client, pod, clusterName) if err != nil { return err } @@ -68,8 +68,8 @@ func MarkPodsAsRunning(ctx context.Context, client client.Client, pods []corev1. 
return nil } -func MarkPodAsRunning(ctx context.Context, k8sClient client.Client, pod corev1.Pod, clusterName string) error { - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" { +func MarkPodAsRunningIfUsingEnvtest(ctx context.Context, k8sClient client.Client, pod corev1.Pod, clusterName string) error { + if !helpers.UseEnvtest() { return nil } @@ -81,6 +81,18 @@ func MarkPodAsRunning(ctx context.Context, k8sClient client.Client, pod corev1.P Status: corev1.ConditionTrue, }, } + pod.Status.InitContainerStatuses = []corev1.ContainerStatus{ + { + Name: controllers.InitContainerName, + Ready: true, + }, + } + pod.Status.ContainerStatuses = []corev1.ContainerStatus{ + { + Name: controllers.HumioContainerName, + Ready: true, + }, + } pod.Status.Phase = corev1.PodRunning return k8sClient.Status().Update(ctx, &pod) } @@ -226,7 +238,7 @@ func ConstructBasicNodeSpecForHumioCluster(key types.NamespacedName) humiov1alph }, } - if os.Getenv("DUMMY_LOGSCALE_IMAGE") != "true" { + if !helpers.UseDummyImage() { nodeSpec.SidecarContainers = []corev1.Container{ { Name: "wait-for-global-snapshot-on-disk", @@ -314,8 +326,8 @@ func CreateLicenseSecret(ctx context.Context, clusterKey types.NamespacedName, k licenseString := "eyJ0eXAiOiJKV1QiLCJhbGciOiJFUzUxMiJ9.eyJpc09lbSI6ZmFsc2UsImF1ZCI6Ikh1bWlvLWxpY2Vuc2UtY2hlY2siLCJzdWIiOiJIdW1pbyBFMkUgdGVzdHMiLCJ1aWQiOiJGUXNvWlM3Yk1PUldrbEtGIiwibWF4VXNlcnMiOjEwLCJhbGxvd1NBQVMiOnRydWUsIm1heENvcmVzIjoxLCJ2YWxpZFVudGlsIjoxNzQzMTY2ODAwLCJleHAiOjE3NzQ1OTMyOTcsImlzVHJpYWwiOmZhbHNlLCJpYXQiOjE2Nzk5ODUyOTcsIm1heEluZ2VzdEdiUGVyRGF5IjoxfQ.someinvalidsignature" // If we use a k8s that is not envtest, and we didn't specify we are using a dummy image, we require a valid license - if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" && os.Getenv("DUMMY_LOGSCALE_IMAGE") != "true" { - licenseString = os.Getenv("HUMIO_E2E_LICENSE") + if !helpers.UseEnvtest() && !helpers.UseDummyImage() { + licenseString = helpers.GetE2ELicenseFromEnvVar() } licenseSecret := corev1.Secret{ @@ -363,7 +375,7 @@ func CreateAndBootstrapCluster(ctx context.Context, k8sClient client.Client, hum } } - if os.Getenv("TEST_USE_EXISTING_CLUSTER") != "true" { + if helpers.UseEnvtest() { // Simulate sidecar creating the secret which contains the admin token used to authenticate with humio secretData := map[string][]byte{"token": []byte("")} adminTokenSecretName := fmt.Sprintf("%s-%s", key.Name, kubernetes.ServiceTokenSecretNameSuffix) @@ -458,7 +470,7 @@ func CreateAndBootstrapCluster(ctx context.Context, k8sClient client.Client, hum Eventually(func() []corev1.Pod { var clusterPods []corev1.Pod clusterPods, _ = kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster).GetPodLabels()) - _ = MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) return clusterPods }, testTimeout, TestInterval).Should(HaveLen(cluster.Spec.NodeCount)) @@ -466,7 +478,7 @@ func CreateAndBootstrapCluster(ctx context.Context, k8sClient client.Client, hum Eventually(func() []corev1.Pod { var clusterPods []corev1.Pod clusterPods, _ = kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioNodePool(&updatedHumioCluster, &cluster.Spec.NodePools[idx]).GetPodLabels()) - _ = MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name) + _ = MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name) return clusterPods }, testTimeout, 
}, testTimeout, TestInterval).Should(HaveLen(pool.NodeCount))
}
@@ -504,11 +516,11 @@ func CreateAndBootstrapCluster(ctx context.Context, k8sClient client.Client, hum
UsingClusterBy(key.Name, "Confirming cluster enters running state")
Eventually(func() string {
clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster).GetPodLabels())
- _ = MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name)
+ _ = MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name)
for idx := range cluster.Spec.NodePools {
clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioNodePool(&updatedHumioCluster, &cluster.Spec.NodePools[idx]).GetPodLabels())
- _ = MarkPodsAsRunning(ctx, k8sClient, clusterPods, key.Name)
+ _ = MarkPodsAsRunningIfUsingEnvtest(ctx, k8sClient, clusterPods, key.Name)
}
updatedHumioCluster = humiov1alpha1.HumioCluster{}
@@ -539,7 +551,7 @@ func CreateAndBootstrapCluster(ctx context.Context, k8sClient client.Client, hum
}, &corev1.Secret{})
}, testTimeout, TestInterval).Should(Succeed())
- if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" && os.Getenv("DUMMY_LOGSCALE_IMAGE") != "true" {
+ if !helpers.UseEnvtest() && !helpers.UseDummyImage() {
UsingClusterBy(key.Name, "Validating cluster nodes have ZONE configured correctly")
if updatedHumioCluster.Spec.DisableInitContainer {
Eventually(func() []string {
@@ -649,6 +661,34 @@ func CreateAndBootstrapCluster(ctx context.Context, k8sClient client.Client, hum
}, testTimeout, TestInterval).Should(HaveKeyWithValue(corev1.PodRunning, updatedHumioCluster.Spec.NodePools[idx].NodeCount))
}
+
+ Eventually(func() int {
+ numPodsReady := 0
+ clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioCluster(&updatedHumioCluster).GetPodLabels())
+ for _, pod := range clusterPods {
+ for _, containerStatus := range pod.Status.ContainerStatuses {
+ if containerStatus.Name == controllers.HumioContainerName && containerStatus.Ready {
+ numPodsReady++
+ }
+ }
+ }
+ return numPodsReady
+ }, testTimeout, TestInterval).Should(BeIdenticalTo(updatedHumioCluster.Spec.NodeCount))
+
+ for idx := range updatedHumioCluster.Spec.NodePools {
+ Eventually(func() int {
+ numPodsReady := 0
+ clusterPods, _ := kubernetes.ListPods(ctx, k8sClient, key.Namespace, controllers.NewHumioNodeManagerFromHumioNodePool(&updatedHumioCluster, &updatedHumioCluster.Spec.NodePools[idx]).GetPodLabels())
+ for _, pod := range clusterPods {
+ for _, containerStatus := range pod.Status.ContainerStatuses {
+ if containerStatus.Name == controllers.HumioContainerName && containerStatus.Ready {
+ numPodsReady++
+ }
+ }
+ }
+ return numPodsReady
+ }, testTimeout, TestInterval).Should(BeIdenticalTo(updatedHumioCluster.Spec.NodePools[idx].NodeCount))
+ }
}
func WaitForReconcileToSync(ctx context.Context, key types.NamespacedName, k8sClient client.Client, currentHumioCluster *humiov1alpha1.HumioCluster, testTimeout time.Duration) {
diff --git a/controllers/suite/resources/humioresources_controller_test.go b/controllers/suite/resources/humioresources_controller_test.go
index 9b9f2fe8..2b199a92 100644
--- a/controllers/suite/resources/humioresources_controller_test.go
+++ b/controllers/suite/resources/humioresources_controller_test.go
@@ -20,7 +20,6 @@ import (
"context"
"fmt"
"net/http"
- "os"
"github.com/humio/humio-operator/pkg/kubernetes"
@@ -98,9 +97,7 @@ var _ = Describe("Humio Resources Controllers", func() {
ingestTokenSecret)
}, testTimeout, suite.TestInterval).Should(Succeed())
- if os.Getenv("TEST_USE_EXISTING_CLUSTER") != "true" {
- Expect(string(ingestTokenSecret.Data["token"])).ToNot(BeEmpty())
- }
+ Expect(string(ingestTokenSecret.Data["token"])).ToNot(BeEmpty())
Expect(ingestTokenSecret.OwnerReferences).Should(HaveLen(1))
suite.UsingClusterBy(clusterKey.Name, "HumioIngestToken: Checking correct parser assigned to ingest token")
@@ -152,9 +149,7 @@ var _ = Describe("Humio Resources Controllers", func() {
ingestTokenSecret)
}, testTimeout, suite.TestInterval).Should(Succeed())
- if os.Getenv("TEST_USE_EXISTING_CLUSTER") != "true" {
- Expect(string(ingestTokenSecret.Data["token"])).ToNot(BeEmpty())
- }
+ Expect(string(ingestTokenSecret.Data["token"])).ToNot(BeEmpty())
suite.UsingClusterBy(clusterKey.Name, "HumioIngestToken: Successfully deleting it")
Expect(k8sClient.Delete(ctx, fetchedIngestToken)).To(Succeed())
@@ -223,9 +218,7 @@ var _ = Describe("Humio Resources Controllers", func() {
}, testTimeout, suite.TestInterval).Should(Succeed())
Expect(ingestTokenSecret.Labels).Should(HaveKeyWithValue("custom-label", "custom-value"))
- if os.Getenv("TEST_USE_EXISTING_CLUSTER") != "true" {
- Expect(string(ingestTokenSecret.Data["token"])).ToNot(BeEmpty())
- }
+ Expect(string(ingestTokenSecret.Data["token"])).ToNot(BeEmpty())
suite.UsingClusterBy(clusterKey.Name, "HumioIngestToken: Successfully deleting it")
Expect(k8sClient.Delete(ctx, fetchedIngestToken)).To(Succeed())
@@ -662,7 +655,7 @@ var _ = Describe("Humio Resources Controllers", func() {
Namespace: clusterKey.Namespace,
}
protocol := "http"
- if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" && helpers.UseCertManager() {
+ if !helpers.UseEnvtest() && helpers.UseCertManager() {
protocol = "https"
}
diff --git a/controllers/suite/resources/suite_test.go b/controllers/suite/resources/suite_test.go
index e7db65c1..a8e200b0 100644
--- a/controllers/suite/resources/suite_test.go
+++ b/controllers/suite/resources/suite_test.go
@@ -20,7 +20,6 @@ import (
"context"
"encoding/json"
"fmt"
- "os"
"path/filepath"
"strings"
"testing"
@@ -97,17 +96,17 @@ var _ = BeforeSuite(func() {
Namespace: fmt.Sprintf("e2e-resources-%d", GinkgoParallelProcess()),
}
- if os.Getenv("TEST_USE_EXISTING_CLUSTER") == "true" {
+ if !helpers.UseEnvtest() {
testTimeout = time.Second * 300
testEnv = &envtest.Environment{
UseExistingCluster: &useExistingCluster,
}
- if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" {
+ if helpers.UseDummyImage() {
humioClient = humio.NewMockClient()
} else {
humioClient = humio.NewClient(log, "")
By("Verifying we have a valid license, as tests will require starting up real LogScale containers")
- Expect(os.Getenv("HUMIO_E2E_LICENSE")).NotTo(BeEmpty())
+ Expect(helpers.GetE2ELicenseFromEnvVar()).NotTo(BeEmpty())
}
} else {
@@ -262,11 +261,6 @@ var _ = BeforeSuite(func() {
suite.UsingClusterBy(clusterKey.Name, fmt.Sprintf("HumioCluster: Creating shared test cluster in namespace %s", clusterKey.Namespace))
cluster = suite.ConstructBasicSingleNodeHumioCluster(clusterKey, true)
- if os.Getenv("DUMMY_LOGSCALE_IMAGE") != "true" {
- cluster.Spec.HumioNodeSpec.Image = "humio/humio-core:1.150.0"
- } else {
- cluster.Spec.HumioNodeSpec.Image = "humio/humio-core:dummy"
- }
suite.CreateAndBootstrapCluster(context.TODO(), k8sClient, humioClient, cluster, true, corev1alpha1.HumioClusterStateRunning, testTimeout)
sharedCluster, err = helpers.NewCluster(context.TODO(), k8sClient, clusterKey.Name, "", clusterKey.Namespace, helpers.UseCertManager(), true, false)
@@ -381,7 +375,7 @@ var _ = AfterSuite(func() {
})).To(Succeed())
}
- if testNamespace.ObjectMeta.Name != "" && os.Getenv("PRESERVE_KIND_CLUSTER") == "true" {
+ if testNamespace.ObjectMeta.Name != "" && !helpers.UseEnvtest() && helpers.PreserveKindCluster() {
By(fmt.Sprintf("Removing test namespace: %s", clusterKey.Namespace))
err := k8sClient.Delete(context.TODO(), &testNamespace)
Expect(err).ToNot(HaveOccurred())
diff --git a/controllers/versions/versions.go b/controllers/versions/versions.go
index e576ac5e..5c8e328e 100644
--- a/controllers/versions/versions.go
+++ b/controllers/versions/versions.go
@@ -1,13 +1,14 @@
package versions
import (
- "os"
"strings"
+
+ "github.com/humio/humio-operator/pkg/helpers"
)
const (
defaultHelperImageVersion = "humio/humio-operator-helper:8f5ef6c7e470226e77d985f36cf39be9a100afea"
- defaultHumioImageVersion = "humio/humio-core:1.142.3"
+ defaultHumioImageVersion = "humio/humio-core:1.153.1"
oldSupportedHumioVersion = "humio/humio-core:1.118.0"
upgradeJumpHumioVersion = "humio/humio-core:1.128.0"
@@ -28,70 +29,70 @@ const (
func DefaultHelperImageVersion() string {
version := []string{defaultHelperImageVersion}
- if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" {
+ if helpers.UseDummyImage() {
version = append(version, dummyImageSuffix)
}
return strings.Join(version, "")
}
func DefaultHumioImageVersion() string {
version := []string{defaultHumioImageVersion}
- if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" {
+ if helpers.UseDummyImage() {
version = append(version, dummyImageSuffix)
}
return strings.Join(version, "")
}
func OldSupportedHumioVersion() string {
version := []string{oldSupportedHumioVersion}
- if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" {
+ if helpers.UseDummyImage() {
version = append(version, dummyImageSuffix)
}
return strings.Join(version, "")
}
func UpgradeJumpHumioVersion() string {
version := []string{upgradeJumpHumioVersion}
- if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" {
+ if helpers.UseDummyImage() {
version = append(version, dummyImageSuffix)
}
return strings.Join(version, "")
}
func OldUnsupportedHumioVersion() string {
version := []string{oldUnsupportedHumioVersion}
- if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" {
+ if helpers.UseDummyImage() {
version = append(version, dummyImageSuffix)
}
return strings.Join(version, "")
}
func UpgradeHelperImageVersion() string {
version := []string{upgradeHelperImageVersion}
- if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" {
+ if helpers.UseDummyImage() {
version = append(version, dummyImageSuffix)
}
return strings.Join(version, "")
}
func UpgradePatchBestEffortOldVersion() string {
version := []string{upgradePatchBestEffortOldVersion}
- if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" {
+ if helpers.UseDummyImage() {
version = append(version, dummyImageSuffix)
}
return strings.Join(version, "")
}
func UpgradePatchBestEffortNewVersion() string {
version := []string{upgradePatchBestEffortNewVersion}
- if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" {
+ if helpers.UseDummyImage() {
version = append(version, dummyImageSuffix)
}
return strings.Join(version, "")
}
func UpgradeRollingBestEffortVersionJumpOldVersion() string {
version := []string{upgradeRollingBestEffortVersionJumpOldVersion}
- if os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" {
+ if helpers.UseDummyImage() {
version = append(version, dummyImageSuffix)
}
return strings.Join(version, "")
}
func UpgradeRollingBestEffortVersionJumpNewVersion() string {
version := []string{upgradeRollingBestEffortVersionJumpNewVersion}
os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true" { + if helpers.UseDummyImage() { version = append(version, dummyImageSuffix) } return strings.Join(version, "") diff --git a/images/helper/go.mod b/images/helper/go.mod index 3e3cb278..2cd62a1f 100644 --- a/images/helper/go.mod +++ b/images/helper/go.mod @@ -3,8 +3,6 @@ module github.com/humio/humio-operator/images/helper go 1.22 require ( - github.com/cli/shurcooL-graphql v0.0.4 - github.com/humio/cli v0.33.1-0.20240425153346-f278dc8465f3 k8s.io/api v0.29.5 k8s.io/apimachinery v0.29.5 k8s.io/client-go v0.29.5 @@ -30,7 +28,6 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect golang.org/x/net v0.24.0 // indirect golang.org/x/oauth2 v0.10.0 // indirect - golang.org/x/sync v0.5.0 // indirect golang.org/x/sys v0.19.0 // indirect golang.org/x/term v0.19.0 // indirect golang.org/x/text v0.14.0 // indirect diff --git a/images/helper/go.sum b/images/helper/go.sum index 36f7067a..17901a8b 100644 --- a/images/helper/go.sum +++ b/images/helper/go.sum @@ -1,5 +1,3 @@ -github.com/cli/shurcooL-graphql v0.0.4 h1:6MogPnQJLjKkaXPyGqPRXOI2qCsQdqNfUY1QSJu2GuY= -github.com/cli/shurcooL-graphql v0.0.4/go.mod h1:3waN4u02FiZivIV+p1y4d0Jo1jc6BViMA73C+sZo2fk= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= @@ -33,8 +31,6 @@ github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJY github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/humio/cli v0.33.1-0.20240425153346-f278dc8465f3 h1:9UVZdMFGt7FktPvRjJ58RQFHFSYIEfkcbCg4Xq8z9HM= -github.com/humio/cli v0.33.1-0.20240425153346-f278dc8465f3/go.mod h1:GGgOajbd4z5osw50k5+dXYrcSkj9nZssAWS4Lv77yc4= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= @@ -95,8 +91,6 @@ golang.org/x/oauth2 v0.10.0/go.mod h1:kTpgurOux7LqtuxjuyZa4Gj2gdezIt/jQtGnNFfypQ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE= -golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/images/helper/main.go b/images/helper/main.go index 791188e4..02aaf3a7 100644 --- a/images/helper/main.go +++ b/images/helper/main.go @@ -73,7 +73,7 @@ func initMode() { } else { zone, found := node.Labels[corev1.LabelZoneFailureDomainStable] if !found { - zone, _ = 
- zone, _ = node.Labels[corev1.LabelZoneFailureDomain]
+ zone = node.Labels[corev1.LabelZoneFailureDomain]
}
err := os.WriteFile(targetFile, []byte(zone), 0644) // #nosec G306
if err != nil {
diff --git a/pkg/helpers/helpers.go b/pkg/helpers/helpers.go
index 79dcbcac..e9f37624 100644
--- a/pkg/helpers/helpers.go
+++ b/pkg/helpers/helpers.go
@@ -59,12 +59,6 @@ func RemoveElement(list []string, s string) []string {
return list
}
-// UseCertManager returns whether the operator will use cert-manager
-func UseCertManager() bool {
- certmanagerEnabled, found := os.LookupEnv("USE_CERTMANAGER")
- return found && certmanagerEnabled == "true"
-}
-
// TLSEnabled returns whether a cluster should configure TLS or not
func TLSEnabled(hc *humiov1alpha1.HumioCluster) bool {
if hc.Spec.TLS == nil {
@@ -141,3 +135,39 @@ func GetWatchNamespace() (string, error) {
}
return ns, nil
}
+
+// UseCertManager returns whether the operator will use cert-manager
+func UseCertManager() bool {
+ return !UseEnvtest() && os.Getenv("USE_CERTMANAGER") == "true"
+}
+
+// GetDefaultHumioCoreImageFromEnvVar returns the user-defined default image for humio-core containers
+func GetDefaultHumioCoreImageFromEnvVar() string {
+ return os.Getenv("HUMIO_OPERATOR_DEFAULT_HUMIO_CORE_IMAGE")
+}
+
+// GetDefaultHumioHelperImageFromEnvVar returns the user-defined default image for helper containers
+func GetDefaultHumioHelperImageFromEnvVar() string {
+ return os.Getenv("HUMIO_OPERATOR_DEFAULT_HUMIO_HELPER_IMAGE")
+}
+
+// UseEnvtest returns whether the Kubernetes API is provided by envtest
+func UseEnvtest() bool {
+ return os.Getenv("TEST_USING_ENVTEST") == "true"
+}
+
+// UseDummyImage returns whether we are using a dummy image replacement instead of real container images
+func UseDummyImage() bool {
+ return os.Getenv("DUMMY_LOGSCALE_IMAGE") == "true"
+}
+
+// GetE2ELicenseFromEnvVar returns the E2E license set as an environment variable
+func GetE2ELicenseFromEnvVar() string {
+ return os.Getenv("HUMIO_E2E_LICENSE")
+}
+
+// PreserveKindCluster returns true if the intention is to not delete kind cluster after test execution.
+// This is to allow reruns of tests to be performed where resources can be reused.
+func PreserveKindCluster() bool {
+ return os.Getenv("PRESERVE_KIND_CLUSTER") == "true"
+}
diff --git a/pkg/kubernetes/nodes.go b/pkg/kubernetes/nodes.go
index 0decc988..57041c9c 100644
--- a/pkg/kubernetes/nodes.go
+++ b/pkg/kubernetes/nodes.go
@@ -16,3 +16,24 @@ func GetNode(ctx context.Context, c client.Client, nodeName string) (*corev1.Nod
}, &node)
return &node, err
}
+
+var nodeNameToZoneName = map[string]string{}
+
+func GetZoneForNodeName(ctx context.Context, c client.Client, nodeName string) (string, error) {
+ zone, inCache := nodeNameToZoneName[nodeName]
+ if inCache {
+ return zone, nil
+ }
+
+ node, err := GetNode(ctx, c, nodeName)
+ if err != nil {
+ return "", nil
+ }
+ zone, found := node.Labels[corev1.LabelZoneFailureDomainStable]
+ if !found {
+ zone = node.Labels[corev1.LabelZoneFailureDomain]
+ }
+
+ nodeNameToZoneName[nodeName] = zone
+ return zone, nil
+}
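Note (illustrative sketch, not part of the change set above): the new kubernetes.GetZoneForNodeName helper reads the node's topology.kubernetes.io/zone label, falls back to the deprecated failure-domain label, and caches the result per node name. A caller that wants to act on pods one availability zone at a time could group them as sketched below; the package name example and the function podsByZone are hypothetical, while the GetZoneForNodeName signature is taken from the diff.

package example

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"

	"github.com/humio/humio-operator/pkg/kubernetes"
)

// podsByZone groups pods by the availability zone of the node they are scheduled on.
// Pods on nodes without a zone label end up under the empty-string key.
func podsByZone(ctx context.Context, c client.Client, pods []corev1.Pod) (map[string][]corev1.Pod, error) {
	grouped := map[string][]corev1.Pod{}
	for _, pod := range pods {
		if pod.Spec.NodeName == "" {
			continue // not scheduled yet, so there is no zone to look up
		}
		zone, err := kubernetes.GetZoneForNodeName(ctx, c, pod.Spec.NodeName)
		if err != nil {
			return nil, err
		}
		grouped[zone] = append(grouped[zone], pod)
	}
	return grouped, nil
}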