Skip to content

Commit

Permalink
feat(chart): update operator configuration resource via helm chart
Browse files Browse the repository at this point in the history
If an operator configuration resource has been requested via Helm
values, and there is already an existing operator configuration resource
in the cluster, update it according to the values provided via Helm
instead of ignoring the Helm values.
  • Loading branch information
basti1302 committed Nov 9, 2024
1 parent 09d9087 commit 5cd1f70
Show file tree
Hide file tree
Showing 6 changed files with 270 additions and 174 deletions.
2 changes: 1 addition & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,7 @@ func createOperatorConfiguration(
OperatorNamespace: envVars.operatorNamespace,
NamePrefix: envVars.oTelCollectorNamePrefix,
}
if err := handler.CreateOperatorConfigurationResource(ctx, operatorConfiguration, logger); err != nil {
if err := handler.CreateOrUpdateOperatorConfigurationResource(ctx, operatorConfiguration, logger); err != nil {
logger.Error(err, "Failed to create the requested Dash0 operator configuration resource.")
}
}
Expand Down
70 changes: 40 additions & 30 deletions internal/startup/auto_operator_configuration_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,39 +45,41 @@ type AutoOperatorConfigurationResourceHandler struct {

const (
operatorConfigurationAutoResourceName = "dash0-operator-configuration-auto-resource"

alreadyExistsMessage = "The operator is configured to deploy an operator configuration resource at startup, but there is already " +
"an operator configuration resource in the cluster. Hence no action is necessary. (This is not an error.)"
)

func (r *AutoOperatorConfigurationResourceHandler) CreateOperatorConfigurationResource(
func (r *AutoOperatorConfigurationResourceHandler) CreateOrUpdateOperatorConfigurationResource(
ctx context.Context,
operatorConfiguration *OperatorConfigurationValues,
logger *logr.Logger,
) error {

// Fast path: check early on if there is already an operator configuration resource, skip all other steps if so.
// We will repeat this check immediately before creating the operator configuration resource, so if the check fails
// with an error we will ignore that error for now.
allOperatorConfigurationResources := &dash0v1alpha1.Dash0OperatorConfigurationList{}
if err := r.List(ctx, allOperatorConfigurationResources); err == nil {
if len(allOperatorConfigurationResources.Items) >= 1 {
logger.Info(alreadyExistsMessage)
return nil
}
}

if err := r.validateOperatorConfiguration(operatorConfiguration); err != nil {
return err
}

go func() {
// There is a validation webhook for operator configuration resources. Thus, before we can create an operator
// configuration resource, we need to wait for the webhook endpoint to become available.
// There is a validation webhook for operator configuration resources. Thus, before we can create or update an
// operator configuration resource, we need to wait for the webhook endpoint to become available.
if err := r.waitForWebserviceEndpoint(ctx, logger); err != nil {
logger.Error(err, "failed to create the Dash0 operator configuration resource")
}
if err := r.createOperatorConfigurationResourceWithRetry(ctx, operatorConfiguration, logger); err != nil {

// Even if we wait for the validation webhook to become available (see above), we sometimes get a couple of
// retry attempts that fail with
// create/update operator configuration resource at startup failed in attempt x/6, will be retried.
// [...]
// failed calling webhook \"validate-operator-configuration.dash0.com\":
// failed to call webhook: Post \"https://dash0-operator-webhook-service.dash0-system.svc:443/v1alpha1/validate/operator-configuration?timeout=5s\":
// tls: failed to verify certificate: x509: certificate signed by unknown authority
// (possibly because of \"crypto/rsa: verification error\" while trying to verify candidate authority certificate
// \"dash0-operator-ca\")"
//
// This self-heals after a few attempts. Still, the log entries might be confusing. To lower the probability of
// this happening, we wait for a few seconds before creating/updating the operator configuration resource.
if !r.bypassWebhookCheck {
time.Sleep(10 * time.Second)
}

if err := r.createOrUpdateOperatorConfigurationResourceWithRetry(ctx, operatorConfiguration, logger); err != nil {
logger.Error(err, "failed to create the Dash0 operator configuration resource")
}
}()
Expand Down Expand Up @@ -158,13 +160,13 @@ func (r *AutoOperatorConfigurationResourceHandler) checkWebServiceEndpoint(
return fmt.Errorf("the webservice endpoint is not available yet")
}

func (r *AutoOperatorConfigurationResourceHandler) createOperatorConfigurationResourceWithRetry(
func (r *AutoOperatorConfigurationResourceHandler) createOrUpdateOperatorConfigurationResourceWithRetry(
ctx context.Context,
operatorConfiguration *OperatorConfigurationValues,
logger *logr.Logger,
) error {
return util.RetryWithCustomBackoff(
"create operator configuration resource at startup",
"create/update operator configuration resource at startup",
func() error {
return r.createOperatorConfigurationResourceOnce(ctx, operatorConfiguration, logger)
},
Expand All @@ -184,15 +186,6 @@ func (r *AutoOperatorConfigurationResourceHandler) createOperatorConfigurationRe
operatorConfiguration *OperatorConfigurationValues,
logger *logr.Logger,
) error {
allOperatorConfigurationResources := &dash0v1alpha1.Dash0OperatorConfigurationList{}
if err := r.List(ctx, allOperatorConfigurationResources); err != nil {
return fmt.Errorf("failed to list all Dash0 operator configuration resources: %w", err)
}
if len(allOperatorConfigurationResources.Items) >= 1 {
logger.Info(alreadyExistsMessage)
return nil
}

authorization := dash0v1alpha1.Authorization{}
if operatorConfiguration.Token != "" {
authorization.Token = &operatorConfiguration.Token
Expand Down Expand Up @@ -238,6 +231,23 @@ func (r *AutoOperatorConfigurationResourceHandler) createOperatorConfigurationRe
KubernetesInfrastructureMetricsCollectionEnabled: ptr.To(operatorConfiguration.KubernetesInfrastructureMetricsCollectionEnabled),
},
}

allOperatorConfigurationResources := &dash0v1alpha1.Dash0OperatorConfigurationList{}
if err := r.List(ctx, allOperatorConfigurationResources); err != nil {
return fmt.Errorf("failed to list all Dash0 operator configuration resources: %w", err)
}
if len(allOperatorConfigurationResources.Items) >= 1 {
// The validation webhook for the operator configuration resource guarantees that there is only ever one
// resource per cluster. Thus, we can arbitrarily update the first item in the list.
existingOperatorConfigurationResource := allOperatorConfigurationResources.Items[0]
existingOperatorConfigurationResource.Spec = operatorConfigurationResource.Spec
if err := r.Update(ctx, &existingOperatorConfigurationResource); err != nil {
return fmt.Errorf("failed to update the Dash0 operator configuration resource: %w", err)
}
logger.Info("the Dash0 operator configuration resource has been updated")
return nil
}

if err := r.Create(ctx, &operatorConfigurationResource); err != nil {
return fmt.Errorf("failed to create the Dash0 operator configuration resource: %w", err)
}
Expand Down
109 changes: 84 additions & 25 deletions internal/startup/auto_operator_configuration_handler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,24 +46,8 @@ var _ = Describe("Create an operator configuration resource at startup", Ordered
DeleteAllOperatorConfigurationResources(ctx, k8sClient)
})

It("should not do anything if there already is an operator configuration resource in the cluster", func() {
CreateDefaultOperatorConfigurationResource(ctx, k8sClient)
// verify that there is only one resource
list := v1alpha1.Dash0OperatorConfigurationList{}
Expect(k8sClient.List(ctx, &list)).To(Succeed())
Expect(list.Items).To(HaveLen(1))
Expect(list.Items[0].Name).To(Equal(OperatorConfigurationResourceName))

Expect(handler.CreateOperatorConfigurationResource(ctx, &OperatorConfigurationValues{}, &logger)).To(Succeed())
// verify that there is _still_ only one resource, and that its name is not the one that would be automatically
// created by AutoOperatorConfigurationResourceHandler.
Expect(k8sClient.List(ctx, &list)).To(Succeed())
Expect(list.Items).To(HaveLen(1))
Expect(list.Items[0].Name).To(Equal(OperatorConfigurationResourceName))
})

It("should fail validation if no endpoint has been provided", func() {
Expect(handler.CreateOperatorConfigurationResource(ctx, &OperatorConfigurationValues{
Expect(handler.CreateOrUpdateOperatorConfigurationResource(ctx, &OperatorConfigurationValues{
Token: AuthorizationTokenTest,
}, &logger)).To(
MatchError(
Expand All @@ -72,7 +56,7 @@ var _ = Describe("Create an operator configuration resource at startup", Ordered
})

It("should fail validation if no token and no secret reference have been provided", func() {
Expect(handler.CreateOperatorConfigurationResource(ctx, &OperatorConfigurationValues{
Expect(handler.CreateOrUpdateOperatorConfigurationResource(ctx, &OperatorConfigurationValues{
Endpoint: AuthorizationTokenTest,
}, &logger)).To(
MatchError(
Expand All @@ -82,7 +66,7 @@ var _ = Describe("Create an operator configuration resource at startup", Ordered
})

It("should fail validation if no token and no secret reference key have been provided", func() {
Expect(handler.CreateOperatorConfigurationResource(ctx, &OperatorConfigurationValues{
Expect(handler.CreateOrUpdateOperatorConfigurationResource(ctx, &OperatorConfigurationValues{
Endpoint: AuthorizationTokenTest,
SecretRef: SecretRef{
Name: "test-secret",
Expand All @@ -94,9 +78,9 @@ var _ = Describe("Create an operator configuration resource at startup", Ordered
"been provided")))
})

It("should create an operator configuration resource with a token", func() {
It("should create a new operator configuration resource with a token", func() {
Expect(
handler.CreateOperatorConfigurationResource(ctx, &operatorConfigurationValuesWithToken, &logger),
handler.CreateOrUpdateOperatorConfigurationResource(ctx, &operatorConfigurationValuesWithToken, &logger),
).To(Succeed())

Eventually(func(g Gomega) {
Expand All @@ -122,9 +106,9 @@ var _ = Describe("Create an operator configuration resource at startup", Ordered
}, 5*time.Second, 100*time.Millisecond).Should(Succeed())
})

It("should create an operator configuration resource with a secret reference", func() {
It("should create a new operator configuration resource with a secret reference", func() {
Expect(
handler.CreateOperatorConfigurationResource(ctx, &operatorConfigurationValuesWithSecretRef, &logger),
handler.CreateOrUpdateOperatorConfigurationResource(ctx, &operatorConfigurationValuesWithSecretRef, &logger),
).To(Succeed())

Eventually(func(g Gomega) {
Expand Down Expand Up @@ -153,7 +137,7 @@ var _ = Describe("Create an operator configuration resource at startup", Ordered

It("should set the API endpoint", func() {
Expect(
handler.CreateOperatorConfigurationResource(ctx, &OperatorConfigurationValues{
handler.CreateOrUpdateOperatorConfigurationResource(ctx, &OperatorConfigurationValues{
Endpoint: EndpointDash0Test,
Token: AuthorizationTokenTest,
ApiEndpoint: ApiEndpointTest,
Expand All @@ -177,7 +161,7 @@ var _ = Describe("Create an operator configuration resource at startup", Ordered

It("should set a custom dataset", func() {
Expect(
handler.CreateOperatorConfigurationResource(ctx, &OperatorConfigurationValues{
handler.CreateOrUpdateOperatorConfigurationResource(ctx, &OperatorConfigurationValues{
Endpoint: EndpointDash0Test,
Token: AuthorizationTokenTest,
Dataset: "custom",
Expand All @@ -198,4 +182,79 @@ var _ = Describe("Create an operator configuration resource at startup", Ordered
g.Expect(dash0Export.Dataset).To(Equal("custom"))
}, 5*time.Second, 100*time.Millisecond).Should(Succeed())
})

It("should update the existing resource if there already is an auto-operator-configuration-resource", func() {
Expect(
handler.CreateOrUpdateOperatorConfigurationResource(ctx, &OperatorConfigurationValues{
Endpoint: "endpoint-1.dash0.com:4317",
Token: AuthorizationTokenTest,
ApiEndpoint: "https://api-1.dash0.com",
Dataset: "dataset-1",
SelfMonitoringEnabled: false,
KubernetesInfrastructureMetricsCollectionEnabled: true,
}, &logger),
).To(Succeed())

Eventually(func(g Gomega) {
list := v1alpha1.Dash0OperatorConfigurationList{}
g.Expect(k8sClient.List(ctx, &list)).To(Succeed())
g.Expect(list.Items).To(HaveLen(1))
operatorConfiguration := list.Items[0]
g.Expect(operatorConfiguration.Name).To(Equal(operatorConfigurationAutoResourceName))
g.Expect(operatorConfiguration.Annotations).To(HaveLen(1))
g.Expect(operatorConfiguration.Annotations["argocd.argoproj.io/sync-options"]).To(Equal("Prune=false"))
export := operatorConfiguration.Spec.Export
g.Expect(export).ToNot(BeNil())
dash0Export := export.Dash0
g.Expect(dash0Export).ToNot(BeNil())
g.Expect(export.Grpc).To(BeNil())
g.Expect(export.Http).To(BeNil())
g.Expect(dash0Export.Endpoint).To(Equal("endpoint-1.dash0.com:4317"))
g.Expect(dash0Export.Authorization.Token).ToNot(BeNil())
g.Expect(*dash0Export.Authorization.Token).To(Equal(AuthorizationTokenTest))
g.Expect(dash0Export.Authorization.SecretRef).To(BeNil())
g.Expect(dash0Export.ApiEndpoint).To(Equal("https://api-1.dash0.com"))
g.Expect(dash0Export.Dataset).To(Equal("dataset-1"))
g.Expect(*operatorConfiguration.Spec.SelfMonitoring.Enabled).To(BeFalse())
g.Expect(*operatorConfiguration.Spec.KubernetesInfrastructureMetricsCollectionEnabled).To(BeTrue())
}, 5*time.Second, 100*time.Millisecond).Should(Succeed())

// Now call the handler a second time, simulating a new startup of the operator manager process, with different
// operator-configuration-xxx flags
Expect(handler.CreateOrUpdateOperatorConfigurationResource(ctx,
&OperatorConfigurationValues{
Endpoint: "endpoint-2.dash0.com:4317",
SecretRef: secretRef,
ApiEndpoint: "https://api-2.dash0.com",
Dataset: "dataset-2",
SelfMonitoringEnabled: true,
KubernetesInfrastructureMetricsCollectionEnabled: false,
}, &logger)).To(Succeed())

// verify that there is _still_ only one resource, and that its settings have been updated.
Eventually(func(g Gomega) {
list := v1alpha1.Dash0OperatorConfigurationList{}
g.Expect(k8sClient.List(ctx, &list)).To(Succeed())
g.Expect(list.Items).To(HaveLen(1))
operatorConfiguration := list.Items[0]
g.Expect(operatorConfiguration.Name).To(Equal(operatorConfigurationAutoResourceName))
g.Expect(operatorConfiguration.Annotations).To(HaveLen(1))
g.Expect(operatorConfiguration.Annotations["argocd.argoproj.io/sync-options"]).To(Equal("Prune=false"))
export := operatorConfiguration.Spec.Export
g.Expect(export).ToNot(BeNil())
dash0Export := export.Dash0
g.Expect(dash0Export).ToNot(BeNil())
g.Expect(export.Grpc).To(BeNil())
g.Expect(export.Http).To(BeNil())
g.Expect(dash0Export.Endpoint).To(Equal("endpoint-2.dash0.com:4317"))
g.Expect(dash0Export.Authorization.Token).To(BeNil())
g.Expect(dash0Export.Authorization.SecretRef).ToNot(BeNil())
g.Expect(dash0Export.Authorization.SecretRef.Name).To(Equal("test-secret"))
g.Expect(dash0Export.Authorization.SecretRef.Key).To(Equal("test-key"))
g.Expect(dash0Export.ApiEndpoint).To(Equal("https://api-2.dash0.com"))
g.Expect(dash0Export.Dataset).To(Equal("dataset-2"))
g.Expect(*operatorConfiguration.Spec.SelfMonitoring.Enabled).To(BeTrue())
g.Expect(*operatorConfiguration.Spec.KubernetesInfrastructureMetricsCollectionEnabled).To(BeFalse())
}, 5*time.Second, 100*time.Millisecond).Should(Succeed())
})
})
43 changes: 0 additions & 43 deletions internal/webhooks/monitoring_validation_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
package webhooks

import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"

Expand Down Expand Up @@ -64,48 +63,6 @@ var _ = Describe("The validation webhook for the monitoring resource", func() {
"Dash0 operator configuration resources are available.")))
})

It("should reject monitoring resources without export if more than one operator configuration resource is available", func() {
opConfRes1, err := CreateOperatorConfigurationResource(
ctx,
k8sClient,
&dash0v1alpha1.Dash0OperatorConfiguration{
ObjectMeta: metav1.ObjectMeta{
Name: "dash0-operator-test-resource-1",
},
Spec: OperatorConfigurationResourceDefaultSpec,
},
)
Expect(err).ToNot(HaveOccurred())
opConfRes2, err := CreateOperatorConfigurationResource(
ctx,
k8sClient,
&dash0v1alpha1.Dash0OperatorConfiguration{
ObjectMeta: metav1.ObjectMeta{
Name: "dash0-operator-test-resource-2",
},
Spec: OperatorConfigurationResourceDefaultSpec,
},
)
Expect(err).ToNot(HaveOccurred())

opConfRes1.EnsureResourceIsMarkedAsAvailable()
Expect(k8sClient.Status().Update(ctx, opConfRes1)).To(Succeed())
opConfRes2.EnsureResourceIsMarkedAsAvailable()
Expect(k8sClient.Status().Update(ctx, opConfRes2)).To(Succeed())

_, err = CreateMonitoringResourceWithPotentialError(ctx, k8sClient, &dash0v1alpha1.Dash0Monitoring{
ObjectMeta: MonitoringResourceDefaultObjectMeta,
Spec: dash0v1alpha1.Dash0MonitoringSpec{
InstrumentWorkloads: dash0v1alpha1.All,
},
})

Expect(err).To(MatchError(ContainSubstring("admission webhook \"validate-monitoring.dash0.com\" denied " +
"the request: The provided Dash0 monitoring resource does not have an export configuration, and " +
"there is more than one available Dash0 operator configuration, remove all but one Dash0 operator " +
"configuration resource.")))
})

It("should reject monitoring resources without export if there is an operator configuration resource, but it has no export either", func() {
operatorConfigurationResource := CreateOperatorConfigurationResourceWithSpec(
ctx,
Expand Down
19 changes: 18 additions & 1 deletion internal/webhooks/operator_configuration_validation_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ package webhooks

import (
"context"
"fmt"
"net/http"

admissionv1 "k8s.io/api/admission/v1"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
Expand All @@ -33,7 +35,7 @@ func (h *OperatorConfigurationValidationWebhookHandler) SetupWebhookWithManager(
return nil
}

func (h *OperatorConfigurationValidationWebhookHandler) Handle(_ context.Context, request admission.Request) admission.Response {
func (h *OperatorConfigurationValidationWebhookHandler) Handle(ctx context.Context, request admission.Request) admission.Response {
operatorConfigurationResource := &dash0v1alpha1.Dash0OperatorConfiguration{}
if _, _, err := decoder.Decode(request.Object.Raw, nil, operatorConfigurationResource); err != nil {
return admission.Errored(http.StatusBadRequest, err)
Expand All @@ -47,5 +49,20 @@ func (h *OperatorConfigurationValidationWebhookHandler) Handle(_ context.Context
"monitoring telemetry.")

}

if request.Operation == admissionv1.Create {
allOperatorConfigurationResources := &dash0v1alpha1.Dash0OperatorConfigurationList{}
if err := h.Client.List(ctx, allOperatorConfigurationResources); err != nil {
return admission.Errored(http.StatusInternalServerError, fmt.Errorf("failed to list all Dash0 operator configuration resources: %w", err))
}
if len(allOperatorConfigurationResources.Items) > 0 {
return admission.Denied(
fmt.Sprintf("At least one Dash0 operator configuration resource (%s) already exists in this cluster. "+
"Only one operator configuration resource is allowed per cluster.",
allOperatorConfigurationResources.Items[0].Name,
))
}
}

return admission.Allowed("")
}
Loading

0 comments on commit 5cd1f70

Please sign in to comment.