From d4ffdad779a83af0b7f5fac3c495fa6e6116f606 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Stefaniak?= Date: Mon, 12 Aug 2024 13:48:31 +0200 Subject: [PATCH] feat: cpu resource limits removed during boost (#59) --- README.md | 22 ++++++++- cmd/main.go | 23 ++------- internal/config/config.go | 8 +++- internal/config/config_test.go | 7 ++- internal/config/env_provider.go | 15 +++++- internal/config/env_provider_test.go | 8 ++++ internal/webhook/podcpuboost_webhook.go | 15 ++++-- internal/webhook/podcpuboost_webhook_test.go | 50 ++++++++++++++++---- 8 files changed, 109 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 9fec8eb..a1edcef 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ Note: this is not an officially supported Google product. * [[Boost resources] fixed target](#boost-resources-fixed-target) * [[Boost duration] fixed time](#boost-duration-fixed-time) * [[Boost duration] POD condition](#boost-duration-pod-condition) +* [Configuration](#configuration) * [License](#license) ## Description @@ -35,7 +36,9 @@ The Kube Startup CPU Boost leverages [In-place Resource Resize for Kubernetes Po feature introduced in Kubernetes 1.27. It allows to revert workload's CPU resource requests and limits back to their original values without the need to recreate the Pods. -The increase of resources is achieved by Mutating Admission Webhook. +The increase of resources is achieved by Mutating Admission Webhook. By default, the webhook also +removes CPU resource limits if present. The original resource values are set by operator after given +period of time or when the POD condition is met. ## Installation @@ -203,6 +206,23 @@ Define the POD condition, the resource boost effect will last until the conditio status: "True" ``` +## Configuration + +Kube Startup CPU Boost operator can be configured with environmental variables. + +| Variable | Type | Default | Description | +| --- | --- | --- | --- | +| `POD_NAMESPACE` | `string` | `kube-startup-cpu-boost-system` | Kube Startup CPU Boost operator namespace | +| `MGR_CHECK_INTERVAL` | `int` | `5` | Duration in seconds between boost manager checks for time based boost duration policy | +| `LEADER_ELECTION` | `bool` | `false` | Enables leader election for controller manager | +| `METRICS_PROBE_BIND_ADDR` | `string` | `:8080` | Address the metrics endpoint binds to | +| `HEALTH_PROBE_BIND_ADDR` | `string` | `:8081` | Address the health probe endpoint binds to | +| `SECURE_METRICS` | `bool` | `false` | Determines if the metrics endpoint is served securely | +| `ZAP_LOG_LEVEL` | `int` | `0` | Log level for ZAP logger | +| `ZAP_DEVELOPMENT` | `bool` | `false` | Enables development mode for ZAP logger | +| `HTTP2` | `bool` | `false` | Determines if the HTTP/2 protocol is used for webhook and metrics servers| +| `REMOVE_LIMITS` | `bool` | `true` | Enables operator to remove container CPU limits during the boost time | + ## License [Apache License 2.0](LICENSE) diff --git a/cmd/main.go b/cmd/main.go index 3865749..ef1aec8 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -87,17 +87,6 @@ func main() { HealthProbeBindAddress: cfg.HealthProbeBindAddr, LeaderElection: cfg.LeaderElection, LeaderElectionID: leaderElectionID, - // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily - // when the Manager ends. This requires the binary to immediately end when the - // Manager is stopped, otherwise, this setting is unsafe. Setting this significantly - // speeds up voluntary leader transitions as the new leader don't have to wait - // LeaseDuration time first. - // - // In the default scaffold provided, the program ends immediately after - // the manager stops, so would be fine to enable this option. However, - // if you are doing or is intended to do any operation such as perform cleanups - // after the manager stops then its usage might be unsafe. - // LeaderElectionReleaseOnCancel: true, }) if err != nil { setupLog.Error(err, "unable to start manager") @@ -111,7 +100,7 @@ func main() { } boostMgr := boost.NewManager(mgr.GetClient()) - go setupControllers(mgr, boostMgr, certsReady) + go setupControllers(mgr, boostMgr, cfg, certsReady) if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { setupLog.Error(err, "unable to set up health check") @@ -131,7 +120,7 @@ func main() { } } -func setupControllers(mgr ctrl.Manager, boostMgr boost.Manager, certsReady chan struct{}) { +func setupControllers(mgr ctrl.Manager, boostMgr boost.Manager, cfg *config.Config, certsReady chan struct{}) { setupLog.Info("Waiting for certificate generation to complete") <-certsReady setupLog.Info("Certificate generation has completed") @@ -140,7 +129,7 @@ func setupControllers(mgr ctrl.Manager, boostMgr boost.Manager, certsReady chan setupLog.Error(err, "Unable to create webhook", "webhook", failedWebhook) os.Exit(1) } - cpuBoostWebHook := boostWebhook.NewPodCPUBoostWebHook(boostMgr, scheme) + cpuBoostWebHook := boostWebhook.NewPodCPUBoostWebHook(boostMgr, scheme, cfg.RemoveLimits) mgr.GetWebhookServer().Register("/mutate-v1-pod", cpuBoostWebHook) boostCtrl := &controller.StartupCPUBoostReconciler{ Client: mgr.GetClient(), @@ -153,11 +142,5 @@ func setupControllers(mgr ctrl.Manager, boostMgr boost.Manager, certsReady chan setupLog.Error(err, "unable to create controller", "controller", "StartupCPUBoost") os.Exit(1) } - /* - if err = (&autoscalingv1alpha1.StartupCPUBoost{}).SetupWebhookWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create webhook", "webhook", "StartupCPUBoost") - os.Exit(1) - } - */ //+kubebuilder:scaffold:builder } diff --git a/internal/config/config.go b/internal/config/config.go index e2098b2..aef6bb0 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -25,6 +25,7 @@ const ( ZapLogLevelDefault = 0 // zapcore.InfoLevel ZapDevelopmentDefault = false HTTP2Default = false + RemoveLimitsDefault = true ) // ConfigProvider provides the Kube Startup CPU Boost configuration @@ -42,9 +43,9 @@ type Config struct { // LeaderElection enables leader election for controller manager // Enabling this will ensure there is only one active controller manager LeaderElection bool - // MetricsProbeBindAddr is the address the metric endpoint binds to + // MetricsProbeBindAddr is the address the metrics endpoint binds to MetricsProbeBindAddr string - // HeathProbeBindAddr is the address the probe endpoint binds to + // HeathProbeBindAddr is the address the health probe endpoint binds to HealthProbeBindAddr string // SecureMetrics determines if the metrics endpoint is served securely SecureMetrics bool @@ -54,6 +55,8 @@ type Config struct { ZapDevelopment bool // HTTP2 determines if the HTTP/2 protocol is used for webhook and metrics servers HTTP2 bool + // RemoveLimits determines if CPU resource limits should be removed during boost + RemoveLimits bool } // LoadDefaults loads the default configuration values @@ -67,4 +70,5 @@ func (c *Config) LoadDefaults() { c.ZapLogLevel = ZapLogLevelDefault c.ZapDevelopment = ZapDevelopmentDefault c.HTTP2 = HTTP2Default + c.RemoveLimits = RemoveLimitsDefault } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 003490a..e9dfa41 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -50,11 +50,14 @@ var _ = Describe("Config", func() { It("has valid ZAP log level", func() { Expect(cfg.ZapLogLevel).To(Equal(config.ZapLogLevelDefault)) }) - It("has valid ZAP development ", func() { + It("has valid ZAP development", func() { Expect(cfg.ZapDevelopment).To(Equal(config.ZapDevelopmentDefault)) }) - It("has valid HTTP2 ", func() { + It("has valid HTTP2", func() { Expect(cfg.HTTP2).To(Equal(config.HTTP2Default)) }) + It("has valid RemoveLimits", func() { + Expect(cfg.RemoveLimits).To(Equal(config.RemoveLimitsDefault)) + }) }) }) diff --git a/internal/config/env_provider.go b/internal/config/env_provider.go index 670b344..e481cff 100644 --- a/internal/config/env_provider.go +++ b/internal/config/env_provider.go @@ -30,6 +30,7 @@ const ( ZapLogLevelEnvVar = "ZAP_LOG_LEVEL" ZapDevelopmentEnvVar = "ZAP_DEVELOPMENT" HTTP2EnvVar = "HTTP2" + RemoveLimitsEnvVar = "REMOVE_LIMITS" ) type LookupEnvFunc func(key string) (string, bool) @@ -57,6 +58,7 @@ func (p *EnvConfigProvider) LoadConfig() (*Config, error) { errs = p.loadZapLogLevel(&config, errs) errs = p.loadZapDevelopment(&config, errs) errs = p.loadHTTP2(&config, errs) + errs = p.loadRemoveLimits(&config, errs) var err error if len(errs) > 0 { err = errors.Join(errs...) @@ -142,7 +144,18 @@ func (p *EnvConfigProvider) loadHTTP2(config *Config, curErrs []error) (errs []e boolVal, err := strconv.ParseBool(v) config.HTTP2 = boolVal if err != nil { - errs = append(curErrs, fmt.Errorf("%s value is not a bool: %s", LeaderElectionEnvVar, err)) + errs = append(curErrs, fmt.Errorf("%s value is not a bool: %s", HTTP2EnvVar, err)) + } + } + return +} + +func (p *EnvConfigProvider) loadRemoveLimits(config *Config, curErrs []error) (errs []error) { + if v, ok := p.lookupFunc(RemoveLimitsEnvVar); ok { + boolVal, err := strconv.ParseBool(v) + config.RemoveLimits = boolVal + if err != nil { + errs = append(curErrs, fmt.Errorf("%s value is not a bool: %s", RemoveLimitsEnvVar, err)) } } return diff --git a/internal/config/env_provider_test.go b/internal/config/env_provider_test.go index 9b22ec7..1353ea2 100644 --- a/internal/config/env_provider_test.go +++ b/internal/config/env_provider_test.go @@ -131,5 +131,13 @@ var _ = Describe("EnvProvider", func() { Expect(cfg.HTTP2).To(BeTrue()) }) }) + When("removeLimits variable is set", func() { + BeforeEach(func() { + lookupFuncMap[config.RemoveLimitsEnvVar] = "false" + }) + It("has valid remove limits", func() { + Expect(cfg.RemoveLimits).To(BeFalse()) + }) + }) }) }) diff --git a/internal/webhook/podcpuboost_webhook.go b/internal/webhook/podcpuboost_webhook.go index ad397ae..c83b365 100644 --- a/internal/webhook/podcpuboost_webhook.go +++ b/internal/webhook/podcpuboost_webhook.go @@ -32,15 +32,17 @@ import ( // +kubebuilder:webhook:path=/mutate-v1-pod,mutating=true,failurePolicy=ignore,sideEffects=None,timeoutSeconds=2,groups="",resources=pods,verbs=create,versions=v1,name=cpuboost.autoscaling.x-k8s.io,admissionReviewVersions=v1 type podCPUBoostHandler struct { - decoder admission.Decoder - manager boost.Manager + decoder admission.Decoder + manager boost.Manager + removeLimits bool } -func NewPodCPUBoostWebHook(mgr boost.Manager, scheme *runtime.Scheme) *webhook.Admission { +func NewPodCPUBoostWebHook(mgr boost.Manager, scheme *runtime.Scheme, removeLimits bool) *webhook.Admission { return &webhook.Admission{ Handler: &podCPUBoostHandler{ - manager: mgr, - decoder: admission.NewDecoder(scheme), + manager: mgr, + decoder: admission.NewDecoder(scheme), + removeLimits: removeLimits, }, } } @@ -89,6 +91,9 @@ func (h *podCPUBoostHandler) boostContainerResources(ctx context.Context, b boos "newCpuRequests", resources.Requests.Cpu().String(), "newCpuLimits", resources.Limits.Cpu().String(), ) + if h.removeLimits { + delete(resources.Limits, corev1.ResourceCPU) + } pod.Spec.Containers[i].Resources = *resources log.Info("pod resources increased") } diff --git a/internal/webhook/podcpuboost_webhook_test.go b/internal/webhook/podcpuboost_webhook_test.go index 93828b0..7649582 100644 --- a/internal/webhook/podcpuboost_webhook_test.go +++ b/internal/webhook/podcpuboost_webhook_test.go @@ -41,11 +41,12 @@ import ( var _ = Describe("Pod CPU Boost Webhook", func() { Describe("Handles admission requests", func() { var ( - mockCtrl *gomock.Controller - manager *mock.MockManager - managerCall *gomock.Call - pod *corev1.Pod - response webhook.AdmissionResponse + mockCtrl *gomock.Controller + manager *mock.MockManager + managerCall *gomock.Call + pod *corev1.Pod + response webhook.AdmissionResponse + removeLimits bool ) BeforeEach(func() { pod = podTemplate.DeepCopy() @@ -72,7 +73,7 @@ var _ = Describe("Pod CPU Boost Webhook", func() { }, }, } - hook := bwebhook.NewPodCPUBoostWebHook(manager, scheme.Scheme) + hook := bwebhook.NewPodCPUBoostWebHook(manager, scheme.Scheme, removeLimits) response = hook.Handle(context.TODO(), admissionReq) }) When("there is no matching Startup CPU Boost", func() { @@ -130,6 +131,7 @@ var _ = Describe("Pod CPU Boost Webhook", func() { resPolicyCallOne = boost.EXPECT().ResourcePolicy(gomock.Eq(containerOneName)).Return(resPolicy, true) resPolicyCallTwo = boost.EXPECT().ResourcePolicy(gomock.Eq(containerTwoName)).Return(nil, false) managerCall.Return(boost, true) + removeLimits = true }) It("retrieves resource policy for containers", func() { resPolicyCallOne.Times(1) @@ -162,10 +164,28 @@ var _ = Describe("Pod CPU Boost Webhook", func() { patch := containerResourcePatch(pod, resPolicy, "requests", 0) Expect(response.Patches).To(ContainElement(patch)) }) - It("returns admission with container-one limits patch", func() { - patch := containerResourcePatch(pod, resPolicy, "limits", 0) + It("returns admission with container-one remove limits patch", func() { + patch := containerRemoveRequirementPatch("limits", 0) Expect(response.Patches).To(ContainElement(patch)) }) + When("container has memory limits set", func() { + BeforeEach(func() { + pod.Spec.Containers[0].Resources.Limits[corev1.ResourceMemory] = apiResource.MustParse("100Mi") + }) + It("returns admission with container-one remove CPU limits patch", func() { + patch := containerRemoveCPURequirementPatch("limits", 0) + Expect(response.Patches).To(ContainElement(patch)) + }) + }) + When("removeLimits is not set", func() { + BeforeEach(func() { + removeLimits = false + }) + It("returns admission with container-one limits patch", func() { + patch := containerResourcePatch(pod, resPolicy, "limits", 0) + Expect(response.Patches).To(ContainElement(patch)) + }) + }) When("container has no request and no limits set", func() { BeforeEach(func() { pod.Spec.Containers[0].Resources.Requests = nil @@ -294,3 +314,17 @@ func containerResourcePatch(pod *corev1.Pod, policy resource.ContainerPolicy, re Value: newQuantity.String(), } } + +func containerRemoveCPURequirementPatch(requirement string, containerIdx int) jsonpatch.Operation { + return jsonpatch.Operation{ + Operation: "remove", + Path: fmt.Sprintf("/spec/containers/%d/resources/%s/cpu", containerIdx, requirement), + } +} + +func containerRemoveRequirementPatch(requirement string, containerIdx int) jsonpatch.Operation { + return jsonpatch.Operation{ + Operation: "remove", + Path: fmt.Sprintf("/spec/containers/%d/resources/%s", containerIdx, requirement), + } +}