From 9bfa788b0bdb242aad7e0c9be0958b9b398a7c85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20T=C3=B6lle?= Date: Wed, 24 Apr 2024 11:25:00 +0200 Subject: [PATCH] fix(hetzner): missing error return in scale up/down There is no Node Group/Autoscaling Group in Hetzner Cloud API, so the Hetzner provider implemented this by manually creating as many servers as needed. The current code did not return any of the errors that could have happened. Without any returned errors, cluster-autoscaler assumed that everything was fine with the Node Group. In cases where there is a temporary issue with one of the node groups (ie. Location is unavailable, no leftover capacity for the requested server type), cluster-autoscaler should consider this and try to scale up a different Node Group. This will automatically happen once we return an error, as cluster-autoscaler backs off from scaling Node Groups that have recently returned errors. --- .../hetzner/hetzner_node_group.go | 69 ++++++++++++++----- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go b/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go index b010dfbadc3e..166fc412f384 100644 --- a/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go +++ b/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go @@ -18,6 +18,7 @@ package hetzner import ( "context" + "errors" "fmt" "maps" "math/rand" @@ -91,7 +92,8 @@ func (n *hetznerNodeGroup) IncreaseSize(delta int) error { return fmt.Errorf("delta must be positive, have: %d", delta) } - targetSize := n.targetSize + delta + expectedDelta := delta + targetSize := n.targetSize + expectedDelta if targetSize > n.MaxSize() { return fmt.Errorf("size increase is too large. current: %d desired: %d max: %d", n.targetSize, targetSize, n.MaxSize()) } @@ -109,25 +111,43 @@ func (n *hetznerNodeGroup) IncreaseSize(delta int) error { return fmt.Errorf("server type %s not available in region %s", n.instanceType, n.region) } + defer func() { + // create new servers cache + if _, err := n.manager.cachedServers.servers(); err != nil { + klog.Errorf("failed to update servers cache: %v", err) + } + + // Update target size + n.resetTargetSize(expectedDelta) + }() + + // There is no "Server Group" in Hetzner Cloud, we need to create every + // server manually. This operation might fail for some of the servers + // because of quotas, rate limiting or server type availability. We need to + // collect the errors and inform cluster-autoscaler about this, so it can + // try other node groups if configured. waitGroup := sync.WaitGroup{} + errsCh := make(chan error, delta) for i := 0; i < delta; i++ { waitGroup.Add(1) go func() { defer waitGroup.Done() err := createServer(n) if err != nil { - targetSize-- - klog.Errorf("failed to create error: %v", err) + expectedDelta-- + errsCh <- err } }() } waitGroup.Wait() + close(errsCh) - n.targetSize = targetSize - - // create new servers cache - if _, err := n.manager.cachedServers.servers(); err != nil { - klog.Errorf("failed to get servers: %v", err) + errs := make([]error, 0, delta) + for err = range errsCh { + errs = append(errs, err) + } + if len(errs) > 0 { + return fmt.Errorf("failed to create all servers: %w", errors.Join(errs...)) } return nil @@ -146,13 +166,23 @@ func (n *hetznerNodeGroup) DeleteNodes(nodes []*apiv1.Node) error { n.clusterUpdateMutex.Lock() defer n.clusterUpdateMutex.Unlock() - targetSize := n.targetSize - len(nodes) + expectedDelta := len(nodes) + targetSize := n.targetSize - expectedDelta if targetSize < n.MinSize() { return fmt.Errorf("size decrease is too large. current: %d desired: %d min: %d", n.targetSize, targetSize, n.MinSize()) } - waitGroup := sync.WaitGroup{} + defer func() { + // create new servers cache + if _, err := n.manager.cachedServers.servers(); err != nil { + klog.Errorf("failed to update servers cache: %v", err) + } + + n.resetTargetSize(-expectedDelta) + }() + waitGroup := sync.WaitGroup{} + errsCh := make(chan error, len(nodes)) for _, node := range nodes { waitGroup.Add(1) go func(node *apiv1.Node) { @@ -160,20 +190,23 @@ func (n *hetznerNodeGroup) DeleteNodes(nodes []*apiv1.Node) error { err := n.manager.deleteByNode(node) if err != nil { - klog.Errorf("failed to delete server ID %s error: %v", node.Name, err) + expectedDelta-- + errsCh <- fmt.Errorf("failed to delete server for node %q: %w", node.Name, err) } waitGroup.Done() }(node) } waitGroup.Wait() + close(errsCh) - // create new servers cache - if _, err := n.manager.cachedServers.servers(); err != nil { - klog.Errorf("failed to get servers: %v", err) + errs := make([]error, 0, len(nodes)) + for err := range errsCh { + errs = append(errs, err) + } + if len(errs) > 0 { + return fmt.Errorf("failed to delete all nodes: %w", errors.Join(errs...)) } - - n.resetTargetSize(-len(nodes)) return nil } @@ -561,8 +594,8 @@ func waitForServerAction(m *hetznerManager, serverName string, action *hcloud.Ac func (n *hetznerNodeGroup) resetTargetSize(expectedDelta int) { servers, err := n.manager.allServers(n.id) if err != nil { - klog.Errorf("failed to set node pool %s size, using delta %d error: %v", n.id, expectedDelta, err) - n.targetSize = n.targetSize - expectedDelta + klog.Warningf("failed to set node pool %s size, using delta %d error: %v", n.id, expectedDelta, err) + n.targetSize = n.targetSize + expectedDelta } else { klog.Infof("Set node group %s size from %d to %d, expected delta %d", n.id, n.targetSize, len(servers), expectedDelta) n.targetSize = len(servers)