Skip to content

Commit

Permalink
fix(hetzner): missing error return in scale up/down
Browse files Browse the repository at this point in the history
There is no Node Group/Autoscaling Group in Hetzner Cloud API, so the
Hetzner provider implemented this by manually creating as many servers
as needed.

The current code did not return any of the errors that could have
happened. Without any returned errors, cluster-autoscaler assumed that
everything was fine with the Node Group.

In cases where there is a temporary issue with one of the node groups
(ie. Location is unavailable, no leftover capacity for the requested
server type), cluster-autoscaler should consider this and try to scale
up a different Node Group. This will automatically happen once we return
an error, as cluster-autoscaler backs off from scaling Node Groups that
have recently returned errors.
  • Loading branch information
apricote committed Apr 24, 2024
1 parent 70c0ce9 commit 9bfa788
Showing 1 changed file with 51 additions and 18 deletions.
69 changes: 51 additions & 18 deletions cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package hetzner

import (
"context"
"errors"
"fmt"
"maps"
"math/rand"
Expand Down Expand Up @@ -91,7 +92,8 @@ func (n *hetznerNodeGroup) IncreaseSize(delta int) error {
return fmt.Errorf("delta must be positive, have: %d", delta)
}

targetSize := n.targetSize + delta
expectedDelta := delta
targetSize := n.targetSize + expectedDelta
if targetSize > n.MaxSize() {
return fmt.Errorf("size increase is too large. current: %d desired: %d max: %d", n.targetSize, targetSize, n.MaxSize())
}
Expand All @@ -109,25 +111,43 @@ func (n *hetznerNodeGroup) IncreaseSize(delta int) error {
return fmt.Errorf("server type %s not available in region %s", n.instanceType, n.region)
}

defer func() {
// create new servers cache
if _, err := n.manager.cachedServers.servers(); err != nil {
klog.Errorf("failed to update servers cache: %v", err)
}

// Update target size
n.resetTargetSize(expectedDelta)
}()

// There is no "Server Group" in Hetzner Cloud, we need to create every
// server manually. This operation might fail for some of the servers
// because of quotas, rate limiting or server type availability. We need to
// collect the errors and inform cluster-autoscaler about this, so it can
// try other node groups if configured.
waitGroup := sync.WaitGroup{}
errsCh := make(chan error, delta)
for i := 0; i < delta; i++ {
waitGroup.Add(1)
go func() {
defer waitGroup.Done()
err := createServer(n)
if err != nil {
targetSize--
klog.Errorf("failed to create error: %v", err)
expectedDelta--
errsCh <- err
}
}()
}
waitGroup.Wait()
close(errsCh)

n.targetSize = targetSize

// create new servers cache
if _, err := n.manager.cachedServers.servers(); err != nil {
klog.Errorf("failed to get servers: %v", err)
errs := make([]error, 0, delta)
for err = range errsCh {
errs = append(errs, err)
}
if len(errs) > 0 {
return fmt.Errorf("failed to create all servers: %w", errors.Join(errs...))
}

return nil
Expand All @@ -146,34 +166,47 @@ func (n *hetznerNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
n.clusterUpdateMutex.Lock()
defer n.clusterUpdateMutex.Unlock()

targetSize := n.targetSize - len(nodes)
expectedDelta := len(nodes)
targetSize := n.targetSize - expectedDelta
if targetSize < n.MinSize() {
return fmt.Errorf("size decrease is too large. current: %d desired: %d min: %d", n.targetSize, targetSize, n.MinSize())
}

waitGroup := sync.WaitGroup{}
defer func() {
// create new servers cache
if _, err := n.manager.cachedServers.servers(); err != nil {
klog.Errorf("failed to update servers cache: %v", err)
}

n.resetTargetSize(-expectedDelta)
}()

waitGroup := sync.WaitGroup{}
errsCh := make(chan error, len(nodes))
for _, node := range nodes {
waitGroup.Add(1)
go func(node *apiv1.Node) {
klog.Infof("Evicting server %s", node.Name)

err := n.manager.deleteByNode(node)
if err != nil {
klog.Errorf("failed to delete server ID %s error: %v", node.Name, err)
expectedDelta--
errsCh <- fmt.Errorf("failed to delete server for node %q: %w", node.Name, err)
}

waitGroup.Done()
}(node)
}
waitGroup.Wait()
close(errsCh)

// create new servers cache
if _, err := n.manager.cachedServers.servers(); err != nil {
klog.Errorf("failed to get servers: %v", err)
errs := make([]error, 0, len(nodes))
for err := range errsCh {
errs = append(errs, err)
}
if len(errs) > 0 {
return fmt.Errorf("failed to delete all nodes: %w", errors.Join(errs...))
}

n.resetTargetSize(-len(nodes))

return nil
}
Expand Down Expand Up @@ -561,8 +594,8 @@ func waitForServerAction(m *hetznerManager, serverName string, action *hcloud.Ac
func (n *hetznerNodeGroup) resetTargetSize(expectedDelta int) {
servers, err := n.manager.allServers(n.id)
if err != nil {
klog.Errorf("failed to set node pool %s size, using delta %d error: %v", n.id, expectedDelta, err)
n.targetSize = n.targetSize - expectedDelta
klog.Warningf("failed to set node pool %s size, using delta %d error: %v", n.id, expectedDelta, err)
n.targetSize = n.targetSize + expectedDelta
} else {
klog.Infof("Set node group %s size from %d to %d, expected delta %d", n.id, n.targetSize, len(servers), expectedDelta)
n.targetSize = len(servers)
Expand Down

0 comments on commit 9bfa788

Please sign in to comment.