From 68cce2ae4b2607376ea247f85cf06e50380df89a Mon Sep 17 00:00:00 2001 From: Roman Zavodskikh Date: Tue, 30 Apr 2024 19:32:22 +0200 Subject: [PATCH] Add -min-drop-probability cmdline option for the PHC (#3053) Signed-off-by: Roman Zavodskikh Co-authored-by: Roman Zavodskikh --- docs/operation/operation.md | 13 ++++---- metricsinit_test.go | 1 + proxy/healthy_endpoints.go | 2 +- proxy/healthy_endpoints_test.go | 1 + proxy/proxy.go | 19 +++++++++++- proxy/proxy_test.go | 55 +++++++++++++++++++++++++++++++++ routing/endpointregistry.go | 11 ++++--- skipper.go | 1 + 8 files changed, 91 insertions(+), 12 deletions(-) diff --git a/docs/operation/operation.md b/docs/operation/operation.md index 8969916091..8be9bc3c31 100644 --- a/docs/operation/operation.md +++ b/docs/operation/operation.md @@ -901,15 +901,15 @@ Passive Health Check(PHC). PHC works the following way: the entire uptime is divided in chunks of `period`, per every period Skipper calculates the total amount of requests and amount of requests failed per every endpoint. While next period is going on, the Skipper takes a look at previous period and if the amount of requests in the previous period is more than `min-requests` -for the given endpoints then Skipper will send reduced (the more `max-drop-probability` -and failed requests ratio in previous period are, the stronger reduction is) -amount of requests compared to amount sent without PHC. +and failed requests ratio is more than `min-drop-probability` for the given endpoints +then Skipper will send reduced (the more `max-drop-probability` and failed requests ratio +in previous period are, the stronger reduction is) amount of requests compared to amount sent without PHC. Having this, we expect less requests to fail because a lot of them would be sent to endpoints that seem to be healthy instead. To enable this feature, you need to provide `-passive-health-check` option having all forementioned parameters -(`period`, `min-requests`, `max-drop-probability`) defined, -for instance: `-passive-health-check=period=1s,min-requests=10,max-drop-probability=0.9`. +(`period`, `min-requests`, `min-drop-probability`, `max-drop-probability`) defined, +for instance: `-passive-health-check=period=1s,min-requests=10,min-drop-probability=0.05,max-drop-probability=0.9`. You need to define all parameters on your side, there are no defaults, and skipper will not run if PHC params are passed only partially. @@ -918,7 +918,8 @@ However, Skipper will run without this feature, if no `-passive-health-check` is The parameters of `-passive-health-check` option are: + `period=` - the duration of stats reset period + `min-requests=` - the minimum number of requests per `period` per backend endpoint required to activate PHC for this endpoint -+ `max-drop-probabilty=` - the maximum possible probability of unhealthy endpoint being not considered ++ `min-drop-probabilty=<0.0 <= p <= 1.0>` - the minimum possible probability of unhealthy endpoint being not considered while choosing the endpoint for the given request. The same value is in fact used as minimal failed requests ratio for PHC to be enabled for this endpoint ++ `max-drop-probabilty=<0.0 <= p <= 1.0>` - the maximum possible probability of unhealthy endpoint being not considered while choosing the endpoint for the given request ### Metrics diff --git a/metricsinit_test.go b/metricsinit_test.go index ff9740254e..c48172486f 100644 --- a/metricsinit_test.go +++ b/metricsinit_test.go @@ -57,6 +57,7 @@ func TestInitOrderAndDefault(t *testing.T) { "period": "1m", "min-requests": "10", "max-drop-probability": "0.9", + "min-drop-probability": "0.05", }, } diff --git a/proxy/healthy_endpoints.go b/proxy/healthy_endpoints.go index fbdfcbd773..8e172993bd 100644 --- a/proxy/healthy_endpoints.go +++ b/proxy/healthy_endpoints.go @@ -22,7 +22,7 @@ func (h *healthyEndpoints) filterHealthyEndpoints(ctx *context, endpoints []rout filtered := make([]routing.LBEndpoint, 0, len(endpoints)) for _, e := range endpoints { dropProbability := e.Metrics.HealthCheckDropProbability() - if dropProbability > 0.05 && p < dropProbability { + if p < dropProbability { ctx.Logger().Infof("Dropping endpoint %q due to passive health check: p=%0.2f, dropProbability=%0.2f", e.Host, p, dropProbability) metrics.IncCounter("passive-health-check.endpoints.dropped") diff --git a/proxy/healthy_endpoints_test.go b/proxy/healthy_endpoints_test.go index 0ce0682879..ad5a35da11 100644 --- a/proxy/healthy_endpoints_test.go +++ b/proxy/healthy_endpoints_test.go @@ -24,6 +24,7 @@ func defaultEndpointRegistry() *routing.EndpointRegistry { StatsResetPeriod: period, MinRequests: 10, MaxHealthCheckDropProbability: 1.0, + MinHealthCheckDropProbability: 0.01, }) } diff --git a/proxy/proxy.go b/proxy/proxy.go index dc61d23d14..2de0161d01 100644 --- a/proxy/proxy.go +++ b/proxy/proxy.go @@ -154,6 +154,11 @@ type PassiveHealthCheck struct { // potentially opt out the endpoint from the list of healthy endpoints MinRequests int64 + // The minimal ratio of failed requests in a single period to potentially opt out the endpoint + // from the list of healthy endpoints. This ratio is equal to the minimal non-zero probability of + // dropping endpoint out from load balancing for every specific request + MinDropProbability float64 + // The maximum probability of unhealthy endpoint to be dropped out from load balancing for every specific request MaxDropProbability float64 } @@ -186,6 +191,15 @@ func InitPassiveHealthChecker(o map[string]string) (bool, *PassiveHealthCheck, e return false, nil, fmt.Errorf("passive health check: invalid minRequests value: %s", value) } result.MinRequests = int64(minRequests) + case "min-drop-probability": + minDropProbability, err := strconv.ParseFloat(value, 64) + if err != nil { + return false, nil, fmt.Errorf("passive health check: invalid minDropProbability value: %s", value) + } + if minDropProbability < 0 || minDropProbability > 1 { + return false, nil, fmt.Errorf("passive health check: invalid minDropProbability value: %s", value) + } + result.MinDropProbability = minDropProbability case "max-drop-probability": maxDropProbability, err := strconv.ParseFloat(value, 64) if err != nil { @@ -202,9 +216,12 @@ func InitPassiveHealthChecker(o map[string]string) (bool, *PassiveHealthCheck, e keysInitialized[key] = struct{}{} } - if len(keysInitialized) != 3 { + if len(keysInitialized) != 4 { return false, nil, fmt.Errorf("passive health check: missing required parameters") } + if result.MinDropProbability >= result.MaxDropProbability { + return false, nil, fmt.Errorf("passive health check: minDropProbability should be less than maxDropProbability") + } return true, result, nil } diff --git a/proxy/proxy_test.go b/proxy/proxy_test.go index 0ba1fce3eb..ea3cc79463 100644 --- a/proxy/proxy_test.go +++ b/proxy/proxy_test.go @@ -2299,6 +2299,7 @@ func TestInitPassiveHealthChecker(t *testing.T) { "period": "somethingInvalid", "min-requests": "10", "max-drop-probability": "0.9", + "min-drop-probability": "0.05", }, expectedEnabled: false, expectedParams: nil, @@ -2309,12 +2310,14 @@ func TestInitPassiveHealthChecker(t *testing.T) { "period": "1m", "min-requests": "10", "max-drop-probability": "0.9", + "min-drop-probability": "0.05", }, expectedEnabled: true, expectedParams: &PassiveHealthCheck{ Period: 1 * time.Minute, MinRequests: 10, MaxDropProbability: 0.9, + MinDropProbability: 0.05, }, expectedError: nil, }, @@ -2323,6 +2326,7 @@ func TestInitPassiveHealthChecker(t *testing.T) { "period": "-1m", "min-requests": "10", "max-drop-probability": "0.9", + "min-drop-probability": "0.05", }, expectedEnabled: false, expectedParams: nil, @@ -2333,6 +2337,7 @@ func TestInitPassiveHealthChecker(t *testing.T) { "period": "1m", "min-requests": "somethingInvalid", "max-drop-probability": "0.9", + "min-drop-probability": "0.05", }, expectedEnabled: false, expectedParams: nil, @@ -2343,6 +2348,7 @@ func TestInitPassiveHealthChecker(t *testing.T) { "period": "1m", "min-requests": "-10", "max-drop-probability": "0.9", + "min-drop-probability": "0.05", }, expectedEnabled: false, expectedParams: nil, @@ -2353,6 +2359,7 @@ func TestInitPassiveHealthChecker(t *testing.T) { "period": "1m", "min-requests": "10", "max-drop-probability": "somethingInvalid", + "min-drop-probability": "0.05", }, expectedEnabled: false, expectedParams: nil, @@ -2363,6 +2370,7 @@ func TestInitPassiveHealthChecker(t *testing.T) { "period": "1m", "min-requests": "10", "max-drop-probability": "-0.1", + "min-drop-probability": "0.05", }, expectedEnabled: false, expectedParams: nil, @@ -2373,6 +2381,7 @@ func TestInitPassiveHealthChecker(t *testing.T) { "period": "1m", "min-requests": "10", "max-drop-probability": "3.1415", + "min-drop-probability": "0.05", }, expectedEnabled: false, expectedParams: nil, @@ -2383,6 +2392,51 @@ func TestInitPassiveHealthChecker(t *testing.T) { "period": "1m", "min-requests": "10", "max-drop-probability": "0.9", + "min-drop-probability": "somethingInvalid", + }, + expectedEnabled: false, + expectedParams: nil, + expectedError: fmt.Errorf("passive health check: invalid minDropProbability value: somethingInvalid"), + }, + { + inputArg: map[string]string{ + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "-0.1", + }, + expectedEnabled: false, + expectedParams: nil, + expectedError: fmt.Errorf("passive health check: invalid minDropProbability value: -0.1"), + }, + { + inputArg: map[string]string{ + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "3.1415", + }, + expectedEnabled: false, + expectedParams: nil, + expectedError: fmt.Errorf("passive health check: invalid minDropProbability value: 3.1415"), + }, + { + inputArg: map[string]string{ + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.05", + "min-drop-probability": "0.9", + }, + expectedEnabled: false, + expectedParams: nil, + expectedError: fmt.Errorf("passive health check: minDropProbability should be less than maxDropProbability"), + }, + { + inputArg: map[string]string{ + "period": "1m", + "min-requests": "10", + "max-drop-probability": "0.9", + "min-drop-probability": "0.05", "non-existing": "non-existing", }, expectedEnabled: false, @@ -2394,6 +2448,7 @@ func TestInitPassiveHealthChecker(t *testing.T) { "period": "1m", "min-requests": "10", /* forgot max-drop-probability */ + "min-drop-probability": "0.05", }, expectedEnabled: false, expectedParams: nil, diff --git a/routing/endpointregistry.go b/routing/endpointregistry.go index 539f7c8fc4..4609d7bc96 100644 --- a/routing/endpointregistry.go +++ b/routing/endpointregistry.go @@ -98,6 +98,7 @@ type EndpointRegistry struct { lastSeenTimeout time.Duration statsResetPeriod time.Duration minRequests int64 + minHealthCheckDropProbability float64 maxHealthCheckDropProbability float64 quit chan struct{} @@ -113,6 +114,7 @@ type RegistryOptions struct { PassiveHealthCheckEnabled bool StatsResetPeriod time.Duration MinRequests int64 + MinHealthCheckDropProbability float64 MaxHealthCheckDropProbability float64 } @@ -164,17 +166,17 @@ func (r *EndpointRegistry) updateStats() { e.totalFailedRoundTrips[nextSlot].Store(0) e.totalRequests[nextSlot].Store(0) + newDropProbability := 0.0 failed := e.totalFailedRoundTrips[curSlot].Load() requests := e.totalRequests[curSlot].Load() if requests > r.minRequests { failedRoundTripsRatio := float64(failed) / float64(requests) - if failedRoundTripsRatio > 0.0 { + if failedRoundTripsRatio > r.minHealthCheckDropProbability { log.Infof("Passive health check: marking %q as unhealthy due to failed round trips ratio: %0.2f", key, failedRoundTripsRatio) + newDropProbability = min(failedRoundTripsRatio, r.maxHealthCheckDropProbability) } - e.healthCheckDropProbability.Store(min(failedRoundTripsRatio, r.maxHealthCheckDropProbability)) - } else { - e.healthCheckDropProbability.Store(0.0) } + e.healthCheckDropProbability.Store(newDropProbability) e.curSlot.Store(nextSlot) return true @@ -201,6 +203,7 @@ func NewEndpointRegistry(o RegistryOptions) *EndpointRegistry { lastSeenTimeout: o.LastSeenTimeout, statsResetPeriod: o.StatsResetPeriod, minRequests: o.MinRequests, + minHealthCheckDropProbability: o.MinHealthCheckDropProbability, maxHealthCheckDropProbability: o.MaxHealthCheckDropProbability, quit: make(chan struct{}), diff --git a/skipper.go b/skipper.go index f8b6d30130..4bcf0336f1 100644 --- a/skipper.go +++ b/skipper.go @@ -1945,6 +1945,7 @@ func run(o Options, sig chan os.Signal, idleConnsCH chan struct{}) error { PassiveHealthCheckEnabled: passiveHealthCheckEnabled, StatsResetPeriod: passiveHealthCheck.Period, MinRequests: passiveHealthCheck.MinRequests, + MinHealthCheckDropProbability: passiveHealthCheck.MinDropProbability, MaxHealthCheckDropProbability: passiveHealthCheck.MaxDropProbability, }) ro := routing.Options{