From c5a2784202177e10323ebaa689a151c05822831c Mon Sep 17 00:00:00 2001 From: Simone Basso Date: Mon, 25 Sep 2023 23:36:37 +0200 Subject: [PATCH] fix(enginenetx): stabilize happy eyeballs algorithm (#1301) * Use 1s as the base delay (which leads to simpler computations) * Acknowledge that with very large indexes we still need to produce reasonable positive values, while the current algorithm breaks for indexes large than `63` * Acknowledge that, after incrementing exponentially for a while, it makes sense to continue in a flat fashion, where we increment linearly, while the current algorithm was aiming to always return 30s, which means several attempts would actually run in parallel * Acknowledge that we should use the same zero for all timings rather than having a goroutine dependent zero * Acknowledge that for now we don't need to mock `time.Now` Part of https://github.com/ooni/probe/issues/2531 --- internal/enginenetx/happyeyeballs.go | 25 ++++++++------ internal/enginenetx/happyeyeballs_test.go | 30 ++++++++--------- internal/enginenetx/httpsdialercore.go | 35 ++++++++++++++------ internal/enginenetx/httpsdialernull.go | 9 +---- internal/enginenetx/httpsdialerstats.go | 15 +++------ internal/enginenetx/network_internal_test.go | 3 -- 6 files changed, 59 insertions(+), 58 deletions(-) diff --git a/internal/enginenetx/happyeyeballs.go b/internal/enginenetx/happyeyeballs.go index 3f2e962b1a..75259b9aa5 100644 --- a/internal/enginenetx/happyeyeballs.go +++ b/internal/enginenetx/happyeyeballs.go @@ -2,29 +2,32 @@ package enginenetx import "time" -// happyEyeballsDelay implements an happy-eyeballs like algorithm with the -// given base delay and with the given index. The index is the attempt number +// happyEyeballsDelay implements an happy-eyeballs like algorithm with a +// base delay of 1 second and the given index. The index is the attempt number // and the first attempt should have zero as its index. // -// The algorithm should emit 0 as the first delay, the baseDelay as the +// The standard Go library uses a 300ms delay for connecting. Because a TCP +// connect is one round trip and the TLS handshake is two round trips (roughly), +// we use 1 second as the base delay increment here. +// +// The algorithm should emit 0 as the first delay, the base delay as the // second delay, and then it should double the base delay at each attempt, -// until we reach the 30 seconds, after which the delay is constant. +// until we reach the 8 seconds, after which the delay increments +// linearly spacing each subsequent attempts 8 seconds in the future. // // By doubling the base delay, we account for the case where there are // actual issues inside the network. By using this algorithm, we are still // able to overlap and pack more dialing attempts overall. -func happyEyeballsDelay(baseDelay time.Duration, idx int) time.Duration { - const cutoff = 30 * time.Second +func happyEyeballsDelay(idx int) time.Duration { + const baseDelay = time.Second switch { case idx <= 0: return 0 case idx == 1: return baseDelay + case idx <= 4: + return baseDelay << (idx - 1) default: - delay := baseDelay << (idx - 1) - if delay > cutoff { - delay = cutoff - } - return delay + return baseDelay << 3 * (time.Duration(idx) - 3) } } diff --git a/internal/enginenetx/happyeyeballs_test.go b/internal/enginenetx/happyeyeballs_test.go index 965fc4fe16..b1f7c4af56 100644 --- a/internal/enginenetx/happyeyeballs_test.go +++ b/internal/enginenetx/happyeyeballs_test.go @@ -12,30 +12,28 @@ func TestHappyEyeballsDelay(t *testing.T) { expect time.Duration } - const delay = 900 * time.Millisecond - cases := []testcase{ {-1, 0}, // make sure we gracefully handle negative numbers (i.e., we don't crash) {0, 0}, - {1, delay}, - {2, delay * 2}, - {3, delay * 4}, - {4, delay * 8}, - {5, delay * 16}, - {6, delay * 32}, - {7, 30 * time.Second}, - {8, 30 * time.Second}, - {9, 30 * time.Second}, - {10, 30 * time.Second}, + {1, time.Second}, + {2, 2 * time.Second}, + {3, 4 * time.Second}, + {4, 8 * time.Second}, + {5, 2 * 8 * time.Second}, + {6, 3 * 8 * time.Second}, + {7, 4 * 8 * time.Second}, + {8, 5 * 8 * time.Second}, + {9, 6 * 8 * time.Second}, + {10, 7 * 8 * time.Second}, } for _, tc := range cases { - t.Run(fmt.Sprintf("delay=%v tc.idx=%v", delay, tc.idx), func(t *testing.T) { - got := happyEyeballsDelay(delay, tc.idx) + t.Run(fmt.Sprintf("tc.idx=%v", tc.idx), func(t *testing.T) { + got := happyEyeballsDelay(tc.idx) if got != tc.expect { - t.Fatalf("with delay=%v tc.idx=%v we got %v but expected %v", delay, tc.idx, got, tc.expect) + t.Fatalf("with tc.idx=%v we got %v but expected %v", tc.idx, got, tc.expect) } - t.Logf("with delay=%v tc.idx=%v: got %v", delay, tc.idx, got) + t.Logf("with tc.idx=%v: got %v", tc.idx, got) }) } } diff --git a/internal/enginenetx/httpsdialercore.go b/internal/enginenetx/httpsdialercore.go index c8f92dc627..a525f2b740 100644 --- a/internal/enginenetx/httpsdialercore.go +++ b/internal/enginenetx/httpsdialercore.go @@ -191,8 +191,9 @@ func (hd *HTTPSDialer) DialTLSContext(ctx context.Context, network string, endpo collector := make(chan *httpsDialerErrorOrConn) joiner := make(chan any) const parallelism = 16 + t0 := time.Now() for idx := 0; idx < parallelism; idx++ { - go hd.worker(ctx, joiner, emitter, collector) + go hd.worker(ctx, joiner, emitter, t0, collector) } // wait until all goroutines have joined @@ -242,8 +243,13 @@ func httpsDialerReduceResult(connv []model.TLSConn, errorv []error) (model.TLSCo // worker attempts to establish a TLS connection using and emits a single // [*httpsDialerErrorOrConn] for each tactic. -func (hd *HTTPSDialer) worker(ctx context.Context, joiner chan<- any, - reader <-chan *HTTPSDialerTactic, writer chan<- *httpsDialerErrorOrConn) { +func (hd *HTTPSDialer) worker( + ctx context.Context, + joiner chan<- any, + reader <-chan *HTTPSDialerTactic, + t0 time.Time, + writer chan<- *httpsDialerErrorOrConn, +) { // let the parent know that we terminated defer func() { joiner <- true }() @@ -253,7 +259,7 @@ func (hd *HTTPSDialer) worker(ctx context.Context, joiner chan<- any, Logger: hd.logger, } - conn, err := hd.dialTLS(ctx, prefixLogger, tactic) + conn, err := hd.dialTLS(ctx, prefixLogger, t0, tactic) writer <- &httpsDialerErrorOrConn{Conn: conn, Err: err} } @@ -261,9 +267,13 @@ func (hd *HTTPSDialer) worker(ctx context.Context, joiner chan<- any, // dialTLS performs the actual TLS dial. func (hd *HTTPSDialer) dialTLS( - ctx context.Context, logger model.Logger, tactic *HTTPSDialerTactic) (model.TLSConn, error) { + ctx context.Context, + logger model.Logger, + t0 time.Time, + tactic *HTTPSDialerTactic, +) (model.TLSConn, error) { // wait for the tactic to be ready to run - if err := httpsDialerTacticWaitReady(ctx, tactic); err != nil { + if err := httpsDialerTacticWaitReady(ctx, t0, tactic); err != nil { return nil, err } @@ -330,13 +340,18 @@ func (hd *HTTPSDialer) dialTLS( // httpsDialerWaitReady waits for the given delay to expire or the context to be canceled. If the // delay is zero or negative, we immediately return nil. We also return nil when the delay expires. We // return the context error if the context expires. -func httpsDialerTacticWaitReady(ctx context.Context, tactic *HTTPSDialerTactic) error { - delay := tactic.InitialDelay - if delay <= 0 { +func httpsDialerTacticWaitReady( + ctx context.Context, + t0 time.Time, + tactic *HTTPSDialerTactic, +) error { + deadline := t0.Add(tactic.InitialDelay) + delta := time.Until(deadline) + if delta <= 0 { return nil } - timer := time.NewTimer(delay) + timer := time.NewTimer(delta) defer timer.Stop() select { diff --git a/internal/enginenetx/httpsdialernull.go b/internal/enginenetx/httpsdialernull.go index fdaed6dffa..f9d88e4c29 100644 --- a/internal/enginenetx/httpsdialernull.go +++ b/internal/enginenetx/httpsdialernull.go @@ -3,7 +3,6 @@ package enginenetx import ( "context" "net" - "time" "github.com/ooni/probe-cli/v3/internal/model" "github.com/ooni/probe-cli/v3/internal/netxlite" @@ -29,12 +28,6 @@ type HTTPSDialerNullPolicy struct { var _ HTTPSDialerPolicy = &HTTPSDialerNullPolicy{} -// httpsDialerHappyEyeballsDelay is the delay after which we should start a new TCP -// connect and TLS handshake using another tactic. The standard Go library uses a 300ms -// delay for connecting. Because a TCP connect is one round trip and the TLS handshake -// is two round trips (roughly), we multiply this value by three. -const httpsDialerHappyEyeballsDelay = 900 * time.Millisecond - // LookupTactics implements HTTPSDialerPolicy. func (p *HTTPSDialerNullPolicy) LookupTactics( ctx context.Context, domain, port string) <-chan *HTTPSDialerTactic { @@ -57,7 +50,7 @@ func (p *HTTPSDialerNullPolicy) LookupTactics( for idx, addr := range addrs { tactic := &HTTPSDialerTactic{ Endpoint: net.JoinHostPort(addr, port), - InitialDelay: happyEyeballsDelay(httpsDialerHappyEyeballsDelay, idx), + InitialDelay: happyEyeballsDelay(idx), SNI: domain, VerifyHostname: domain, } diff --git a/internal/enginenetx/httpsdialerstats.go b/internal/enginenetx/httpsdialerstats.go index 27a2142dee..cbf16ad2d4 100644 --- a/internal/enginenetx/httpsdialerstats.go +++ b/internal/enginenetx/httpsdialerstats.go @@ -118,10 +118,6 @@ func NewHTTPSDialerStatsRootContainer() *HTTPSDialerStatsRootContainer { // The zero value of this structure is not ready to use; please, use the // [NewHTTPSDialerStatsManager] factory to create a new instance. type HTTPSDialerStatsManager struct { - // TimeNow is a field that allows you to override how we obtain the - // current time; modify this field BEFORE using this structure. - TimeNow func() time.Time - // kvStore is the key-value store we're using kvStore model.KeyValueStore @@ -178,7 +174,6 @@ func NewHTTPSDialerStatsManager(kvStore model.KeyValueStore, logger model.Logger } return &HTTPSDialerStatsManager{ - TimeNow: time.Now, root: root, kvStore: kvStore, logger: logger, @@ -216,7 +211,7 @@ func (mt *HTTPSDialerStatsManager) OnStarting(tactic *HTTPSDialerTactic) { // update stats record.CountStarted++ - record.LastUpdated = mt.TimeNow() + record.LastUpdated = time.Now() } // OnTCPConnectError implements HTTPSDialerStatsManager. @@ -233,7 +228,7 @@ func (mt *HTTPSDialerStatsManager) OnTCPConnectError(ctx context.Context, tactic } // update stats - record.LastUpdated = mt.TimeNow() + record.LastUpdated = time.Now() if ctx.Err() != nil { record.CountTCPConnectInterrupt++ return @@ -256,7 +251,7 @@ func (mt *HTTPSDialerStatsManager) OnTLSHandshakeError(ctx context.Context, tact } // update stats - record.LastUpdated = mt.TimeNow() + record.LastUpdated = time.Now() if ctx.Err() != nil { record.CountTLSHandshakeInterrupt++ return @@ -281,7 +276,7 @@ func (mt *HTTPSDialerStatsManager) OnTLSVerifyError(tactic *HTTPSDialerTactic, e // update stats record.CountTLSVerificationError++ record.HistoTLSVerificationError[err.Error()]++ - record.LastUpdated = mt.TimeNow() + record.LastUpdated = time.Now() } // OnSuccess implements HTTPSDialerStatsManager. @@ -299,7 +294,7 @@ func (mt *HTTPSDialerStatsManager) OnSuccess(tactic *HTTPSDialerTactic) { // update stats record.CountSuccess++ - record.LastUpdated = mt.TimeNow() + record.LastUpdated = time.Now() } // Close implements io.Closer diff --git a/internal/enginenetx/network_internal_test.go b/internal/enginenetx/network_internal_test.go index 3e667ff42a..550add880b 100644 --- a/internal/enginenetx/network_internal_test.go +++ b/internal/enginenetx/network_internal_test.go @@ -3,7 +3,6 @@ package enginenetx import ( "sync" "testing" - "time" "github.com/ooni/probe-cli/v3/internal/kvstore" "github.com/ooni/probe-cli/v3/internal/mocks" @@ -33,7 +32,6 @@ func TestNetworkUnit(t *testing.T) { }, }, stats: &HTTPSDialerStatsManager{ - TimeNow: time.Now, kvStore: &kvstore.Memory{}, logger: model.DiscardLogger, mu: sync.Mutex{}, @@ -59,7 +57,6 @@ func TestNetworkUnit(t *testing.T) { netx := &Network{ reso: expected, stats: &HTTPSDialerStatsManager{ - TimeNow: time.Now, kvStore: &kvstore.Memory{}, logger: model.DiscardLogger, mu: sync.Mutex{},