From b124b6971bb7f9247f73b9db83de09ff5a7b8aee Mon Sep 17 00:00:00 2001 From: Simone Basso Date: Fri, 10 May 2024 15:36:02 +0200 Subject: [PATCH] fix(enginenetx): compute zero time when we start dialing (#1594) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the code was computing the zero time when we started resolving. However, I have observed in the wild that, if the DNS lookup time is high, we're going to have several ready tactics. We did not previously see this bug because we gave priority to bridges and stats tactics, hence we always had some ready tactics from the get go. This PR is part of settling the dust after the changes requested in the https://github.com/ooni/probe-cli/pull/1552 review. The related tracking issue is https://github.com/ooni/probe/issues/2704. --------- Co-authored-by: Arturo Filastò --- internal/enginenetx/filter.go | 3 ++ internal/enginenetx/httpsdialer.go | 51 +++++++++++++++++++++++------- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/internal/enginenetx/filter.go b/internal/enginenetx/filter.go index 19fb47086..998c1c98d 100644 --- a/internal/enginenetx/filter.go +++ b/internal/enginenetx/filter.go @@ -60,6 +60,9 @@ func filterAssignInitialDelays(input <-chan *httpsDialerTactic) <-chan *httpsDia index := 0 for tx := range input { + // TODO(bassosimone): what do we do now about the user configured + // initial delays? Should we declare them as deprecated? + // rewrite the delays tx.InitialDelay = happyEyeballsDelay(index) index++ diff --git a/internal/enginenetx/httpsdialer.go b/internal/enginenetx/httpsdialer.go index 0375967d8..f59ceb938 100644 --- a/internal/enginenetx/httpsdialer.go +++ b/internal/enginenetx/httpsdialer.go @@ -8,6 +8,7 @@ import ( "errors" "fmt" "net" + "sync" "sync/atomic" "time" @@ -206,24 +207,22 @@ func (hd *httpsDialer) DialTLSContext(ctx context.Context, network string, endpo // pattern used by `./internal/httpclientx` to perform attempts faster // in case there is an initial early failure. - // TODO(bassosimone): the algorithm to filter and assign initial - // delays is broken because, if the DNS runs for more than one - // second, then several policies will immediately be due. We should - // probably use a better strategy that takes as the zero the time - // when the first dialing policy becomes available. - // We need a cancellable context to interrupt the tactics emitter early when we // immediately get a valid response and we don't need to use other tactics. ctx, cancel := context.WithCancel(ctx) defer cancel() + // Create structure for computing the zero dialing time once during + // the first dial, so that subsequent attempts use happy eyeballs based + // on the moment in which we tried the first dial. + t0 := &httpsDialerWorkerZeroTime{} + // The emitter will emit tactics and then close the channel when done. We spawn 16 workers // that handle tactics in parallel and post results on the collector channel. emitter := httpsDialerFilterTactics(hd.policy.LookupTactics(ctx, hostname, port)) collector := make(chan *httpsDialerErrorOrConn) joiner := make(chan any) const parallelism = 16 - t0 := time.Now() for idx := 0; idx < parallelism; idx++ { go hd.worker(ctx, joiner, emitter, t0, collector) } @@ -257,6 +256,36 @@ func (hd *httpsDialer) DialTLSContext(ctx context.Context, network string, endpo return httpsDialerReduceResult(connv, errorv) } +// httpsDialerWorkerZeroTime contains the zero time used when dialing. We set this +// zero time when we start the first dialing attempt, such that subsequent attempts +// are correctly spaced starting from such a zero time. +// +// A previous approach was that we were taking the zero time when we started +// getting tactics, but this approach was wrong, because it caused several tactics +// to be ready, when the DNS lookup was slow. +// +// The zero value of this structure is ready to use. +type httpsDialerWorkerZeroTime struct { + // mu provides mutual exclusion. + mu sync.Mutex + + // t is the zero time. + t time.Time +} + +// Get returns the zero dialing time. The first invocation of this method +// saves the zero dialing time and subsquent invocations just return it. +// +// This method is safe to be called concurrently by goroutines. +func (t0 *httpsDialerWorkerZeroTime) Get() time.Time { + defer t0.mu.Unlock() + t0.mu.Lock() + if t0.t.IsZero() { + t0.t = time.Now() + } + return t0.t +} + // httpsDialerFilterTactics filters the tactics to: // // 1. be paranoid and filter out nil tactics if any; @@ -297,7 +326,7 @@ func (hd *httpsDialer) worker( ctx context.Context, joiner chan<- any, reader <-chan *httpsDialerTactic, - t0 time.Time, + t0 *httpsDialerWorkerZeroTime, writer chan<- *httpsDialerErrorOrConn, ) { // let the parent know that we terminated @@ -321,7 +350,7 @@ func (hd *httpsDialer) worker( func (hd *httpsDialer) dialTLS( ctx context.Context, logger model.Logger, - t0 time.Time, + t0 *httpsDialerWorkerZeroTime, tactic *httpsDialerTactic, ) (model.TLSConn, error) { // honor happy-eyeballs delays and wait for the tactic to be ready to run @@ -398,10 +427,10 @@ func (hd *httpsDialer) dialTLS( // return the context error if the context expires. func httpsDialerTacticWaitReady( ctx context.Context, - t0 time.Time, + t0 *httpsDialerWorkerZeroTime, tactic *httpsDialerTactic, ) error { - deadline := t0.Add(tactic.InitialDelay) + deadline := t0.Get().Add(tactic.InitialDelay) delta := time.Until(deadline) if delta <= 0 { return nil