From 4a44a52fd93900a3c21ed3ef0805e8c25eeaa09b Mon Sep 17 00:00:00 2001 From: Erik Sipsma Date: Tue, 11 Jun 2024 02:38:09 -0700 Subject: [PATCH] executor: fix deadlock in service exit from netns workers (#7610) There was a race condition where contexts get canceled after a job is sent to the netns worker but before the result got read. This caused runInNetNS to exit (due to canceled context) but the result chan to never be read from. It was crucially an unbuffered chan, which resulted in the worker never being able to exit and the whole container cleanup to block indefinitely. The fix here is just to make that chan buffered with a size of 1 so that the worker doesn't ever get blocked trying to write to it. There's a few other related changes of making some other chans buffered and explicitly closing them with a defer (to handle panic cases) which aren't needed to fix this issue but seemed worth tidying up now. Signed-off-by: Erik Sipsma --- engine/buildkit/linux_namespace.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/engine/buildkit/linux_namespace.go b/engine/buildkit/linux_namespace.go index 45601800d..40e3e7873 100644 --- a/engine/buildkit/linux_namespace.go +++ b/engine/buildkit/linux_namespace.go @@ -29,7 +29,7 @@ func runInNetNS[T any]( value T err error } - resultCh := make(chan result) + resultCh := make(chan result, 1) select { case <-ctx.Done(): @@ -37,6 +37,7 @@ func runInNetNS[T any]( case <-state.done: return zero, fmt.Errorf("container exited") case state.netNSJobs <- func() { + defer close(resultCh) v, err := fn() resultCh <- result{value: v, err: err} }: @@ -108,8 +109,9 @@ func (w *Worker) runNetNSWorkers(ctx context.Context, state *execState) error { }}} // must run in it's own isolated goroutine since it will lock to threads - errCh := make(chan error) + errCh := make(chan error, 1) go func() { + defer close(errCh) errCh <- nsw.run(ctx, state.netNSJobs) }() err := <-errCh