Skip to content

Commit f3abd11

Browse files
committed
don't use things that instantly exit in a code path tests hit
The testing package is full of subtle magic, and one of the most subtle is this: t.Logf, etcetera, all write to a buffer which is then displayed after the test is run. Which means that, if you exit, the buffer is never displayed. This means that, if a test case can fail in a way that causes an instant exit, you don't hit defers, you don't get your log messages, you just get a mysterious exit of the process. We have two cases where backup commands were calling log.Fatal instead of returning an error. The error in question is displayed correctly and informatively if returned, so we return it. We also have one case where we were using os.Exit to avoid a deadlock. Instead, we make the thing that would deadlock conditional on the test not having failed. In the event that the test fails, we now print our failure message correctly, then also report an unclosed cluster. That's fine.
1 parent 549566b commit f3abd11

File tree

3 files changed

+20
-15
lines changed

3 files changed

+20
-15
lines changed

ctl/backup.go

+1-2
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ import (
1010
"fmt"
1111
"io"
1212
"io/fs"
13-
"log"
1413
"os"
1514
"path/filepath"
1615
"time"
@@ -549,7 +548,7 @@ func (cmd *BackupCommand) backupShardDataframe(ctx context.Context, indexName st
549548
resp, err := client.GetDataframeShard(ctx, indexName, shard)
550549
// no error if doesn't exist
551550
if err != nil {
552-
log.Fatal(err)
551+
return fmt.Errorf("getting dataframe: %w", err)
553552
}
554553
defer resp.Body.Close()
555554
if resp.StatusCode == 404 {

ctl/backup_tar.go

+1-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import (
88
"encoding/json"
99
"fmt"
1010
"io"
11-
"log"
1211
"os"
1312
"path"
1413
"path/filepath"
@@ -329,7 +328,7 @@ func (cmd *BackupTarCommand) backupTarShardDataframe(ctx context.Context, tw *ta
329328
resp, err := client.GetDataframeShard(ctx, indexName, shard)
330329
// no error if doesn't exist
331330
if err != nil {
332-
log.Fatal(err)
331+
return fmt.Errorf("getting dataframe: %w", err)
333332
}
334333
defer resp.Body.Close()
335334
if resp.StatusCode == 404 {

index_test.go

+18-11
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import (
66
"context"
77
"fmt"
88
"math/rand"
9-
"os"
109
"reflect"
1110
"testing"
1211
"time"
@@ -258,7 +257,23 @@ func isNotFoundError(err error) bool {
258257
// For details, check out https://molecula.atlassian.net/browse/CORE-919
259258
func TestIndex_RecreateFieldOnRestart(t *testing.T) {
260259
c := test.MustRunUnsharedCluster(t, 1)
261-
defer c.Close()
260+
defer func() {
261+
// We anticipate a deadlock, and if we hit the deadlock,
262+
// closing would ALSO deadlock, so we won't want to do that.
263+
//
264+
// The alternative, of trying to call os.Exit, prevents us
265+
// from reporting anything at all, because testing doesn't
266+
// actually display messages until it's done.
267+
//
268+
// So, if we're bailing because of a fatal error, we don't
269+
// try to close the cluster, because Something Went Wrong and
270+
// it may well have been a deadlock. We leak one cluster on
271+
// a failed test, but we correctly report the test as failed
272+
// before also reporting the unclosed resources.
273+
if !t.Failed() {
274+
c.Close()
275+
}
276+
}()
262277

263278
// create index
264279
indexName := fmt.Sprintf("idx_%d", rand.Uint64())
@@ -310,15 +325,7 @@ func TestIndex_RecreateFieldOnRestart(t *testing.T) {
310325
}()
311326
select {
312327
case <-time.After(10 * time.Second):
313-
// We have to use os.Exit here instead of t.Fatal or panic since
314-
// on panic, deferred statements are still ran. Given that
315-
// we have deferred cluster.Close(), it deadlocks on the same
316-
// issue this test is, well, is testing on.
317-
// With os.Exit, the process exits at that point without running the
318-
// deferred actions. This is more of a work-around fix to make the
319-
// test meaningful on timeout.
320-
t.Logf("recreating field took too long")
321-
os.Exit(1)
328+
t.Fatal("recreating field took too long")
322329
case err := <-errCh:
323330
if err != nil {
324331
t.Fatal(err)

0 commit comments

Comments
 (0)