From 72ff0be6857c0f714b2d588bd108afb2cab3de43 Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Thu, 5 Sep 2024 07:57:33 +0000 Subject: [PATCH] Propagate raft_recover errors and improve tracing At the moment, the raft_recover error messages are not propagated, so the caller gets a cryptic error code (1), without further details. This change ensures that the error messages are propagated and adds a few extra trace messages. Furthermore, tracing is disabled unless we start the node (not the case when we're actually trying to recover it). For now, we're adding a "dqliteTracingMaybeEnable" call in "dqlite_node_recover_ext". --- src/raft/raft.c | 1 + src/raft/uv.c | 11 +++++++++++ src/server.c | 7 +++++++ 3 files changed, 19 insertions(+) diff --git a/src/raft/raft.c b/src/raft/raft.c index ce3996b90..b0dd7d39f 100644 --- a/src/raft/raft.c +++ b/src/raft/raft.c @@ -227,6 +227,7 @@ int raft_recover(struct raft *r, const struct raft_configuration *conf) rv = r->io->recover(r->io, conf); if (rv != 0) { + ErrMsgTransfer(r->io->errmsg, r->errmsg, "io"); return rv; } diff --git a/src/raft/uv.c b/src/raft/uv.c index f1450e742..b093715e1 100644 --- a/src/raft/uv.c +++ b/src/raft/uv.c @@ -324,6 +324,7 @@ static int uvFilterSegments(struct uv *uv, ErrMsgPrintf(uv->io->errmsg, "closed segment %s is past last snapshot %s", segment->filename, snapshot_filename); + tracef("corrupted raft state, error: %s", uv->io->errmsg); return RAFT_CORRUPT; } @@ -369,6 +370,7 @@ static int uvLoadSnapshotAndEntries(struct uv *uv, rv = UvList(uv, &snapshots, &n_snapshots, &segments, &n_segments, uv->io->errmsg); if (rv != 0) { + tracef("failed to list snapshots and segments, error: %d", rv); goto err; } @@ -377,12 +379,14 @@ static int uvLoadSnapshotAndEntries(struct uv *uv, char snapshot_filename[UV__FILENAME_LEN]; *snapshot = RaftHeapMalloc(sizeof **snapshot); if (*snapshot == NULL) { + tracef("malloc failed"); rv = RAFT_NOMEM; goto err; } rv = UvSnapshotLoad(uv, &snapshots[n_snapshots - 1], *snapshot, uv->io->errmsg); if (rv != 0) { + tracef("snapshot load failed: %d", rv); RaftHeapFree(*snapshot); *snapshot = NULL; goto err; @@ -401,6 +405,7 @@ static int uvLoadSnapshotAndEntries(struct uv *uv, rv = uvFilterSegments(uv, (*snapshot)->index, snapshot_filename, &segments, &n_segments); if (rv != 0) { + tracef("failed to filter segments: %d", rv); goto err; } if (segments != NULL) { @@ -420,6 +425,7 @@ static int uvLoadSnapshotAndEntries(struct uv *uv, rv = uvSegmentLoadAll(uv, *start_index, segments, n_segments, entries, n); if (rv != 0) { + tracef("failed to load all segments: %d", rv); goto err; } @@ -447,6 +453,9 @@ static int uvLoadSnapshotAndEntries(struct uv *uv, err: assert(rv != 0); + tracef("auto-recovery: %d, load depth: %d, error: %s", + uv->auto_recovery, depth, uv->io->errmsg); + if (*snapshot != NULL) { snapshotDestroy(*snapshot); *snapshot = NULL; @@ -583,6 +592,7 @@ static int uvRecover(struct raft_io *io, const struct raft_configuration *conf) rv = uvLoadSnapshotAndEntries(uv, &snapshot, &start_index, &entries, &n_entries, 0); if (rv != 0) { + tracef("failed to load raft snapshot and entries, error: %d", rv); return rv; } @@ -599,6 +609,7 @@ static int uvRecover(struct raft_io *io, const struct raft_configuration *conf) rv = uvSegmentCreateClosedWithConfiguration(uv, next_index, conf); if (rv != 0) { + tracef("failed to create segment, error: %d", rv); return rv; } diff --git a/src/server.c b/src/server.c index 7e44b1c49..3f4f33fa5 100644 --- a/src/server.c +++ b/src/server.c @@ -1022,6 +1022,7 @@ int dqlite_node_recover_ext(dqlite_node *n, struct dqlite_node_info_ext infos[], int n_info) { + dqliteTracingMaybeEnable(true); tracef("dqlite node recover ext"); struct raft_configuration configuration; int i; @@ -1031,6 +1032,7 @@ int dqlite_node_recover_ext(dqlite_node *n, for (i = 0; i < n_info; i++) { struct dqlite_node_info_ext *info = &infos[i]; if (!node_info_valid(info)) { + tracef("invalid node info"); rv = DQLITE_MISUSE; goto out; } @@ -1040,6 +1042,7 @@ int dqlite_node_recover_ext(dqlite_node *n, rv = raft_configuration_add(&configuration, info->id, address, raft_role); if (rv != 0) { + tracef("unable to add server to raft configuration, error: %d", rv); assert(rv == RAFT_NOMEM); rv = DQLITE_NOMEM; goto out; @@ -1049,11 +1052,15 @@ int dqlite_node_recover_ext(dqlite_node *n, int lock_fd; rv = acquire_dir(n->config.raft_dir, &lock_fd); if (rv != 0) { + tracef("couldn't acquire lock, error: %d", rv); goto out; } rv = raft_recover(&n->raft, &configuration); if (rv != 0) { + tracef("raft recovery failed, error: %d", rv); + snprintf(n->errmsg, DQLITE_ERRMSG_BUF_SIZE, "raft_recover(): %s", + raft_errmsg(&n->raft)); rv = DQLITE_ERROR; goto out; }