From 0ce4cf7a4aa18a3e0eac2cac4957ed6ffe5524ec Mon Sep 17 00:00:00 2001 From: Aman Mangal Date: Tue, 17 Oct 2023 23:59:29 +0530 Subject: [PATCH] fix(restore): Do not retry restore proposal (#8058) Do not retry the restore proposal. It can cause issues in the edge case scenarios. Consider the following scenario: 1. alpha-2 gets the restore request (leader is alpha-0) 2. alpha-2 sends the request to alpha-0 (leader). 3. alpha-0 called proposeAndWait which proposed the req (index 24) at time=15:56:10 4. alpha-0 was still waiting for the proposal to be applied and RPC call for `Restore` by alpha-2 got "transport closing error" at time=15:59:08 5. transport closing is a retriable error, so alpha-2 again tried to proposeoOrSend, this time leader was alpha-1, so it sent it to alpha-1 6. alpha-1 proposed the restore request (index 28) at time=15:59:09 --- worker/online_restore.go | 38 +------------------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/worker/online_restore.go b/worker/online_restore.go index c13df8ebd13..8fcf44294ac 100644 --- a/worker/online_restore.go +++ b/worker/online_restore.go @@ -22,7 +22,6 @@ import ( "path/filepath" "strings" "sync" - "time" "github.com/golang/glog" "github.com/golang/protobuf/proto" @@ -149,7 +148,7 @@ func ProcessRestoreRequest(ctx context.Context, req *pb.RestoreRequest, wg *sync reqCopy.GroupId = gid wg.Add(1) go func() { - errCh <- tryRestoreProposal(ctx, reqCopy) + errCh <- proposeRestoreOrSend(ctx, reqCopy) }() } @@ -182,41 +181,6 @@ func proposeRestoreOrSend(ctx context.Context, req *pb.RestoreRequest) error { return err } -func retriableRestoreError(err error) bool { - switch { - case err == conn.ErrNoConnection: - // Try to recover from temporary connection issues. - return true - case strings.Contains(err.Error(), "Raft isn't initialized yet"): - // Try to recover if raft has not been initialized. - return true - case strings.Contains(err.Error(), errRestoreProposal): - // Do not try to recover from other errors when sending the proposal. - return false - default: - // Try to recover from other errors (e.g wrong group, waiting for timestamp, etc). - return true - } -} - -func tryRestoreProposal(ctx context.Context, req *pb.RestoreRequest) error { - var err error - for i := 0; i < 10; i++ { - err = proposeRestoreOrSend(ctx, req) - if err == nil { - glog.Info("proposeRestoreOrSend done.") - return nil - } - glog.Errorf("Got error while proposeRestoreOrSend: %v, retrying!", err) - if retriableRestoreError(err) { - time.Sleep(time.Second) - continue - } - return err - } - return err -} - // Restore implements the Worker interface. func (w *grpcWorker) Restore(ctx context.Context, req *pb.RestoreRequest) (*pb.Status, error) { var emptyRes pb.Status