Skip to content

Commit

Permalink
Bootstrapper: don't restart healthy tablets (#9659)
Browse files Browse the repository at this point in the history
  • Loading branch information
snaury authored Sep 26, 2024
1 parent 87ff4fa commit 6bec40a
Show file tree
Hide file tree
Showing 16 changed files with 1,183 additions and 417 deletions.
12 changes: 8 additions & 4 deletions ydb/core/base/statestorage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,13 +164,14 @@ void TStateStorageInfo::TSelection::MergeReply(EStatus status, EStatus *owner, u
ui32 unknown = 0;
ui32 ok = 0;
ui32 outdated = 0;
ui32 unavailable = 0;

const ui32 majority = Sz / 2 + 1;

ui32 cookie = 0;
for (ui32 i = 0; i < Sz; ++i) {
EStatus &st = Status[i];
if (resetOld && st != StatusUnknown)
if (resetOld && st != StatusUnknown && st != StatusUnavailable)
st = StatusOutdated;

if (cookie == targetCookie)
Expand All @@ -190,16 +191,19 @@ void TStateStorageInfo::TSelection::MergeReply(EStatus status, EStatus *owner, u
case StatusOutdated:
++outdated;
break;
case StatusUnavailable:
++unavailable;
break;
}
}

if (owner) {
if (ok >= majority) {
*owner = StatusOk;
} else if (outdated >= majority) {
*owner = StatusOutdated;
} else if (ok + unknown < majority) {
if (outdated)
if (unavailable > (Sz - majority))
*owner = StatusUnavailable;
else if (outdated)
*owner = StatusOutdated;
else
*owner = StatusNoInfo;
Expand Down
1 change: 1 addition & 0 deletions ydb/core/base/statestorage.h
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,7 @@ struct TStateStorageInfo : public TThrRefBase {
StatusOk,
StatusNoInfo,
StatusOutdated,
StatusUnavailable,
};

ui32 Sz;
Expand Down
29 changes: 22 additions & 7 deletions ydb/core/base/statestorage_proxy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ class TStateStorageProxyRequest : public TActor<TStateStorageProxyRequest> {
Signature[cookie] = Max<ui64>();
++RepliesMerged;

ReplicaSelection->MergeReply(TStateStorageInfo::TSelection::StatusNoInfo, &ReplyStatus, cookie, false);
ReplicaSelection->MergeReply(TStateStorageInfo::TSelection::StatusUnavailable, &ReplyStatus, cookie, false);
}
}

Expand All @@ -192,7 +192,8 @@ class TStateStorageProxyRequest : public TActor<TStateStorageProxyRequest> {
++RepliesMerged;
++SignaturesMerged;

if (status == NKikimrProto::OK) {
switch (status) {
case NKikimrProto::OK: {
const ui32 gen = record.GetCurrentGeneration();
const ui32 step = record.GetCurrentStep();
const TActorId leader = ActorIdFromProto(record.GetCurrentLeader());
Expand Down Expand Up @@ -221,9 +222,14 @@ class TStateStorageProxyRequest : public TActor<TStateStorageProxyRequest> {

ReplicaSelection->MergeReply(TStateStorageInfo::TSelection::StatusOk, &ReplyStatus, cookie, reset);
}
} else if (status == NKikimrProto::ERROR) {
break;
}
// NOTE: replicas currently reply with ERROR when there is no data for the tablet
case NKikimrProto::ERROR:
case NKikimrProto::NODATA:
ReplicaSelection->MergeReply(TStateStorageInfo::TSelection::StatusNoInfo, &ReplyStatus, cookie, false);
} else {
break;
default:
Y_ABORT();
}

Expand Down Expand Up @@ -307,11 +313,14 @@ class TStateStorageProxyRequest : public TActor<TStateStorageProxyRequest> {
ReplyAndDie(NKikimrProto::OK);
return;
case TStateStorageInfo::TSelection::StatusNoInfo:
ReplyAndDie(NKikimrProto::ERROR);
ReplyAndDie(NKikimrProto::NODATA);
return;
case TStateStorageInfo::TSelection::StatusOutdated:
ReplyAndDie(NKikimrProto::RACE);
return;
case TStateStorageInfo::TSelection::StatusUnavailable:
ReplyAndDie(NKikimrProto::ERROR);
return;
}
Y_DEBUG_ABORT_UNLESS(false);
PassAway();
Expand All @@ -332,12 +341,15 @@ class TStateStorageProxyRequest : public TActor<TStateStorageProxyRequest> {
return;
case TStateStorageInfo::TSelection::StatusNoInfo:
if (RepliesMerged == Replicas) { // for a negative response, always wait for the full reply set to avoid herding of good replicas by a fast retry cycle
ReplyAndSig(NKikimrProto::ERROR);
ReplyAndSig(NKikimrProto::NODATA);
}
return;
case TStateStorageInfo::TSelection::StatusOutdated:
ReplyAndSig(NKikimrProto::RACE);
return;
case TStateStorageInfo::TSelection::StatusUnavailable:
ReplyAndSig(NKikimrProto::ERROR);
return;
}
}
}
Expand Down Expand Up @@ -379,6 +391,8 @@ class TStateStorageProxyRequest : public TActor<TStateStorageProxyRequest> {
}
return;
case TStateStorageInfo::TSelection::StatusNoInfo:
case TStateStorageInfo::TSelection::StatusUnavailable:
// Note: StatusNoInfo shouldn't really happen for update queries
ReplyAndDie(NKikimrProto::ERROR);
return;
case TStateStorageInfo::TSelection::StatusOutdated:
Expand All @@ -404,7 +418,8 @@ class TStateStorageProxyRequest : public TActor<TStateStorageProxyRequest> {
}
return;
case TStateStorageInfo::TSelection::StatusNoInfo:
// should not happen for update queries
case TStateStorageInfo::TSelection::StatusUnavailable:
// Note: StatusNoInfo shouldn't really happen for update queries
ReplyAndSig(NKikimrProto::ERROR);
return;
case TStateStorageInfo::TSelection::StatusOutdated:
Expand Down
1 change: 1 addition & 0 deletions ydb/core/base/statestorage_replica.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ class TStateStorageReplica : public TActorBootstrapped<TStateStorageReplica> {
}
}
} else {
// FIXME: change to NODATA in a future version
msg.Reset(new TEvStateStorage::TEvReplicaInfo(tabletId, NKikimrProto::ERROR));
}
msg->Record.SetCookie(cookie);
Expand Down
Loading

0 comments on commit 6bec40a

Please sign in to comment.