diff --git a/ydb/core/base/statestorage.cpp b/ydb/core/base/statestorage.cpp index 942250db4545..ad9fdd5a7a18 100644 --- a/ydb/core/base/statestorage.cpp +++ b/ydb/core/base/statestorage.cpp @@ -164,13 +164,14 @@ void TStateStorageInfo::TSelection::MergeReply(EStatus status, EStatus *owner, u ui32 unknown = 0; ui32 ok = 0; ui32 outdated = 0; + ui32 unavailable = 0; const ui32 majority = Sz / 2 + 1; ui32 cookie = 0; for (ui32 i = 0; i < Sz; ++i) { EStatus &st = Status[i]; - if (resetOld && st != StatusUnknown) + if (resetOld && st != StatusUnknown && st != StatusUnavailable) st = StatusOutdated; if (cookie == targetCookie) @@ -190,16 +191,19 @@ void TStateStorageInfo::TSelection::MergeReply(EStatus status, EStatus *owner, u case StatusOutdated: ++outdated; break; + case StatusUnavailable: + ++unavailable; + break; } } if (owner) { if (ok >= majority) { *owner = StatusOk; - } else if (outdated >= majority) { - *owner = StatusOutdated; } else if (ok + unknown < majority) { - if (outdated) + if (unavailable > (Sz - majority)) + *owner = StatusUnavailable; + else if (outdated) *owner = StatusOutdated; else *owner = StatusNoInfo; diff --git a/ydb/core/base/statestorage.h b/ydb/core/base/statestorage.h index fdb80d2702d8..7e6eaf82648a 100644 --- a/ydb/core/base/statestorage.h +++ b/ydb/core/base/statestorage.h @@ -470,6 +470,7 @@ struct TStateStorageInfo : public TThrRefBase { StatusOk, StatusNoInfo, StatusOutdated, + StatusUnavailable, }; ui32 Sz; diff --git a/ydb/core/base/statestorage_proxy.cpp b/ydb/core/base/statestorage_proxy.cpp index 61c1579d5ced..b56727f867e7 100644 --- a/ydb/core/base/statestorage_proxy.cpp +++ b/ydb/core/base/statestorage_proxy.cpp @@ -177,7 +177,7 @@ class TStateStorageProxyRequest : public TActor { Signature[cookie] = Max(); ++RepliesMerged; - ReplicaSelection->MergeReply(TStateStorageInfo::TSelection::StatusNoInfo, &ReplyStatus, cookie, false); + ReplicaSelection->MergeReply(TStateStorageInfo::TSelection::StatusUnavailable, &ReplyStatus, cookie, false); } } @@ -192,7 +192,8 @@ class TStateStorageProxyRequest : public TActor { ++RepliesMerged; ++SignaturesMerged; - if (status == NKikimrProto::OK) { + switch (status) { + case NKikimrProto::OK: { const ui32 gen = record.GetCurrentGeneration(); const ui32 step = record.GetCurrentStep(); const TActorId leader = ActorIdFromProto(record.GetCurrentLeader()); @@ -221,9 +222,14 @@ class TStateStorageProxyRequest : public TActor { ReplicaSelection->MergeReply(TStateStorageInfo::TSelection::StatusOk, &ReplyStatus, cookie, reset); } - } else if (status == NKikimrProto::ERROR) { + break; + } + // NOTE: replicas currently reply with ERROR when there is no data for the tablet + case NKikimrProto::ERROR: + case NKikimrProto::NODATA: ReplicaSelection->MergeReply(TStateStorageInfo::TSelection::StatusNoInfo, &ReplyStatus, cookie, false); - } else { + break; + default: Y_ABORT(); } @@ -307,11 +313,14 @@ class TStateStorageProxyRequest : public TActor { ReplyAndDie(NKikimrProto::OK); return; case TStateStorageInfo::TSelection::StatusNoInfo: - ReplyAndDie(NKikimrProto::ERROR); + ReplyAndDie(NKikimrProto::NODATA); return; case TStateStorageInfo::TSelection::StatusOutdated: ReplyAndDie(NKikimrProto::RACE); return; + case TStateStorageInfo::TSelection::StatusUnavailable: + ReplyAndDie(NKikimrProto::ERROR); + return; } Y_DEBUG_ABORT_UNLESS(false); PassAway(); @@ -332,12 +341,15 @@ class TStateStorageProxyRequest : public TActor { return; case TStateStorageInfo::TSelection::StatusNoInfo: if (RepliesMerged == Replicas) { // for negative response always waits for full reply set to avoid herding of good replicas by fast retry cycle - ReplyAndSig(NKikimrProto::ERROR); + ReplyAndSig(NKikimrProto::NODATA); } return; case TStateStorageInfo::TSelection::StatusOutdated: ReplyAndSig(NKikimrProto::RACE); return; + case TStateStorageInfo::TSelection::StatusUnavailable: + ReplyAndSig(NKikimrProto::ERROR); + return; } } } @@ -379,6 +391,8 @@ class TStateStorageProxyRequest : public TActor { } return; case TStateStorageInfo::TSelection::StatusNoInfo: + case TStateStorageInfo::TSelection::StatusUnavailable: + // Note: StatusNoInfo shouldn't really happen for update queries ReplyAndDie(NKikimrProto::ERROR); return; case TStateStorageInfo::TSelection::StatusOutdated: @@ -404,7 +418,8 @@ class TStateStorageProxyRequest : public TActor { } return; case TStateStorageInfo::TSelection::StatusNoInfo: - // should not happens for update queries + case TStateStorageInfo::TSelection::StatusUnavailable: + // Note: StatusNoInfo shouldn't really happen for update queries ReplyAndSig(NKikimrProto::ERROR); return; case TStateStorageInfo::TSelection::StatusOutdated: diff --git a/ydb/core/base/statestorage_replica.cpp b/ydb/core/base/statestorage_replica.cpp index 1d352f6eb51c..c99e645d4b78 100644 --- a/ydb/core/base/statestorage_replica.cpp +++ b/ydb/core/base/statestorage_replica.cpp @@ -104,6 +104,7 @@ class TStateStorageReplica : public TActorBootstrapped { } } } else { + // FIXME: change to NODATA in a future version msg.Reset(new TEvStateStorage::TEvReplicaInfo(tabletId, NKikimrProto::ERROR)); } msg->Record.SetCookie(cookie); diff --git a/ydb/core/tablet/bootstrapper.cpp b/ydb/core/tablet/bootstrapper.cpp index 375cac8787a8..3a8b5030d9d6 100644 --- a/ydb/core/tablet/bootstrapper.cpp +++ b/ydb/core/tablet/bootstrapper.cpp @@ -1,10 +1,13 @@ #include "bootstrapper.h" +#include "bootstrapper_impl.h" #include +#include #include #include #include +#include #include #include #include @@ -13,138 +16,37 @@ namespace NKikimr { -struct TEvBootstrapper::TEvWatch : public TEventPB { - TEvWatch() - {} +class TBootstrapper : public TActorBootstrapped { + const TIntrusivePtr TabletInfo; + const TIntrusivePtr BootstrapperInfo; + bool ModeStandby; - TEvWatch(ui64 tabletId, ui64 selfSeed, ui64 round) - { - Record.SetTabletID(tabletId); - Record.SetSelfSeed(selfSeed); - Record.SetRound(round); - } -}; - -struct TEvBootstrapper::TEvWatchResult : public TEventPB { - TEvWatchResult() - {} - - TEvWatchResult(ui64 tabletId, NKikimrBootstrapper::TEvWatchResult::EState state, ui64 seed, ui64 round) - { - Record.SetTabletID(tabletId); - Record.SetState(state); - Record.SetSeed(seed); - Record.SetRound(round); - } -}; - -struct TEvBootstrapper::TEvNotify : public TEventPB { - TEvNotify() - {} - - TEvNotify(ui64 tabletId, NKikimrBootstrapper::TEvNotify::EOp op, ui64 round) - { - Record.SetTabletID(tabletId); - Record.SetOp(op); - Record.SetRound(round); - } -}; - - -class TBootstrapper : public TActor { - TIntrusivePtr TabletInfo; - TIntrusivePtr BootstrapperInfo; + TActorId KnownLeaderPipe; TActorId LookOnActorID; TActorId FollowerActorID; - ui64 RoundCounter; - ui64 SelfSeed; + ui64 RoundCounter = 0xdeadbeefdeadbeefull; + ui64 SelfSeed = 0xdeadbeefdeadbeefull; TInstant BootDelayedUntil; - // we watch someone and do not act w/o shutdown - struct TWatch { - struct TWatched { - TActorId ActorID; - bool Owner; - - TWatched() - : Owner(false) - {} - TWatched(const TActorId &actorID, bool owner) - : ActorID(actorID) - , Owner(owner) - {} - }; - - bool CheckWatch(ui32 fromNode) { - bool seenOwner = false; - for (TWatched &x : Watched) { - seenOwner |= x.Owner; - if (x.ActorID.NodeId() == fromNode) - return (x.Owner == false); - } - return !seenOwner; - } - - bool RemoveAlienEntry(ui32 idx) { - Watched[idx] = Watched.back(); - Watched.pop_back(); - if (!AnyOf(Watched, [](const TWatch::TWatched &x) { return x.Owner; })) - Watched.clear(); - return Watched.empty(); - } - - bool RemoveAlien(const TActorId &alien) { - for (ui32 i = 0, e = Watched.size(); i != e; ++i) { - if (Watched[i].ActorID == alien) - return RemoveAlienEntry(i); - } - return false; - } - - bool RemoveAlienNode(ui32 node) { - for (ui32 i = 0, e = Watched.size(); i != e; ++i) { - if (Watched[i].ActorID.NodeId() == node) - return RemoveAlienEntry(i); - } - return false; - } - - TVector Watched; - }; - - // we are under watch, must notify on error - struct TWatched { - struct TWatcher { - TActorId ActorID; - ui64 Round; - - TWatcher() - : ActorID() - , Round() - {} - TWatcher(const TActorId &actorID, ui64 round) - : ActorID(actorID) - , Round(round) - {} - }; - - void AddWatcher(const TActorId &actorId, ui64 round) { - for (TWatcher &x : Watchers) { - if (actorId.NodeId() == x.ActorID.NodeId()) { - x.ActorID = actorId; - x.Round = round; - return; - } - } - Watchers.push_back(TWatcher(actorId, round)); - } - - TVector Watchers; +private: + /** + * A remote watcher waiting for our notification + */ + struct TWatcher { + TActorId ActorId{}; + ui64 Round{}; }; +private: + /** + * Present when this bootstrapper initiates a voting round + * We wait for the state of other nodes, when all other nodes are either + * free or unavailable the node with the minimum seed becomes owner and + * boots the tablet. + */ struct TRound { enum class EAlienState { Wait, @@ -155,26 +57,75 @@ class TBootstrapper : public TActor { }; struct TAlien { - EAlienState State; - ui64 Seed; - - TAlien() - : State(EAlienState::Wait) - , Seed() - {} - TAlien(EAlienState state, ui64 seed) - : State(state) - , Seed(seed) - {} + EAlienState State = EAlienState::Wait; + ui64 Seed = Max(); }; TVector Aliens; + TVector Watchers; + + explicit TRound(size_t count) + : Aliens(count) + , Watchers(count) + {} }; - TAutoPtr Watches; // we watch them - TAutoPtr Watched; // we are under watch - TAutoPtr Round; + std::optional Round; + +private: + /** + * Present when we watch some other node + * To avoid cycles we only ever watch a single node per round + */ + struct TWatching { + TActorId ActorId; + bool Owner; + + TWatching(const TActorId& actorId, bool owner) + : ActorId(actorId) + , Owner(owner) + {} + + /** + * Called when TEvWatch is received from another node while watching + * Returns true when we should move to a new cycle + */ + bool OnWatch(ui32 node) { + return ActorId.NodeId() == node; + } + + /** + * Called when TEvNotify indicates that a node is no longer owner/waiting + * Returns true when we should move to a new cycle + */ + bool OnNotify(const TActorId& actorId) { + return ActorId == actorId; + } + /** + * Called when a node has disconnected + * Returns true when we should move to a new cycle + */ + bool OnDisconnect(ui32 node) { + return ActorId.NodeId() == node; + } + }; + + std::optional Watching; + +private: + /** + * A set of watchers waiting for our notification + */ + struct TWatchedBy : public THashMap { + void Add(const TActorId& actorId, ui64 round) { + (*this)[actorId.NodeId()] = TWatcher{ actorId, round }; + } + }; + + std::optional WatchedBy; // we are watched by others + +private: const char* GetTabletTypeName() { return TTabletTypes::TypeToStr((TTabletTypes::EType)TabletInfo->TabletType); } @@ -183,159 +134,356 @@ class TBootstrapper : public TActor { return NKikimrBootstrapper::TEvWatchResult::EState_Name(state).c_str(); } - ui32 AlienIndex(ui32 alienNodeId) { - for (ui32 i = 0, e = BootstrapperInfo->OtherNodes.size(); i != e; ++i) + size_t AlienIndex(ui32 alienNodeId) { + for (size_t i = 0, e = BootstrapperInfo->OtherNodes.size(); i != e; ++i) if (BootstrapperInfo->OtherNodes[i] == alienNodeId) return i; - return Max(); + return Max(); } - void BeginNewRound(const TActorContext &ctx) { - auto localNodeId = SelfId().NodeId(); - auto whiteboardId = NNodeWhiteboard::MakeNodeWhiteboardServiceId(localNodeId); - Send(whiteboardId, new NNodeWhiteboard::TEvWhiteboard::TEvSystemStateAddRole("Bootstrapper")); + TDuration GetSleepDuration() { + TDuration wx = BootstrapperInfo->WatchThreshold; + // Note: we use RandomProvider for repeatability between test runs + ui64 seed = AppData()->RandomProvider->GenRand64(); + float k = float(seed % 0x10000) / 0x20000; + return wx * (0.5f + k); + } - if (BootstrapperInfo->OtherNodes.empty()) - return Boot(ctx); + ui64 GenerateSeed() { + ui64 seed = AppData()->RandomProvider->GenRand64(); + if (Y_UNLIKELY(seed == 0)) { + seed = 1; // avoid value zero (used by forced winners) + } else if (Y_UNLIKELY(seed == Max())) { + seed = Max() - 1; // avoid max value (used by non-participants) + } + return seed; + } + +private: + /** + * Starts a new cycle: + * - lookup tablet in state storage (sleep when unavailable) + * - begin voting round when tablet has no leader address + * - try connecting to leader address + * - begin voting round when connect fails + * - wait until pipe disconnects + * - notify watchers and start new cycle + */ + void BeginNewCycle() { + ++RoundCounter; - SelfSeed = AppData(ctx)->RandomProvider->GenRand64(); - LOG_INFO(ctx, NKikimrServices::BOOTSTRAPPER, "tablet: %" PRIu64 ", type: %s, begin new round, seed: %" PRIu64, - TabletInfo->TabletID, GetTabletTypeName(), SelfSeed); + LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet: " << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", begin new cycle (lookup in state storage)"); - const ui64 tabletId = TabletInfo->TabletID; - ++RoundCounter; + // We shouldn't start a new cycle with a connected leader pipe + Y_ABORT_UNLESS(!KnownLeaderPipe); - Round.Reset(new TRound()); - Round->Aliens.resize(BootstrapperInfo->OtherNodes.size()); + Send(MakeStateStorageProxyID(), new TEvStateStorage::TEvLookup(TabletInfo->TabletID, 0), IEventHandle::FlagTrackDelivery); + Become(&TThis::StateLookup); + } - for (ui32 alienNode : BootstrapperInfo->OtherNodes) { - ctx.Send(MakeBootstrapperID(tabletId, alienNode), new TEvBootstrapper::TEvWatch(tabletId, SelfSeed, RoundCounter), IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession, RoundCounter); + void HandleUnknown(TEvBootstrapper::TEvWatch::TPtr& ev) { + const NKikimrBootstrapper::TEvWatch& record = ev->Get()->Record; + Y_ABORT_UNLESS(record.GetTabletID() == TabletInfo->TabletID); + + Send(ev->Sender, new TEvBootstrapper::TEvWatchResult(record.GetTabletID(), + NKikimrBootstrapper::TEvWatchResult::UNKNOWN, Max(), record.GetRound())); + } + + void HandleLookup(TEvStateStorage::TEvInfo::TPtr& ev) { + auto* msg = ev->Get(); + + LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet: " << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", lookup: " << msg->Status << ", leader: " << msg->CurrentLeader); + + switch (msg->Status) { + case NKikimrProto::OK: { + // We have state storage quorum and some known leader + KnownLeaderPipe = RegisterWithSameMailbox( + NTabletPipe::CreateClient(SelfId(), TabletInfo->TabletID, { + .RetryPolicy = NTabletPipe::TClientRetryPolicy::WithoutRetries(), + .HintTablet = msg->CurrentLeader, + })); + Become(&TThis::StateConnectLeader); + return; + } + case NKikimrProto::NODATA: { + // We have state storage quorum and no known leader + BeginNewRound(); + return; + } + default: { + // We have unavailable storage storage, sleep and retry + auto sleepDuration = GetSleepDuration(); + LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet: " << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", state storage unavailable, sleeping for " << sleepDuration); + Schedule(sleepDuration, new TEvents::TEvWakeup(RoundCounter)); + return; + } } + } - Become(&TThis::StateFree); // todo: global timeout? + void HandleLookup(TEvents::TEvWakeup::TPtr& ev) { + if (ev->Get()->Tag != RoundCounter) { + return; + } + + BeginNewCycle(); } - void Boot(const TActorContext &ctx) { - Y_ABORT_UNLESS(!LookOnActorID); + void HandleConnectLeader(TEvTabletPipe::TEvClientConnected::TPtr& ev) { + if (ev->Sender != KnownLeaderPipe) { + return; + } - LOG_NOTICE(ctx, NKikimrServices::BOOTSTRAPPER, "tablet: %" PRIu64 ", type: %s, boot", - TabletInfo->TabletID, GetTabletTypeName()); + auto* msg = ev->Get(); - if (FollowerActorID) { - LookOnActorID = FollowerActorID; - FollowerActorID = TActorId(); - ctx.Send(LookOnActorID, new TEvTablet::TEvPromoteToLeader(0, TabletInfo)); - } else { - TTabletSetupInfo *x = BootstrapperInfo->SetupInfo.Get(); - LookOnActorID = x->Tablet(TabletInfo.Get(), ctx.SelfID, ctx, 0, AppData(ctx)->ResourceProfiles); + LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet: " << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", connect: " << msg->Status); + + if (msg->Status != NKikimrProto::OK) { + // Current leader unavailable, begin new round + KnownLeaderPipe = {}; + BeginNewRound(); + return; } - Y_ABORT_UNLESS(LookOnActorID); + LOG_INFO_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet: " << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", connected to leader, waiting"); - Watched.Reset(new TWatched()); + // We have connected to leader, wait until it disconnects + WatchedBy.emplace(); + BootDelayedUntil = {}; + Become(&TThis::StateWatchLeader); - Become(&TThis::StateOwner); + BootFollower(); } - void Handle(TEvTablet::TEvTabletDead::TPtr &ev, const TActorContext &ctx) { - if (ev->Sender == LookOnActorID) { - LOG_INFO(ctx, NKikimrServices::BOOTSTRAPPER, "tablet: %" PRIu64 ", type: %s, tablet dead", - TabletInfo->TabletID, GetTabletTypeName()); + void HandleWatchLeader(TEvBootstrapper::TEvWatch::TPtr& ev) { + const NKikimrBootstrapper::TEvWatch& record = ev->Get()->Record; + Y_ABORT_UNLESS(record.GetTabletID() == TabletInfo->TabletID); - LookOnActorID = TActorId(); - NotifyAndRound(ctx); - } + WatchedBy.value().Add(ev->Sender, record.GetRound()); + Send(ev->Sender, new TEvBootstrapper::TEvWatchResult(record.GetTabletID(), + NKikimrBootstrapper::TEvWatchResult::WAITFOR, 0, record.GetRound())); } - void Stop() { - if (LookOnActorID) { - Send(LookOnActorID, new TEvents::TEvPoisonPill()); - LookOnActorID = TActorId(); + void HandleWatchLeader(TEvTabletPipe::TEvClientDestroyed::TPtr& ev) { + if (ev->Sender != KnownLeaderPipe) { + return; } - if (FollowerActorID) { - Send(FollowerActorID, new TEvents::TEvPoisonPill()); - FollowerActorID = TActorId(); - } + LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet: " << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", disconnected"); + KnownLeaderPipe = {}; NotifyWatchers(); + BeginNewCycle(); + } + +private: + /** + * Begins new voting round between bootstrapper nodes + * - Starts watching all other nodes + * - Based on the cluster state may become owner, watcher or sleeper + * - Owner boots a new leader tablet + * - Watcher waits for notification from some other node + * - Sleeper sleeps before starting a new cycle + */ + void BeginNewRound() { + // Note: make sure notifications from previous states don't interfere + ++RoundCounter; + + if (BootstrapperInfo->OtherNodes.empty()) { + return Boot(); + } + + SelfSeed = GenerateSeed(); + LOG_INFO_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet:" << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", begin new round, seed: " << SelfSeed); + + const ui64 tabletId = TabletInfo->TabletID; - BootDelayedUntil = { }; - Round.Destroy(); + Round.emplace(BootstrapperInfo->OtherNodes.size()); + for (ui32 alienNode : BootstrapperInfo->OtherNodes) { + Send(MakeBootstrapperID(tabletId, alienNode), + new TEvBootstrapper::TEvWatch(tabletId, SelfSeed, RoundCounter), + IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession, + RoundCounter); + } + + Become(&TThis::StateFree); } - void HandlePoison() { - Stop(); - PassAway(); + void HandleFree(TEvBootstrapper::TEvWatch::TPtr& ev) { + const NKikimrBootstrapper::TEvWatch& record = ev->Get()->Record; + Y_ABORT_UNLESS(record.GetTabletID() == TabletInfo->TabletID); + + Send(ev->Sender, new TEvBootstrapper::TEvWatchResult(record.GetTabletID(), + NKikimrBootstrapper::TEvWatchResult::FREE, SelfSeed, record.GetRound())); + + const size_t alienNodeIdx = AlienIndex(ev->Sender.NodeId()); + if (alienNodeIdx == Max()) + return; + + auto& watcher = Round.value().Watchers.at(alienNodeIdx); + watcher.ActorId = ev->Sender; + watcher.Round = record.GetRound(); + + // We may have previously observed some state (e.g. UNKNOWN or DISCONNECTED) + // Since we have received a new TEvWatch afterwards, it implies that node + // has started a new voting round. Make sure we reflect that in our + // current state. + if (ApplyAlienState(ev->Sender, NKikimrBootstrapper::TEvWatchResult::FREE, record.GetSelfSeed(), /* updateOnly */ true)) + return; + + CheckRoundCompletion(); } - void Standby() { - Stop(); - Become(&TThis::StateStandBy); + void HandleFree(TEvBootstrapper::TEvWatchResult::TPtr& ev) { + const NKikimrBootstrapper::TEvWatchResult& record = ev->Get()->Record; + Y_ABORT_UNLESS(record.GetTabletID() == TabletInfo->TabletID); + + if (record.GetRound() != RoundCounter) + return; + + if (ApplyAlienState(ev->Sender, record.GetState(), record.GetSeed())) + return; + + CheckRoundCompletion(); } - void BecomeWatch(const TActorId &watchOn, bool owner, const TActorContext &ctx) { - Y_UNUSED(ctx); + void HandleFree(TEvents::TEvUndelivered::TPtr& ev) { + const ui64 round = ev->Cookie; + LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet: " << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", undelivered from " << ev->Sender << ", round " << round); - BootDelayedUntil = { }; - Round.Destroy(); + if (round != RoundCounter) + return; - LOG_INFO(ctx, NKikimrServices::BOOTSTRAPPER, "tablet: %" PRIu64 ", type: %s, become watch", - TabletInfo->TabletID, GetTabletTypeName()); + if (ApplyAlienState(ev->Sender, NKikimrBootstrapper::TEvWatchResult::UNDELIVERED, Max())) + return; - Watches.Reset(new TWatch()); - Watched.Reset(new TWatched()); + CheckRoundCompletion(); + } - LOG_DEBUG(ctx, NKikimrServices::BOOTSTRAPPER, "tablet: %" PRIu64 ", type: %s, add watched node: %" PRIu32, - TabletInfo->TabletID, GetTabletTypeName(), watchOn.NodeId()); - Watches->Watched.push_back(TWatch::TWatched(watchOn, owner)); + void HandleFree(TEvInterconnect::TEvNodeDisconnected::TPtr& ev) { + const ui32 node = ev->Get()->NodeId; + const ui64 round = ev->Cookie; + LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet: " << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", disconnected from " << node << ", round " << round); - if (BootstrapperInfo->StartFollowers && !FollowerActorID) { - LOG_NOTICE(ctx, NKikimrServices::BOOTSTRAPPER, "tablet: %" PRIu64 ", type: %s, boot follower", - TabletInfo->TabletID, GetTabletTypeName()); - TTabletSetupInfo *x = BootstrapperInfo->SetupInfo.Get(); - FollowerActorID = x->Follower(TabletInfo.Get(), ctx.SelfID, ctx, 0, AppData(ctx)->ResourceProfiles); - } + if (round != RoundCounter) + return; - Become(&TThis::StateWatch); + if (ApplyAlienState(TActorId(node, 0, 0, 0), NKikimrBootstrapper::TEvWatchResult::DISCONNECTED, Max())) + return; + + CheckRoundCompletion(); } - bool ApplyAlienState(const TActorId &alien, NKikimrBootstrapper::TEvWatchResult::EState state, ui64 seed, const TActorContext &ctx) { - const ui32 alienNodeIdx = AlienIndex(alien.NodeId()); - if (alienNodeIdx == Max()) + bool ApplyAlienState(const TActorId& alien, NKikimrBootstrapper::TEvWatchResult::EState state, ui64 seed, bool updateOnly = false) { + const size_t alienNodeIdx = AlienIndex(alien.NodeId()); + if (alienNodeIdx == Max()) return true; - if (Round->Aliens[alienNodeIdx].State != TRound::EAlienState::Wait) - return false; + // Note: a single alien may be updated multiple times + auto& alienEntry = Round.value().Aliens.at(alienNodeIdx); + if (updateOnly && alienEntry.State == TRound::EAlienState::Wait) { + // This update should only be applied after the initial result + return true; + } + + LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet: " << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", apply alien " << alien.NodeId() << " state: " << GetStateName(state)); - LOG_DEBUG(ctx, NKikimrServices::BOOTSTRAPPER, "tablet: %" PRIu64 ", type: %s, apply alien %" PRIu32 " state: %s", - TabletInfo->TabletID, GetTabletTypeName(), alien.NodeId(), GetStateName(state)); + alienEntry.Seed = seed; switch (state) { - case NKikimrBootstrapper::TEvWatchResult::UNKNOWN: - Round->Aliens[alienNodeIdx] = TRound::TAlien(TRound::EAlienState::Unknown, Max()); - return false; - case NKikimrBootstrapper::TEvWatchResult::FREE: - Round->Aliens[alienNodeIdx] = TRound::TAlien(TRound::EAlienState::Free, seed); - return false; - case NKikimrBootstrapper::TEvWatchResult::OWNER: - BecomeWatch(alien, true, ctx); - return true; - case NKikimrBootstrapper::TEvWatchResult::WAITFOR: - BecomeWatch(alien, false, ctx); - return true; - case NKikimrBootstrapper::TEvWatchResult::UNDELIVERED: - Round->Aliens[alienNodeIdx] = TRound::TAlien(TRound::EAlienState::Undelivered, Max()); - return false; - case NKikimrBootstrapper::TEvWatchResult::DISCONNECTED: - Round->Aliens[alienNodeIdx] = TRound::TAlien(TRound::EAlienState::Disconnected, Max()); - return false; - default: - Y_ABORT("unhandled case"); + case NKikimrBootstrapper::TEvWatchResult::UNKNOWN: + alienEntry.State = TRound::EAlienState::Unknown; + return false; + case NKikimrBootstrapper::TEvWatchResult::FREE: + alienEntry.State = TRound::EAlienState::Free; + return false; + case NKikimrBootstrapper::TEvWatchResult::OWNER: + BecomeWatch(alien, true); + return true; + case NKikimrBootstrapper::TEvWatchResult::WAITFOR: + BecomeWatch(alien, false); + return true; + case NKikimrBootstrapper::TEvWatchResult::UNDELIVERED: + alienEntry.State = TRound::EAlienState::Undelivered; + return false; + case NKikimrBootstrapper::TEvWatchResult::DISCONNECTED: + alienEntry.State = TRound::EAlienState::Disconnected; + return false; + default: + Y_ABORT("unhandled case"); } } - bool CheckBootPermitted(size_t undelivered, size_t disconnected, const TActorContext &ctx) { + void CheckRoundCompletion() { + ui64 winnerSeed = SelfSeed; + ui32 winner = SelfId().NodeId(); + + size_t undelivered = 0; + size_t disconnected = 0; + auto& round = Round.value(); + for (size_t i = 0, e = round.Aliens.size(); i != e; ++i) { + const auto& alien = round.Aliens[i]; + const ui32 node = BootstrapperInfo->OtherNodes.at(i); + switch (alien.State) { + case TRound::EAlienState::Wait: + return; + case TRound::EAlienState::Unknown: + break; + case TRound::EAlienState::Free: + if (winnerSeed > alien.Seed || winnerSeed == alien.Seed && winner > node) { + winnerSeed = alien.Seed; + winner = node; + } + break; + case TRound::EAlienState::Undelivered: + ++undelivered; + break; + case TRound::EAlienState::Disconnected: + ++disconnected; + break; + } + } + + if (winner != SelfId().NodeId()) { + auto sleepDuration = GetSleepDuration(); + LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet: " << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", lost round, wait for " << sleepDuration); + + Round.reset(); + Schedule(sleepDuration, new TEvents::TEvWakeup(RoundCounter)); + Become(&TThis::StateSleep); + return; + } + + if (!CheckBootPermitted(undelivered, disconnected)) { + return; + } + + // Note: current Round is used by Boot to update watchers + Boot(); + } + + bool CheckBootPermitted(size_t undelivered, size_t disconnected) { // Total number of nodes that participate in tablet booting size_t total = 1 + BootstrapperInfo->OtherNodes.size(); Y_DEBUG_ABORT_UNLESS(total >= 1 + undelivered + disconnected); @@ -354,201 +502,288 @@ class TBootstrapper : public TActor { // If there are enough nodes online, just boot immediately if (online >= quorum) { - BootDelayedUntil = { }; + BootDelayedUntil = {}; return true; } - auto now = ctx.Now(); + auto now = TActivationContext::Now(); if (!BootDelayedUntil) { // Delay boot decision until some later time BootDelayedUntil = now + BootstrapperInfo->OfflineDelay; } else if (BootDelayedUntil <= now) { // We don't have enough online nodes, but try to boot anyway - BootDelayedUntil = { }; + BootDelayedUntil = {}; return true; } - LOG_DEBUG(ctx, NKikimrServices::BOOTSTRAPPER, "tablet: %" PRIu64 ", type: %s, %" PRISZT "/%" PRISZT " nodes online (need %" PRISZT "), wait for threshold", - TabletInfo->TabletID, GetTabletTypeName(), online, total, quorum); + auto sleepDuration = Min(GetSleepDuration(), BootDelayedUntil - now); + LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet:" << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", " << online << "/" << total << " nodes online (need " << quorum << ")" + << ", wait for " << sleepDuration); - const ui64 wx = BootstrapperInfo->WatchThreshold.MicroSeconds(); - const auto sleepDuration = TDuration::MicroSeconds(wx / 2 + wx * (SelfSeed % 0x10000) / 0x20000); - - ctx.ExecutorThread.ActorSystem->Schedule( - Min(sleepDuration, BootDelayedUntil - now), - new IEventHandle(ctx.SelfID, ctx.SelfID, new TEvents::TEvWakeup(), 0, RoundCounter)); + Round.reset(); + Schedule(sleepDuration, new TEvents::TEvWakeup(RoundCounter)); Become(&TThis::StateSleep); return false; } - void CheckRoundCompletion(const TActorContext &ctx) { - ui64 minAlienSeed = Max(); - ui32 minAlien = Max(); - size_t undelivered = 0; - size_t disconnected = 0; - for (ui32 i = 0, e = Round->Aliens.size(); i != e; ++i) { - const TRound::TAlien &alien = Round->Aliens[i]; - switch (alien.State) { - case TRound::EAlienState::Wait: - return; - case TRound::EAlienState::Unknown: - break; - case TRound::EAlienState::Free: - if (minAlienSeed > alien.Seed) { - minAlienSeed = alien.Seed; - minAlien = BootstrapperInfo->OtherNodes[i]; - } - break; - case TRound::EAlienState::Undelivered: - ++undelivered; - break; - case TRound::EAlienState::Disconnected: - ++disconnected; - break; - } - } +private: + /** + * Starts a watcher phase + * - Starts an optional follower + * - Waits for notification from some other owner or watcher + * - When watched by others will notify on state change + * - Begins a new cycle when notified or disconnected + */ + void BecomeWatch(const TActorId& watchOn, bool owner) { + LOG_INFO_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet: " << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", become watch on node " << watchOn.NodeId() << (owner ? " (owner)" : "")); - Round.Destroy(); + Watching.emplace(watchOn, owner); + WatchedBy.emplace(); - // ok, we got all reactions, now boot tablet or sleep for threshold - if (minAlienSeed < SelfSeed || minAlienSeed == SelfSeed && ctx.SelfID.NodeId() > minAlien) { - LOG_DEBUG(ctx, NKikimrServices::BOOTSTRAPPER, "tablet: %" PRIu64 ", type: %s, lost round, wait for threshold", - TabletInfo->TabletID, GetTabletTypeName()); + FinishRound(NKikimrBootstrapper::TEvWatchResult::WAITFOR); - const ui64 wx = BootstrapperInfo->WatchThreshold.MicroSeconds(); - const auto sleepDuration = TDuration::MicroSeconds(wx / 2 + wx * (SelfSeed % 0x10000) / 0x20000); - - Become(&TThis::StateSleep); - ctx.ExecutorThread.ActorSystem->Schedule(sleepDuration, new IEventHandle(ctx.SelfID, ctx.SelfID, new TEvents::TEvWakeup(), 0, RoundCounter)); - return; - } else if (!CheckBootPermitted(undelivered, disconnected, ctx)) { - return; - } else { - Boot(ctx); - return; - } + Become(&TThis::StateWatch); + BootFollower(); } - void NotifyWatchers() { - if (Watched) { - for (const TWatched::TWatcher &xw : Watched->Watchers) - Send(xw.ActorID, new TEvBootstrapper::TEvNotify(TabletInfo->TabletID, NKikimrBootstrapper::TEvNotify::DROP, xw.Round)); - Watched.Destroy(); - Watches.Destroy(); + void FinishRound(NKikimrBootstrapper::TEvWatchResult::EState state) { + if (Round) { + for (auto& watcher : Round->Watchers) { + if (watcher.ActorId) { + Send(watcher.ActorId, new TEvBootstrapper::TEvWatchResult( + TabletInfo->TabletID, state, 0, watcher.Round)); + WatchedBy.value().Add(watcher.ActorId, watcher.Round); + } + } + + BootDelayedUntil = {}; + Round.reset(); } } - void NotifyAndRound(const TActorContext &ctx) { - NotifyWatchers(); - BeginNewRound(ctx); + void BootFollower() { + if (BootstrapperInfo->StartFollowers && !FollowerActorID) { + LOG_NOTICE_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet: " << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", boot follower"); + TTabletSetupInfo* x = BootstrapperInfo->SetupInfo.Get(); + FollowerActorID = x->Follower(TabletInfo.Get(), + SelfId(), TActivationContext::ActorContextFor(SelfId()), + 0, AppData()->ResourceProfiles); + } } - void HandleFree(TEvBootstrapper::TEvWatchResult::TPtr &ev, const TActorContext &ctx) { - const NKikimrBootstrapper::TEvWatchResult &record = ev->Get()->Record; + void HandleWatch(TEvBootstrapper::TEvWatch::TPtr& ev) { + const NKikimrBootstrapper::TEvWatch& record = ev->Get()->Record; Y_ABORT_UNLESS(record.GetTabletID() == TabletInfo->TabletID); - if (record.GetRound() != RoundCounter) + if (Watching.value().OnWatch(ev->Sender.NodeId())) { + // We have been watching this node, but now it's trying to watch us + // This guards against notify/disconnect getting lost (shouldn't happen) + NotifyWatchers(); + Send(ev->Sender, new TEvBootstrapper::TEvWatchResult(record.GetTabletID(), + NKikimrBootstrapper::TEvWatchResult::UNKNOWN, 0, record.GetRound())); + BeginNewCycle(); return; + } - if (ApplyAlienState(ev->Sender, record.GetState(), record.GetSeed(), ctx)) + WatchedBy.value().Add(ev->Sender, record.GetRound()); + Send(ev->Sender, new TEvBootstrapper::TEvWatchResult(record.GetTabletID(), + NKikimrBootstrapper::TEvWatchResult::WAITFOR, 0, record.GetRound())); + } + + void HandleWatch(TEvBootstrapper::TEvNotify::TPtr& ev) { + if (ev->Get()->Record.GetRound() != RoundCounter) return; - CheckRoundCompletion(ctx); + if (Watching.value().OnNotify(ev->Sender)) { + NotifyWatchers(); + BeginNewCycle(); + } } - void HandleFree(TEvents::TEvUndelivered::TPtr &ev, const TActorContext &ctx) { + void HandleWatch(TEvInterconnect::TEvNodeDisconnected::TPtr& ev) { + const ui32 node = ev->Get()->NodeId; const ui64 round = ev->Cookie; - LOG_DEBUG(ctx, NKikimrServices::BOOTSTRAPPER, "tablet: %" PRIu64 ", type: %s, undelivered from %s, round %" PRIu64, - TabletInfo->TabletID, GetTabletTypeName(), ev->Sender.ToString().c_str(), round); + LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet: " << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", disconnected from " << node << ", round " << round); if (round != RoundCounter) return; - if (ApplyAlienState(ev->Sender, NKikimrBootstrapper::TEvWatchResult::UNDELIVERED, Max(), ctx)) - return; - - CheckRoundCompletion(ctx); + if (Watching.value().OnDisconnect(node)) { + NotifyWatchers(); + BeginNewCycle(); + } } - void HandleFree(TEvInterconnect::TEvNodeDisconnected::TPtr &ev, const TActorContext &ctx) { - const ui32 node = ev->Get()->NodeId; - LOG_DEBUG(ctx, NKikimrServices::BOOTSTRAPPER, "tablet: %" PRIu64 ", type: %s, disconnected from %" PRIu32, - TabletInfo->TabletID, GetTabletTypeName(), node); - - if (ApplyAlienState(TActorId(node, 0, 0, 0), NKikimrBootstrapper::TEvWatchResult::DISCONNECTED, Max(), ctx)) - return; + void NotifyWatchers() { + if (WatchedBy) { + for (const auto& pr : *WatchedBy) { + Send(pr.second.ActorId, + new TEvBootstrapper::TEvNotify( + TabletInfo->TabletID, NKikimrBootstrapper::TEvNotify::DROP, pr.second.Round)); + } - CheckRoundCompletion(ctx); + WatchedBy.reset(); + Watching.reset(); + } } - void HandleFree(TEvBootstrapper::TEvWatch::TPtr &ev, const TActorContext &ctx) { - const NKikimrBootstrapper::TEvWatch &record = ev->Get()->Record; - Y_ABORT_UNLESS(record.GetTabletID() == TabletInfo->TabletID); +private: + /** + * Starts an owner phase + * - Boots a new leader tablet (or promotes an existing follower) + * - When watched by others will notify on state change + * - Waits until the new instance stops + * - Begins a new cycle + */ + void Boot() { + Y_ABORT_UNLESS(!LookOnActorID); - ctx.Send(ev->Sender, new TEvBootstrapper::TEvWatchResult(record.GetTabletID(), NKikimrBootstrapper::TEvWatchResult::FREE, SelfSeed, record.GetRound())); - } + LOG_NOTICE_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet: " << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", boot"); - void HandleWatch(TEvBootstrapper::TEvNotify::TPtr &ev, const TActorContext &ctx) { - const TActorId alien = ev->Sender; - if (Watches->RemoveAlien(alien)) { - NotifyAndRound(ctx); + if (FollowerActorID) { + LookOnActorID = FollowerActorID; + FollowerActorID = {}; + Send(LookOnActorID, new TEvTablet::TEvPromoteToLeader(0, TabletInfo)); + } else { + TTabletSetupInfo* x = BootstrapperInfo->SetupInfo.Get(); + LookOnActorID = x->Tablet(TabletInfo.Get(), + SelfId(), TActivationContext::ActorContextFor(SelfId()), + 0, AppData()->ResourceProfiles); } - } - void HandleWatch(TEvInterconnect::TEvNodeDisconnected::TPtr &ev, const TActorContext &ctx) { - const ui32 node = ev->Get()->NodeId; - LOG_DEBUG(ctx, NKikimrServices::BOOTSTRAPPER, "tablet: %" PRIu64 ", type: %s, disconnected from %" PRIu32, - TabletInfo->TabletID, GetTabletTypeName(), node); + Y_ABORT_UNLESS(LookOnActorID); - if (Watches->RemoveAlienNode(node)) { - NotifyAndRound(ctx); - } + WatchedBy.emplace(); + FinishRound(NKikimrBootstrapper::TEvWatchResult::OWNER); + + Become(&TThis::StateOwner); } - void HandleOwner(TEvBootstrapper::TEvWatch::TPtr &ev, const TActorContext &ctx) { - const NKikimrBootstrapper::TEvWatch &record = ev->Get()->Record; + void HandleOwner(TEvBootstrapper::TEvWatch::TPtr& ev) { + const NKikimrBootstrapper::TEvWatch& record = ev->Get()->Record; Y_ABORT_UNLESS(record.GetTabletID() == TabletInfo->TabletID); // add to watchers list (if node not already there) - Watched->AddWatcher(ev->Sender, record.GetRound()); - ctx.Send(ev->Sender, new TEvBootstrapper::TEvWatchResult(record.GetTabletID(), NKikimrBootstrapper::TEvWatchResult::OWNER, 0, record.GetRound())); + WatchedBy.value().Add(ev->Sender, record.GetRound()); + Send(ev->Sender, new TEvBootstrapper::TEvWatchResult(record.GetTabletID(), + NKikimrBootstrapper::TEvWatchResult::OWNER, 0, record.GetRound())); } - void HandleWatch(TEvBootstrapper::TEvWatch::TPtr &ev, const TActorContext &ctx) { - const NKikimrBootstrapper::TEvWatch &record = ev->Get()->Record; - Y_ABORT_UNLESS(record.GetTabletID() == TabletInfo->TabletID); - - if (Watches->CheckWatch(ev->Sender.NodeId())) { - ctx.Send(ev->Sender, new TEvBootstrapper::TEvWatchResult(record.GetTabletID(), NKikimrBootstrapper::TEvWatchResult::UNKNOWN, 0, record.GetRound())); - } else { - Watched->AddWatcher(ev->Sender, record.GetRound()); - ctx.Send(ev->Sender, new TEvBootstrapper::TEvWatchResult(record.GetTabletID(), NKikimrBootstrapper::TEvWatchResult::WAITFOR, 0, record.GetRound())); + void Handle(TEvTablet::TEvTabletDead::TPtr& ev) { + if (ev->Sender == LookOnActorID) { + LOG_INFO_S(*TlsActivationContext, NKikimrServices::BOOTSTRAPPER, + "tablet: " << TabletInfo->TabletID << ", type: " << GetTabletTypeName() + << ", tablet dead"); + + LookOnActorID = {}; + NotifyWatchers(); + BeginNewCycle(); + } else if (ev->Sender == FollowerActorID) { + FollowerActorID = {}; + BootFollower(); } } - void HandleSleep(TEvBootstrapper::TEvWatch::TPtr &ev, const TActorContext &ctx) { - HandleFree(ev, ctx); +private: + void HandleSleep(TEvBootstrapper::TEvWatch::TPtr& ev) { + const NKikimrBootstrapper::TEvWatch& record = ev->Get()->Record; + Y_ABORT_UNLESS(record.GetTabletID() == TabletInfo->TabletID); + + // We have lost the round and are not going to boot right now + // However make sure our round seed is somewhat stable while sleeping + Send(ev->Sender, new TEvBootstrapper::TEvWatchResult(record.GetTabletID(), + NKikimrBootstrapper::TEvWatchResult::FREE, SelfSeed, record.GetRound())); } - void HandleSleep(TEvents::TEvWakeup::TPtr &ev, const TActorContext &ctx) { - const ui64 roundCookie = ev->Cookie; - if (roundCookie != RoundCounter) + void HandleSleep(TEvents::TEvWakeup::TPtr& ev) { + if (ev->Get()->Tag != RoundCounter) return; - BeginNewRound(ctx); + BeginNewCycle(); + } + +private: + /** + * Switches bootstrapper to a cold standby + * - Stops all activity (including followers) + * - Waits until explicitly activated + */ + void Standby() { + Y_ABORT_UNLESS(!ModeStandby); + Stop(); + ModeStandby = true; + Become(&TThis::StateStandBy); + } + + /** + * Activates a cold standby and begins a new cycle + */ + void Activate() { + Y_ABORT_UNLESS(ModeStandby); + ModeStandby = false; + OnActivated(); + BeginNewCycle(); + } + + void OnActivated() { + auto localNodeId = SelfId().NodeId(); + auto whiteboardId = NNodeWhiteboard::MakeNodeWhiteboardServiceId(localNodeId); + Send(whiteboardId, new NNodeWhiteboard::TEvWhiteboard::TEvSystemStateAddRole("Bootstrapper")); } - void HandleStandBy(TEvBootstrapper::TEvWatch::TPtr &ev, const TActorContext &ctx) { - const NKikimrBootstrapper::TEvWatch &record = ev->Get()->Record; + void HandleStandBy(TEvBootstrapper::TEvWatch::TPtr& ev) { + const NKikimrBootstrapper::TEvWatch& record = ev->Get()->Record; Y_ABORT_UNLESS(record.GetTabletID() == TabletInfo->TabletID); - ctx.Send(ev->Sender, new TEvBootstrapper::TEvWatchResult(record.GetTabletID(), NKikimrBootstrapper::TEvWatchResult::UNKNOWN, Max(), record.GetRound())); + Send(ev->Sender, new TEvBootstrapper::TEvWatchResult(record.GetTabletID(), + NKikimrBootstrapper::TEvWatchResult::UNKNOWN, Max(), record.GetRound())); } void HandlePoisonStandBy() { PassAway(); } +private: + /** + * Common cleanup that may stop any phase + */ + void Stop() { + if (KnownLeaderPipe) { + NTabletPipe::CloseClient(SelfId(), KnownLeaderPipe); + KnownLeaderPipe = {}; + } + + if (LookOnActorID) { + Send(LookOnActorID, new TEvents::TEvPoisonPill()); + LookOnActorID = {}; + } + + if (FollowerActorID) { + Send(FollowerActorID, new TEvents::TEvPoisonPill()); + FollowerActorID = {}; + } + + NotifyWatchers(); + + BootDelayedUntil = {}; + Round.reset(); + } + + void HandlePoison() { + Stop(); + PassAway(); + } + void PassAway() override { for (ui32 nodeId : BootstrapperInfo->OtherNodes) { Send(TActivationContext::InterconnectProxy(nodeId), new TEvents::TEvUnsubscribe); @@ -562,52 +797,72 @@ class TBootstrapper : public TActor { return NKikimrServices::TActivity::TABLET_BOOTSTRAPPER; } - TBootstrapper(TTabletStorageInfo *tabletInfo, TBootstrapperInfo *bootstrapperInfo, bool standby) - : TActor(standby ? &TThis::StateStandBy : &TThis::StateBoot) - , TabletInfo(tabletInfo) + TBootstrapper(TTabletStorageInfo* tabletInfo, TBootstrapperInfo* bootstrapperInfo, bool standby) + : TabletInfo(tabletInfo) , BootstrapperInfo(bootstrapperInfo) - , RoundCounter(0xdeadbeefdeadbeefull) - , SelfSeed(0xdeadbeefdeadbeefull) + , ModeStandby(standby) { Y_ABORT_UNLESS(TTabletTypes::TypeInvalid != TabletInfo->TabletType); } - TAutoPtr AfterRegister(const TActorId &selfId, const TActorId &parentId) override { - Y_UNUSED(parentId); - return new IEventHandle(selfId, selfId, new TEvents::TEvBootstrap()); + void Bootstrap() { + if (ModeStandby) { + Become(&TThis::StateStandBy); + } else { + OnActivated(); + BeginNewCycle(); + } } - STFUNC(StateBoot) { + STFUNC(StateLookup) { switch (ev->GetTypeRewrite()) { - CFunc(TEvents::TSystem::Bootstrap, BeginNewRound); + hFunc(TEvBootstrapper::TEvWatch, HandleUnknown); + hFunc(TEvStateStorage::TEvInfo, HandleLookup); + hFunc(TEvTablet::TEvTabletDead, Handle); + hFunc(TEvents::TEvWakeup, HandleLookup); + cFunc(TEvents::TSystem::PoisonPill, HandlePoison); // => die + cFunc(TEvBootstrapper::EvStandBy, Standby); } } - STFUNC(StateFree) { + STFUNC(StateConnectLeader) { switch (ev->GetTypeRewrite()) { - HFunc(TEvBootstrapper::TEvWatchResult, HandleFree); // => noop|sleep|owner|watch - HFunc(TEvBootstrapper::TEvWatch, HandleFree); // => reply - HFunc(TEvents::TEvUndelivered, HandleFree); // => watchresult with unknown + hFunc(TEvBootstrapper::TEvWatch, HandleUnknown); + hFunc(TEvTabletPipe::TEvClientConnected, HandleConnectLeader); + hFunc(TEvTablet::TEvTabletDead, Handle); cFunc(TEvents::TSystem::PoisonPill, HandlePoison); // => die - HFunc(TEvInterconnect::TEvNodeDisconnected, HandleFree); // => watchresult with unknown cFunc(TEvBootstrapper::EvStandBy, Standby); } } - STFUNC(StateSleep) { + STFUNC(StateWatchLeader) { switch (ev->GetTypeRewrite()) { - HFunc(TEvBootstrapper::TEvWatch, HandleSleep); - HFunc(TEvents::TEvWakeup, HandleSleep); - cFunc(TEvents::TSystem::PoisonPill, HandlePoison); + hFunc(TEvBootstrapper::TEvWatch, HandleWatchLeader); + hFunc(TEvTabletPipe::TEvClientDestroyed, HandleWatchLeader); + hFunc(TEvTablet::TEvTabletDead, Handle); + cFunc(TEvents::TSystem::PoisonPill, HandlePoison); // => die + cFunc(TEvBootstrapper::EvStandBy, Standby); + } + } + + STFUNC(StateFree) { + switch (ev->GetTypeRewrite()) { + hFunc(TEvBootstrapper::TEvWatch, HandleFree); // => reply + hFunc(TEvBootstrapper::TEvWatchResult, HandleFree); // => noop|sleep|owner|watch + hFunc(TEvents::TEvUndelivered, HandleFree); // => watchresult with unknown + hFunc(TEvInterconnect::TEvNodeDisconnected, HandleFree); // => watchresult with unknown + hFunc(TEvTablet::TEvTabletDead, Handle); + cFunc(TEvents::TSystem::PoisonPill, HandlePoison); // => die cFunc(TEvBootstrapper::EvStandBy, Standby); } } STFUNC(StateWatch) { switch (ev->GetTypeRewrite()) { - HFunc(TEvBootstrapper::TEvWatch, HandleWatch); - HFunc(TEvBootstrapper::TEvNotify, HandleWatch); - HFunc(TEvInterconnect::TEvNodeDisconnected, HandleWatch); + hFunc(TEvBootstrapper::TEvWatch, HandleWatch); + hFunc(TEvBootstrapper::TEvNotify, HandleWatch); + hFunc(TEvInterconnect::TEvNodeDisconnected, HandleWatch); + hFunc(TEvTablet::TEvTabletDead, Handle); cFunc(TEvents::TSystem::PoisonPill, HandlePoison); cFunc(TEvBootstrapper::EvStandBy, Standby); } @@ -615,8 +870,18 @@ class TBootstrapper : public TActor { STFUNC(StateOwner) { switch (ev->GetTypeRewrite()) { - HFunc(TEvBootstrapper::TEvWatch, HandleOwner); - HFunc(TEvTablet::TEvTabletDead, Handle); + hFunc(TEvBootstrapper::TEvWatch, HandleOwner); + hFunc(TEvTablet::TEvTabletDead, Handle); + cFunc(TEvents::TSystem::PoisonPill, HandlePoison); + cFunc(TEvBootstrapper::EvStandBy, Standby); + } + } + + STFUNC(StateSleep) { + switch (ev->GetTypeRewrite()) { + hFunc(TEvBootstrapper::TEvWatch, HandleSleep); + hFunc(TEvents::TEvWakeup, HandleSleep); + hFunc(TEvTablet::TEvTabletDead, Handle); cFunc(TEvents::TSystem::PoisonPill, HandlePoison); cFunc(TEvBootstrapper::EvStandBy, Standby); } @@ -624,14 +889,14 @@ class TBootstrapper : public TActor { STFUNC(StateStandBy) { switch (ev->GetTypeRewrite()) { - HFunc(TEvBootstrapper::TEvWatch, HandleStandBy); - CFunc(TEvBootstrapper::EvActivate, BeginNewRound); + hFunc(TEvBootstrapper::TEvWatch, HandleStandBy); + cFunc(TEvBootstrapper::EvActivate, Activate); cFunc(TEvents::TSystem::PoisonPill, HandlePoisonStandBy); } } }; -IActor* CreateBootstrapper(TTabletStorageInfo *tabletInfo, TBootstrapperInfo *bootstrapperInfo, bool standby) { +IActor* CreateBootstrapper(TTabletStorageInfo* tabletInfo, TBootstrapperInfo* bootstrapperInfo, bool standby) { return new TBootstrapper(tabletInfo, bootstrapperInfo, standby); } diff --git a/ydb/core/tablet/bootstrapper.h b/ydb/core/tablet/bootstrapper.h index 06aaa51bf304..9ec9f7dee9b1 100644 --- a/ydb/core/tablet/bootstrapper.h +++ b/ydb/core/tablet/bootstrapper.h @@ -38,7 +38,7 @@ struct TBootstrapperInfo : public TThrRefBase { TDuration OfflineDelay; bool StartFollowers; - TBootstrapperInfo(TTabletSetupInfo *setupInfo) + TBootstrapperInfo(TTabletSetupInfo* setupInfo) : SetupInfo(setupInfo) , WatchThreshold(TDuration::MilliSeconds(200)) , OfflineDelay(TDuration::Seconds(3)) @@ -46,7 +46,7 @@ struct TBootstrapperInfo : public TThrRefBase { {} }; -IActor* CreateBootstrapper(TTabletStorageInfo *tabletInfo, TBootstrapperInfo *bootstrapperInfo, bool standby = false); +IActor* CreateBootstrapper(TTabletStorageInfo* tabletInfo, TBootstrapperInfo* bootstrapperInfo, bool standby = false); TActorId MakeBootstrapperID(ui64 tablet, ui32 node); } diff --git a/ydb/core/tablet/bootstrapper_impl.h b/ydb/core/tablet/bootstrapper_impl.h new file mode 100644 index 000000000000..7d046f95b953 --- /dev/null +++ b/ydb/core/tablet/bootstrapper_impl.h @@ -0,0 +1,44 @@ +#pragma once +#include "bootstrapper.h" +#include + +namespace NKikimr { + +struct TEvBootstrapper::TEvWatch : public TEventPB { + TEvWatch() + {} + + TEvWatch(ui64 tabletId, ui64 selfSeed, ui64 round) + { + Record.SetTabletID(tabletId); + Record.SetSelfSeed(selfSeed); + Record.SetRound(round); + } +}; + +struct TEvBootstrapper::TEvWatchResult : public TEventPB { + TEvWatchResult() + {} + + TEvWatchResult(ui64 tabletId, NKikimrBootstrapper::TEvWatchResult::EState state, ui64 seed, ui64 round) + { + Record.SetTabletID(tabletId); + Record.SetState(state); + Record.SetSeed(seed); + Record.SetRound(round); + } +}; + +struct TEvBootstrapper::TEvNotify : public TEventPB { + TEvNotify() + {} + + TEvNotify(ui64 tabletId, NKikimrBootstrapper::TEvNotify::EOp op, ui64 round) + { + Record.SetTabletID(tabletId); + Record.SetOp(op); + Record.SetRound(round); + } +}; + +} // namespace NKikimr diff --git a/ydb/core/tablet/bootstrapper_ut.cpp b/ydb/core/tablet/bootstrapper_ut.cpp new file mode 100644 index 000000000000..eb3f22ec34b9 --- /dev/null +++ b/ydb/core/tablet/bootstrapper_ut.cpp @@ -0,0 +1,428 @@ +#include "bootstrapper.h" +#include "bootstrapper_impl.h" +#include +#include +#include +#include +#include +#include +#include + +namespace NKikimr { + +Y_UNIT_TEST_SUITE(BootstrapperTest) { + + class TSimpleTablet + : public TActor + , public NTabletFlatExecutor::TTabletExecutedFlat + { + public: + TSimpleTablet(const TActorId &tablet, TTabletStorageInfo *info) + : TActor(&TThis::StateInit) + , TTabletExecutedFlat(info, tablet, nullptr) + {} + + private: + void DefaultSignalTabletActive(const TActorContext&) override { + // must be empty + } + + void OnActivateExecutor(const TActorContext& ctx) override { + Become(&TThis::StateWork); + SignalTabletActive(ctx); + } + + void OnDetach(const TActorContext &ctx) override { + return Die(ctx); + } + + void OnTabletDead(TEvTablet::TEvTabletDead::TPtr &, const TActorContext &ctx) override { + return Die(ctx); + } + + void Handle(TEvents::TEvPing::TPtr& ev) { + Send(ev->Sender, new TEvents::TEvPong); + } + + STFUNC(StateInit) { + StateInitImpl(ev, SelfId()); + } + + STFUNC(StateWork) { + switch (ev->GetTypeRewrite()) { + hFunc(TEvents::TEvPing, Handle); + default: + HandleDefaultEvents(ev, SelfId()); + } + } + }; + + TIntrusivePtr CreateSimpleTabletStorageInfo(ui64 tabletId = TTestTxConfig::TxTablet0) { + return CreateTestTabletInfo(tabletId, TTabletTypes::Dummy); + } + + TIntrusivePtr CreateSimpleTabletSetupInfo() { + return MakeIntrusive( + [](const TActorId& tablet, TTabletStorageInfo* info) { + return new TSimpleTablet(tablet, info); + }, + TMailboxType::Simple, + ui32(0), + TMailboxType::Simple, + ui32(0)); + } + + TActorId StartSimpleTablet( + TTestActorRuntime& runtime, + const TActorId& launcher = {}, + ui32 nodeIdx = 0, + ui64 tabletId = TTestTxConfig::TxTablet0) + { + auto tabletInfo = CreateSimpleTabletStorageInfo(tabletId); + auto setupInfo = CreateSimpleTabletSetupInfo(); + auto actor = runtime.Register( + CreateTablet(launcher, tabletInfo.Get(), setupInfo.Get(), /* generation */ 0), + nodeIdx); + runtime.EnableScheduleForActor(actor); + return actor; + } + + std::vector StartSimpleTabletBootstrappers( + TTestActorRuntime& runtime, + const std::vector& nodeIdxs, + ui64 tabletId = TTestTxConfig::TxTablet0) + { + std::vector boots; + auto tabletInfo = CreateSimpleTabletStorageInfo(tabletId); + auto setupInfo = CreateSimpleTabletSetupInfo(); + for (ui32 nodeIdx : nodeIdxs) { + auto bootInfo = MakeIntrusive(setupInfo.Get()); + for (ui32 otherNodeIdx : nodeIdxs) { + if (otherNodeIdx != nodeIdx) { + bootInfo->OtherNodes.push_back(runtime.GetNodeId(otherNodeIdx)); + } + } + boots.push_back(runtime.Register(CreateBootstrapper(tabletInfo.Get(), bootInfo.Get()), nodeIdx)); + runtime.EnableScheduleForActor(boots.back()); + // Make this bootstrapper discoverable by tablet id / node id + runtime.RegisterService( + MakeBootstrapperID(tabletId, runtime.GetNodeId(nodeIdx)), + boots.back(), + nodeIdx); + } + return boots; + } + + TActorId StartSimpleTabletBootstrapper( + TTestActorRuntime& runtime, + ui32 nodeIdx = 0, + ui64 tabletId = TTestTxConfig::TxTablet0) + { + return StartSimpleTabletBootstrappers(runtime, {nodeIdx}, tabletId).at(0); + } + + Y_UNIT_TEST(LoneBootstrapper) { + TTestBasicRuntime runtime; + SetupTabletServices(runtime); + auto sender = runtime.AllocateEdgeActor(); + + StartSimpleTabletBootstrapper(runtime); + + ui32 gen1; + auto client = runtime.ConnectToPipe(TTestTxConfig::TxTablet0, sender, 0, NTabletPipe::TClientRetryPolicy::WithRetries()); + { + Cerr << "... waiting for pipe to connect" << Endl; + auto ev = runtime.GrabEdgeEventRethrow(sender); + UNIT_ASSERT_VALUES_EQUAL(ev->Get()->Status, NKikimrProto::OK); + gen1 = ev->Get()->Generation; + } + Cerr << "... stopping current instance" << Endl; + runtime.SendToPipe(client, sender, new TEvents::TEvPoison); + { + Cerr << "... waiting for pipe to disconnect" << Endl; + auto ev = runtime.GrabEdgeEventRethrow(sender); + } + + ui32 gen2; + client = runtime.ConnectToPipe(TTestTxConfig::TxTablet0, sender, 0, NTabletPipe::TClientRetryPolicy::WithRetries()); + { + Cerr << "... waiting for pipe to connect" << Endl; + auto ev = runtime.GrabEdgeEventRethrow(sender); + UNIT_ASSERT_VALUES_EQUAL(ev->Get()->Status, NKikimrProto::OK); + gen2 = ev->Get()->Generation; + } + + UNIT_ASSERT_C(gen1 < gen2, "Unexpected gen1# " << gen1 << " not before gen2# " << gen2); + } + + Y_UNIT_TEST(KeepExistingTablet) { + TTestBasicRuntime runtime(2); + SetupTabletServices(runtime); + + auto launcher = runtime.AllocateEdgeActor(1); + auto instance = StartSimpleTablet(runtime, launcher, 1); + + bool dead = false; + auto deadObserver = runtime.AddObserver( + [&](TEvTablet::TEvTabletDead::TPtr& ev) { + if (ev->Sender == instance) { + dead = true; + } + }); + + auto sender = runtime.AllocateEdgeActor(); + + ui32 gen1; + auto client = runtime.ConnectToPipe(TTestTxConfig::TxTablet0, sender, 0, NTabletPipe::TClientRetryPolicy::WithRetries()); + { + Cerr << "... waiting for pipe to connect" << Endl; + auto ev = runtime.GrabEdgeEventRethrow(sender); + UNIT_ASSERT_VALUES_EQUAL(ev->Get()->Status, NKikimrProto::OK); + gen1 = ev->Get()->Generation; + } + + StartSimpleTabletBootstrapper(runtime); + + Cerr << "... sleeping (original instance should be preserved)" << Endl; + runtime.SimulateSleep(TDuration::Seconds(1)); + UNIT_ASSERT_C(!dead, "The original instance should be preserved"); + + runtime.Send(new IEventHandle(instance, launcher, new TEvents::TEvPoison), 1); + runtime.WaitFor("original instance to stop", [&]{ return dead; }); + + ui32 gen2; + client = runtime.ConnectToPipe(TTestTxConfig::TxTablet0, sender, 0, NTabletPipe::TClientRetryPolicy::WithRetries()); + { + Cerr << "... waiting for pipe to connect" << Endl; + auto ev = runtime.GrabEdgeEventRethrow(sender); + UNIT_ASSERT_VALUES_EQUAL(ev->Get()->Status, NKikimrProto::OK); + gen2 = ev->Get()->Generation; + } + + UNIT_ASSERT_C(gen1 < gen2, "Unexpected gen1# " << gen1 << " not before gen2# " << gen2); + } + + Y_UNIT_TEST(RestartUnavailableTablet) { + TTestBasicRuntime runtime(3); + SetupTabletServices(runtime); + + auto launcher = runtime.AllocateEdgeActor(1); + auto instance = StartSimpleTablet(runtime, launcher, 1); + Y_UNUSED(instance); + + auto sender = runtime.AllocateEdgeActor(); + + ui32 gen1; + auto client = runtime.ConnectToPipe(TTestTxConfig::TxTablet0, sender, 0, NTabletPipe::TClientRetryPolicy::WithRetries()); + { + Cerr << "... waiting for pipe to connect" << Endl; + auto ev = runtime.GrabEdgeEventRethrow(sender); + UNIT_ASSERT_VALUES_EQUAL(ev->Get()->Status, NKikimrProto::OK); + UNIT_ASSERT_VALUES_EQUAL(ev->Get()->ServerId.NodeId(), runtime.GetNodeId(1)); + gen1 = ev->Get()->Generation; + } + + TBlockEvents blockedConnect(runtime, + [&](const auto& ev) { + return ev->Get()->Record.GetTabletId() == TTestTxConfig::TxTablet0; + }); + + StartSimpleTabletBootstrapper(runtime, 2); + + runtime.WaitFor("blocked connect attempt", [&]{ return blockedConnect.size() >= 1; }); + blockedConnect.Stop(); + + Cerr << "... disconnecting nodes 2 <-> 1" << Endl; + runtime.DisconnectNodes(2, 1); + + { + Cerr << "... waiting for pipe to disconnect" << Endl; + auto ev = runtime.GrabEdgeEventRethrow(sender); + } + + ui32 gen2; + client = runtime.ConnectToPipe(TTestTxConfig::TxTablet0, sender, 0, NTabletPipe::TClientRetryPolicy::WithRetries()); + { + Cerr << "... waiting for pipe to connect" << Endl; + auto ev = runtime.GrabEdgeEventRethrow(sender); + UNIT_ASSERT_VALUES_EQUAL(ev->Get()->Status, NKikimrProto::OK); + UNIT_ASSERT_VALUES_EQUAL(ev->Get()->ServerId.NodeId(), runtime.GetNodeId(2)); + gen2 = ev->Get()->Generation; + } + + UNIT_ASSERT_C(gen1 < gen2, "Unexpected gen1# " << gen1 << " not before gen2# " << gen2); + } + + Y_UNIT_TEST(UnavailableStateStorage) { + TTestBasicRuntime runtime(3); + SetupTabletServices(runtime); + runtime.SetLogPriority(NKikimrServices::BOOTSTRAPPER, NActors::NLog::PRI_DEBUG); + + auto launcher = runtime.AllocateEdgeActor(1); + auto instance = StartSimpleTablet(runtime, launcher, 1); + + bool dead = false; + auto deadObserver = runtime.AddObserver( + [&](TEvTablet::TEvTabletDead::TPtr& ev) { + if (ev->Sender == instance) { + dead = true; + } + }); + + auto sender = runtime.AllocateEdgeActor(); + + ui32 gen1; + auto client = runtime.ConnectToPipe(TTestTxConfig::TxTablet0, sender, 0, NTabletPipe::TClientRetryPolicy::WithRetries()); + { + Cerr << "... waiting for pipe to connect" << Endl; + auto ev = runtime.GrabEdgeEventRethrow(sender); + UNIT_ASSERT_VALUES_EQUAL(ev->Get()->Status, NKikimrProto::OK); + UNIT_ASSERT_VALUES_EQUAL(ev->Get()->ServerId.NodeId(), runtime.GetNodeId(1)); + gen1 = ev->Get()->Generation; + } + + ui32 node2 = runtime.GetNodeId(2); + TBlockEvents blockedReplicaLookup(runtime, + [&](const auto& ev) { + // Block EvReplicaLookup requests from node 2 and disconnect every time + if (ev->Sender.NodeId() == node2 && ev->Get()->Record.GetTabletID() == TTestTxConfig::TxTablet0) { + Cerr << "... disconnecting nodes 2 <-> 0 (" << ev->Get()->ToString() << " for " << ev->GetRecipientRewrite() << ")" << Endl; + runtime.DisconnectNodes(2, 0); + return true; + } + return false; + }); + + StartSimpleTabletBootstrapper(runtime, 2); + runtime.WaitFor("multiple state storage lookup attempts", [&]{ return blockedReplicaLookup.size() >= 6; }); + + UNIT_ASSERT_C(!dead, "The original instance should be preserved"); + + Y_UNUSED(client); + Y_UNUSED(gen1); + } + + Y_UNIT_TEST(MultipleBootstrappers) { + TTestBasicRuntime runtime(4); + SetupTabletServices(runtime); + runtime.SetLogPriority(NKikimrServices::BOOTSTRAPPER, NActors::NLog::PRI_DEBUG); + + StartSimpleTabletBootstrappers(runtime, {1, 2, 3}); + + Cerr << "... sleeping for 2 seconds" << Endl; + runtime.SimulateSleep(TDuration::Seconds(2)); + + auto sender = runtime.AllocateEdgeActor(); + + ui32 gen1; + ui32 initialNode; + ui32 initialNodeIdx; + auto client = runtime.ConnectToPipe(TTestTxConfig::TxTablet0, sender, 0, NTabletPipe::TClientRetryPolicy::WithRetries()); + { + Cerr << "... waiting for pipe to connect" << Endl; + auto ev = runtime.GrabEdgeEventRethrow(sender); + UNIT_ASSERT_VALUES_EQUAL(ev->Get()->Status, NKikimrProto::OK); + gen1 = ev->Get()->Generation; + initialNode = ev->Get()->ServerId.NodeId(); + initialNodeIdx = initialNode - runtime.GetNodeId(0); + } + + Cerr << "... tablet initially started on node " << initialNode + << " (idx " << initialNodeIdx << ")" << " in gen " << gen1 << Endl; + UNIT_ASSERT_VALUES_EQUAL_C(gen1, 2u, "Expected tablet to start from gen 2"); + + std::vector otherNodeIdxs; + for (ui32 nodeIdx : {1, 2, 3}) { + if (nodeIdx != initialNodeIdx) { + otherNodeIdxs.push_back(nodeIdx); + } + } + + Cerr << "... disconnecting other nodes" << Endl; + for (ui32 nodeIdx : otherNodeIdxs) { + runtime.DisconnectNodes(initialNodeIdx, nodeIdx); + } + + Cerr << "... sleeping for 2 seconds (tablet expected to survive)" << Endl; + runtime.SimulateSleep(TDuration::Seconds(2)); + + { + runtime.SendToPipe(client, sender, new TEvents::TEvPing); + runtime.GrabEdgeEventRethrow(sender); + } + + // Block new connect attempts to tablet at that node + TBlockEvents blockConnect(runtime, + [&](const auto& ev) { + if (ev->GetRecipientRewrite().NodeId() == initialNode && + ev->Get()->Record.GetTabletId() == TTestTxConfig::TxTablet0) + { + ui32 otherNodeIdx = ev->Sender.NodeId() - runtime.GetNodeId(0); + Cerr << "... disconnecting nodes " << initialNodeIdx << " <-> " << otherNodeIdx + << " (tablet connect attempt)" << Endl; + runtime.DisconnectNodes(initialNodeIdx, otherNodeIdx); + return true; + } + return false; + }); + + Cerr << "... disconnecting other nodes (new tablet connections fail)" << Endl; + for (ui32 nodeIdx : otherNodeIdxs) { + runtime.DisconnectNodes(initialNodeIdx, nodeIdx); + } + + Cerr << "... sleeping for 2 seconds (tablet expected to survive)" << Endl; + runtime.SimulateSleep(TDuration::Seconds(2)); + + { + runtime.SendToPipe(client, sender, new TEvents::TEvPing); + runtime.GrabEdgeEventRethrow(sender); + } + + // Block watch messages towards tablet node, a new owner must be selected + TBlockEvents blockWatch(runtime, + [&](const auto& ev) { + if (ev->GetRecipientRewrite().NodeId() == initialNode && + ev->Get()->Record.GetTabletID() == TTestTxConfig::TxTablet0) + { + ui32 otherNodeIdx = ev->Sender.NodeId() - runtime.GetNodeId(0); + Cerr << "... disconnecting nodes " << initialNodeIdx << " <-> " << otherNodeIdx + << " (bootstrap watch attempt)" << Endl; + runtime.DisconnectNodes(initialNodeIdx, otherNodeIdx); + return true; + } + return false; + }); + + Cerr << "... disconnect other nodes (new owner expected)" << Endl; + for (ui32 nodeIdx : otherNodeIdxs) { + runtime.DisconnectNodes(initialNodeIdx, nodeIdx); + } + + Cerr << "... sleeping for 2 seconds (new tablet expected to start once)" << Endl; + runtime.SimulateSleep(TDuration::Seconds(2)); + + ui32 gen2; + ui32 secondNode; + ui32 secondNodeIdx; + auto sender2 = runtime.AllocateEdgeActor(); + auto client2 = runtime.ConnectToPipe(TTestTxConfig::TxTablet0, sender2, 0, NTabletPipe::TClientRetryPolicy::WithRetries()); + { + Cerr << "... waiting for pipe to connect" << Endl; + auto ev = runtime.GrabEdgeEventRethrow(sender2); + UNIT_ASSERT_VALUES_EQUAL(ev->Get()->Status, NKikimrProto::OK); + gen2 = ev->Get()->Generation; + secondNode = ev->Get()->ServerId.NodeId(); + secondNodeIdx = secondNode - runtime.GetNodeId(0); + } + + UNIT_ASSERT_C(secondNodeIdx != initialNodeIdx, "Tablet expected to move to a different node"); + UNIT_ASSERT_C(gen2 == gen1 + 1, "Tablet restarted with gen2# " << gen2 << " after gen1# " << gen1); + + runtime.GrabEdgeEventRethrow(sender); + Y_UNUSED(client2); + } + +} // Y_UNIT_TEST_SUITE(BootstrapperTest) + +} // namespace NKikimr diff --git a/ydb/core/tablet/tablet_sys.cpp b/ydb/core/tablet/tablet_sys.cpp index aeebe5706a11..5f0c5a1a7186 100644 --- a/ydb/core/tablet/tablet_sys.cpp +++ b/ydb/core/tablet/tablet_sys.cpp @@ -817,6 +817,7 @@ void TTablet::HandleStateStorageInfoResolve(TEvStateStorage::TEvInfo::TPtr &ev) return PromoteToCandidate(0); } case NKikimrProto::ERROR: + case NKikimrProto::NODATA: case NKikimrProto::RACE: case NKikimrProto::TIMEOUT: return LockedInitializationPath(); @@ -848,6 +849,7 @@ void TTablet::HandleStateStorageInfoLock(TEvStateStorage::TEvInfo::TPtr &ev) { } return; case NKikimrProto::ERROR: + case NKikimrProto::NODATA: return CancelTablet(TEvTablet::TEvTabletDead::ReasonBootSSError); case NKikimrProto::TIMEOUT: return CancelTablet(TEvTablet::TEvTabletDead::ReasonBootSSTimeout); @@ -884,6 +886,7 @@ void TTablet::HandleStateStorageInfoUpgrade(TEvStateStorage::TEvInfo::TPtr &ev) return TabletBlockBlobStorage(); } case NKikimrProto::ERROR: + case NKikimrProto::NODATA: return CancelTablet(TEvTablet::TEvTabletDead::ReasonBootSSError); case NKikimrProto::TIMEOUT: return CancelTablet(TEvTablet::TEvTabletDead::ReasonBootSSTimeout); diff --git a/ydb/core/tablet/ut/ya.make b/ydb/core/tablet/ut/ya.make index ccd05f5eb909..93d89efd0123 100644 --- a/ydb/core/tablet/ut/ya.make +++ b/ydb/core/tablet/ut/ya.make @@ -18,6 +18,7 @@ PEERDIR( YQL_LAST_ABI_VERSION() SRCS( + bootstrapper_ut.cpp pipe_tracker_ut.cpp resource_broker_ut.cpp tablet_counters_ut.cpp diff --git a/ydb/core/tablet/ya.make b/ydb/core/tablet/ya.make index 7013c5cd0897..a9979cc56375 100644 --- a/ydb/core/tablet/ya.make +++ b/ydb/core/tablet/ya.make @@ -2,6 +2,8 @@ LIBRARY() SRCS( bootstrapper.cpp + bootstrapper.h + bootstrapper_impl.h defs.h labeled_counters_merger.cpp labeled_counters_merger.h diff --git a/ydb/core/tablet_flat/flat_executor_leases_ut.cpp b/ydb/core/tablet_flat/flat_executor_leases_ut.cpp index 4191b60d576c..807774b1bcbb 100644 --- a/ydb/core/tablet_flat/flat_executor_leases_ut.cpp +++ b/ydb/core/tablet_flat/flat_executor_leases_ut.cpp @@ -227,12 +227,12 @@ Y_UNIT_TEST_SUITE(TFlatExecutorLeases) { const ui64 tabletId = TTestTxConfig::TxTablet0; - auto boot1 = CreateTestBootstrapper(runtime, + auto instance1 = StartTestTablet(runtime, CreateTestTabletInfo(tabletId, TTabletTypes::Dummy), [enableInitialLease](const TActorId & tablet, TTabletStorageInfo* info) { return new TLeasesTablet(tablet, info, enableInitialLease); }); - runtime.EnableScheduleForActor(boot1); + runtime.EnableScheduleForActor(instance1); { TDispatchOptions options; @@ -262,13 +262,6 @@ Y_UNIT_TEST_SUITE(TFlatExecutorLeases) { return TTestActorRuntime::EEventAction::DROP; } break; - case TEvTablet::TEvTabletDead::EventType: - // Prevent tablets from restarting - // This is most important for the boot1 actor, since it - // quickly receives bad commit signal and tries to restart - // the original tablet. However we prevent executor from - // killing itself too, so we could make additional queries. - return TTestActorRuntime::EEventAction::DROP; case TEvTablet::TEvDemoted::EventType: // Block guardian from telling tablet about a new generation return TTestActorRuntime::EEventAction::DROP; @@ -280,13 +273,13 @@ Y_UNIT_TEST_SUITE(TFlatExecutorLeases) { }; auto prevObserver = runtime.SetObserverFunc(observerFunc); - auto boot2 = CreateTestBootstrapper(runtime, + auto instance2 = StartTestTablet(runtime, CreateTestTabletInfo(tabletId, TTabletTypes::Dummy), [enableInitialLease](const TActorId & tablet, TTabletStorageInfo* info) { return new TLeasesTablet(tablet, info, enableInitialLease); }, /* node index */ 1); - runtime.EnableScheduleForActor(boot2); + runtime.EnableScheduleForActor(instance2); { TDispatchOptions options; diff --git a/ydb/core/testlib/basics/helpers.cpp b/ydb/core/testlib/basics/helpers.cpp index 811ec532121e..f791fc32f4da 100644 --- a/ydb/core/testlib/basics/helpers.cpp +++ b/ydb/core/testlib/basics/helpers.cpp @@ -31,6 +31,13 @@ namespace NKikimr { return runtime.Register(CreateBootstrapper(info, bi.Get()), nodeIndex); } + TActorId StartTestTablet(TTestActorRuntime &runtime, TTabletStorageInfo *info, + std::function op, ui32 nodeIndex) + { + auto setup = MakeIntrusive(op, TMailboxType::Simple, ui32(0), TMailboxType::Simple, ui32(0)); + return runtime.Register(CreateTablet({}, info, setup.Get(), 0), nodeIndex); + } + NTabletPipe::TClientConfig GetPipeConfigWithRetries() { NTabletPipe::TClientConfig pipeConfig; diff --git a/ydb/core/testlib/basics/helpers.h b/ydb/core/testlib/basics/helpers.h index d21d9c4d4d02..5f095240f30c 100644 --- a/ydb/core/testlib/basics/helpers.h +++ b/ydb/core/testlib/basics/helpers.h @@ -36,6 +36,8 @@ namespace NFake { TBlobStorageGroupType::EErasureSpecies erasure = BootGroupErasure, ui32 groupId = 0); TActorId CreateTestBootstrapper(TTestActorRuntime &runtime, TTabletStorageInfo *info, std::function op, ui32 nodeIndex = 0); + TActorId StartTestTablet(TTestActorRuntime &runtime, TTabletStorageInfo *info, + std::function op, ui32 nodeIndex = 0); NTabletPipe::TClientConfig GetPipeConfigWithRetries(); void SetupStateStorage(TTestActorRuntime& runtime, ui32 nodeIndex, diff --git a/ydb/core/tx/coordinator/coordinator_volatile_ut.cpp b/ydb/core/tx/coordinator/coordinator_volatile_ut.cpp index cb57683785b4..a6dbd1177e50 100644 --- a/ydb/core/tx/coordinator/coordinator_volatile_ut.cpp +++ b/ydb/core/tx/coordinator/coordinator_volatile_ut.cpp @@ -545,9 +545,9 @@ namespace NKikimr::NFlatTxCoordinator::NTest { // Rewind to some older time runtime.UpdateCurrentTime(oldTimestamp, /* rewind */ true); - // Start a new bootstrapper, which will boot a new instance in parallel + // Start a new tablet instance in parallel Cerr << "... starting a new coordinator instance" << Endl; - CreateTestBootstrapper(runtime, CreateTestTabletInfo(coordinatorId, TTabletTypes::Coordinator), &CreateFlatTxCoordinator); + StartTestTablet(runtime, CreateTestTabletInfo(coordinatorId, TTabletTypes::Coordinator), &CreateFlatTxCoordinator); // Wait until new coordinator almost receives the in-memory state waitFor([&]{ return blockedStateResponses.size() >= 1; }, "migrated state"); @@ -824,9 +824,9 @@ namespace NKikimr::NFlatTxCoordinator::NTest { // Rewind to some older time runtime.UpdateCurrentTime(oldTimestamp, /* rewind */ true); - // Start a new bootstrapper, which will boot a new instance in parallel + // Start a new tablet instance in parallel Cerr << "... starting a new coordinator instance" << Endl; - CreateTestBootstrapper(runtime, CreateTestTabletInfo(coordinatorId, TTabletTypes::Coordinator), &CreateFlatTxCoordinator); + StartTestTablet(runtime, CreateTestTabletInfo(coordinatorId, TTabletTypes::Coordinator), &CreateFlatTxCoordinator); // Wait until new coordinator almost receives the in-memory state waitFor([&]{ return blockedStateResponses.size() >= 1; }, "migrated state"); diff --git a/ydb/core/tx/schemeshard/ut_subdomain/ut_subdomain.cpp b/ydb/core/tx/schemeshard/ut_subdomain/ut_subdomain.cpp index 8fc34d9edbe0..5ebc9b2ff64e 100644 --- a/ydb/core/tx/schemeshard/ut_subdomain/ut_subdomain.cpp +++ b/ydb/core/tx/schemeshard/ut_subdomain/ut_subdomain.cpp @@ -789,7 +789,7 @@ Y_UNIT_TEST_SUITE(TSchemeShardSubDomainTest) { {NLs::PathExist, NLs::Finished, NLs::NotInSubdomain, - NLs::PathVersionEqual(7), // it is 6 if drop simultaneous with create + NLs::PathVersionOneOf({6, 7}), // it is 6 if drop simultaneous with create NLs::PathsInsideDomain(0), NLs::ShardsInsideDomainOneOf({0, 1, 2, 3})}); UNIT_ASSERT(!CheckLocalRowExists(runtime, TTestTxConfig::SchemeShard, "SubDomains", "PathId", 2)); @@ -1096,7 +1096,7 @@ Y_UNIT_TEST_SUITE(TSchemeShardSubDomainTest) { TestDescribeResult(DescribePath(runtime, "/MyRoot"), {NLs::PathExist, - NLs::PathVersionEqual(7), // version 6 if deletion is simultaneous with creation + NLs::PathVersionOneOf({6, 7}), // version 6 if deletion is simultaneous with creation NLs::PathsInsideDomain(0), NLs::ShardsInsideDomain(0)});