From fc27cb9507932b2f837f049c5161829de7dcba9b Mon Sep 17 00:00:00 2001 From: Maxim Deb Natkh Date: Wed, 16 Oct 2024 18:56:16 +0200 Subject: [PATCH] issue-1146: load NodeRefs upon tablet load (#2241) * issue-1146: load NodeRefs upon tablet load --- cloud/filestore/config/storage.proto | 2 + cloud/filestore/libs/storage/core/config.cpp | 3 + cloud/filestore/libs/storage/core/config.h | 2 + .../libs/storage/service/service_ut.cpp | 4 +- .../libs/storage/tablet/tablet_actor.cpp | 3 + .../libs/storage/tablet/tablet_actor.h | 15 +++ .../storage/tablet/tablet_actor_loadstate.cpp | 20 ++-- .../tablet_actor_loadstate_noderefs.cpp | 106 +++++++++++++++++ .../libs/storage/tablet/tablet_database.cpp | 97 +++++++++++++--- .../libs/storage/tablet/tablet_database.h | 19 +++ .../libs/storage/tablet/tablet_private.h | 14 +-- .../libs/storage/tablet/tablet_state.h | 10 ++ .../storage/tablet/tablet_state_cache.cpp | 80 ++++++++++++- .../libs/storage/tablet/tablet_state_cache.h | 28 +++-- .../libs/storage/tablet/tablet_state_iface.h | 14 +++ .../storage/tablet/tablet_state_nodes.cpp | 23 ++++ .../filestore/libs/storage/tablet/tablet_tx.h | 40 +++++++ .../libs/storage/tablet/tablet_ut_cache.cpp | 109 +++++++++++++++++- .../libs/storage/tablet/tablet_ut_data.cpp | 4 +- cloud/filestore/libs/storage/tablet/ya.make | 1 + 20 files changed, 542 insertions(+), 52 deletions(-) create mode 100644 cloud/filestore/libs/storage/tablet/tablet_actor_loadstate_noderefs.cpp diff --git a/cloud/filestore/config/storage.proto b/cloud/filestore/config/storage.proto index fb1f49ebec..e49b624917 100644 --- a/cloud/filestore/config/storage.proto +++ b/cloud/filestore/config/storage.proto @@ -365,6 +365,8 @@ message TStorageConfig reserved 375; // InMemoryIndexCacheNodeAttrsVerCapacity optional uint64 InMemoryIndexCacheNodeRefsCapacity = 376; reserved 377; // InMemoryIndexCacheNodeRefsVerCapacity + optional bool InMemoryIndexCacheLoadOnTabletStart = 398; + optional uint64 
InMemoryIndexCacheLoadOnTabletStartRowsPerTx = 399; // Used to send non-network metrics as network ones to HIVE, // while we use them for load balancing diff --git a/cloud/filestore/libs/storage/core/config.cpp b/cloud/filestore/libs/storage/core/config.cpp index 113b3d0309..327958601f 100644 --- a/cloud/filestore/libs/storage/core/config.cpp +++ b/cloud/filestore/libs/storage/core/config.cpp @@ -199,6 +199,9 @@ using TAliases = NProto::TStorageConfig::TFilestoreAliases; xxx(InMemoryIndexCacheNodesCapacity, ui64, 0 )\ xxx(InMemoryIndexCacheNodeAttrsCapacity, ui64, 0 )\ xxx(InMemoryIndexCacheNodeRefsCapacity, ui64, 0 )\ + xxx(InMemoryIndexCacheLoadOnTabletStart, bool, false )\ + xxx(InMemoryIndexCacheLoadOnTabletStartRowsPerTx, ui64, 1000000 )\ + \ xxx(NonNetworkMetricsBalancingFactor, ui32, 1_KB )\ \ xxx(AsyncDestroyHandleEnabled, bool, false )\ diff --git a/cloud/filestore/libs/storage/core/config.h b/cloud/filestore/libs/storage/core/config.h index cd7a32e242..b12c411d23 100644 --- a/cloud/filestore/libs/storage/core/config.h +++ b/cloud/filestore/libs/storage/core/config.h @@ -233,6 +233,8 @@ class TStorageConfig ui64 GetInMemoryIndexCacheNodesCapacity() const; ui64 GetInMemoryIndexCacheNodeAttrsCapacity() const; ui64 GetInMemoryIndexCacheNodeRefsCapacity() const; + bool GetInMemoryIndexCacheLoadOnTabletStart() const; + ui64 GetInMemoryIndexCacheLoadOnTabletStartRowsPerTx() const; bool GetAsyncDestroyHandleEnabled() const; TDuration GetAsyncHandleOperationPeriod() const; diff --git a/cloud/filestore/libs/storage/service/service_ut.cpp b/cloud/filestore/libs/storage/service/service_ut.cpp index e07c9bd186..726887c207 100644 --- a/cloud/filestore/libs/storage/service/service_ut.cpp +++ b/cloud/filestore/libs/storage/service/service_ut.cpp @@ -5820,11 +5820,11 @@ Y_UNIT_TEST_SUITE(TStorageServiceTest) break; } case TEvIndexTabletPrivate:: - EvLoadCompactionMapChunkCompleted: { + EvLoadCompactionMapChunkResponse: { lastCompactionMapRangeId = Max( event ->Get() + 
TEvLoadCompactionMapChunkResponse>() ->LastRangeId, lastCompactionMapRangeId); break; diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor.cpp index a5e8a9477d..295badbae1 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor.cpp @@ -875,6 +875,9 @@ STFUNC(TIndexTabletActor::StateWork) HFunc(TEvIndexTabletPrivate::TEvReadDataCompleted, HandleReadDataCompleted); HFunc(TEvIndexTabletPrivate::TEvWriteDataCompleted, HandleWriteDataCompleted); HFunc(TEvIndexTabletPrivate::TEvAddDataCompleted, HandleAddDataCompleted); + HFunc( + TEvIndexTabletPrivate::TEvLoadCompactionMapChunkResponse, + HandleLoadCompactionMapChunkResponse); HFunc(TEvIndexTabletPrivate::TEvUpdateCounters, HandleUpdateCounters); HFunc(TEvIndexTabletPrivate::TEvUpdateLeakyBucketCounters, HandleUpdateLeakyBucketCounters); diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor.h b/cloud/filestore/libs/storage/tablet/tablet_actor.h index 2f8fd9e988..5528356d4c 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor.h +++ b/cloud/filestore/libs/storage/tablet/tablet_actor.h @@ -362,6 +362,17 @@ class TIndexTabletActor final TVector GenerateForceDeleteZeroCompactionRanges() const; + /** + * @brief If necessary, code can iteratively call ReadNodeRefs for all + * nodes. This will populate cache with node refs and allow us to perform + * ListNodes using in-memory index state by knowing that the nodeRefs cache + * is exhaustive. 
+ */ + void LoadNodeRefs( + const NActors::TActorContext& ctx, + ui64 nodeId, + const TString& name); + void AddTransaction( TRequestInfo& transaction, TRequestInfo::TCancelRoutine cancelRoutine); @@ -602,6 +613,10 @@ class TIndexTabletActor final const TEvIndexTabletPrivate::TEvNodeUnlinkedInShard::TPtr& ev, const NActors::TActorContext& ctx); + void HandleLoadCompactionMapChunkResponse( + const TEvIndexTabletPrivate::TEvLoadCompactionMapChunkResponse::TPtr& ev, + const NActors::TActorContext& ctx); + void SendMetricsToExecutor(const NActors::TActorContext& ctx); bool HandleRequests(STFUNC_SIG); diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_loadstate.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_loadstate.cpp index 4c53ec35f0..943d969fd4 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_loadstate.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_loadstate.cpp @@ -307,6 +307,13 @@ void TIndexTabletActor::CompleteTx_LoadState( LOG_INFO_S(ctx, TFileStoreComponents::TABLET, LogTag << " Scheduling startup events"); + + if (Config->GetInMemoryIndexCacheEnabled() && + Config->GetInMemoryIndexCacheLoadOnTabletStart()) + { + LoadNodeRefs(ctx, 0, ""); + } + ScheduleSyncSessions(ctx); ScheduleCleanupSessions(ctx); RestartCheckpointDestruction(ctx); @@ -373,8 +380,8 @@ void TIndexTabletActor::LoadNextCompactionMapChunkIfNeeded( //////////////////////////////////////////////////////////////////////////////// -void TIndexTabletActor::HandleLoadCompactionMapChunkCompleted( - const TEvIndexTabletPrivate::TEvLoadCompactionMapChunkCompleted::TPtr& ev, +void TIndexTabletActor::HandleLoadCompactionMapChunkResponse( + const TEvIndexTabletPrivate::TEvLoadCompactionMapChunkResponse::TPtr& ev, const TActorContext& ctx) { const auto* msg = ev->Get(); @@ -488,18 +495,11 @@ void TIndexTabletActor::CompleteTx_LoadCompactionMapChunk( } using TNotification = - TEvIndexTabletPrivate::TEvLoadCompactionMapChunkCompleted; + 
TEvIndexTabletPrivate::TEvLoadCompactionMapChunkResponse; auto notification = std::make_unique( args.FirstRangeId, args.LastRangeId); NCloud::Send(ctx, SelfId(), std::move(notification)); - - if (args.RequestInfo->Sender != ctx.SelfID) { - using TResponse = - TEvIndexTabletPrivate::TEvLoadCompactionMapChunkResponse; - auto response = std::make_unique(); - NCloud::Reply(ctx, *args.RequestInfo, std::move(response)); - } } } // namespace NCloud::NFileStore::NStorage diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_loadstate_noderefs.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_loadstate_noderefs.cpp new file mode 100644 index 0000000000..316e375584 --- /dev/null +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_loadstate_noderefs.cpp @@ -0,0 +1,106 @@ +#include "tablet_actor.h" + +namespace NCloud::NFileStore::NStorage { + +using namespace NActors; + +//////////////////////////////////////////////////////////////////////////////// + +void TIndexTabletActor::LoadNodeRefs( + const NActors::TActorContext& ctx, + ui64 nodeId, + const TString& name) +{ + const ui64 maxNodeRefs = Config->GetInMemoryIndexCacheLoadOnTabletStartRowsPerTx(); + + LOG_INFO( + ctx, + TFileStoreComponents::TABLET, + "%s LoadNodeRefs iteration started (nodeId: %lu, name: %s, " + "maxNodeRefs: %lu)", + LogTag.c_str(), + nodeId, + name.c_str(), + maxNodeRefs); + + ExecuteTx( + ctx, + nodeId, + name, + maxNodeRefs); +} + +//////////////////////////////////////////////////////////////////////////////// + +bool TIndexTabletActor::ValidateTx_LoadNodeRefs( + const TActorContext& ctx, + TTxIndexTablet::TLoadNodeRefs& args) +{ + LOG_INFO( + ctx, + TFileStoreComponents::TABLET, + "%s LoadingNodeRefs (nodeId: %lu, name: %s, maxNodeRefs: %lu)", + LogTag.c_str(), + args.NodeId, + args.Cookie.c_str(), + args.MaxNodeRefs); + return true; +} + +bool TIndexTabletActor::PrepareTx_LoadNodeRefs( + const TActorContext& ctx, + IIndexTabletDatabase& db, + TTxIndexTablet::TLoadNodeRefs& args) +{ 
+ TVector nodeRefs; + + bool ready = db.ReadNodeRefs( + args.NodeId, + args.Cookie, + args.MaxNodeRefs, + nodeRefs, + args.NextNodeId, + args.NextCookie); + + LOG_INFO( + ctx, + TFileStoreComponents::TABLET, + "%s LoadingNodeRefs (nodeId: %lu, name: %s, maxNodeRefs: %lu), read " + "%lu nodeRefs: %s", + LogTag.c_str(), + args.NodeId, + args.Cookie.c_str(), + args.MaxNodeRefs, + nodeRefs.size(), + ready ? "finished" : "restarted"); + + return ready; +} + +void TIndexTabletActor::CompleteTx_LoadNodeRefs( + const TActorContext& ctx, + TTxIndexTablet::TLoadNodeRefs& args) +{ + LOG_INFO( + ctx, + TFileStoreComponents::TABLET, + "%s LoadNodeRefs iteration completed, next nodeId: %lu, next cookie: " + "%s", + LogTag.c_str(), + args.NextNodeId, + args.NextCookie.c_str()); + + if (args.NextCookie || args.NextNodeId) { + LoadNodeRefs(ctx, args.NextNodeId, args.NextCookie); + } else { + LOG_INFO( + ctx, + TFileStoreComponents::TABLET, + "%s LoadNodeRefs completed", + LogTag.c_str()); + + MarkNodeRefsLoadComplete(); + } +} + +} // namespace NCloud::NFileStore::NStorage diff --git a/cloud/filestore/libs/storage/tablet/tablet_database.cpp b/cloud/filestore/libs/storage/tablet/tablet_database.cpp index 98e3c45196..fa5e3e99fc 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_database.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_database.cpp @@ -598,6 +598,50 @@ bool TIndexTabletDatabase::ReadNodeRefs( return true; } +bool TIndexTabletDatabase::ReadNodeRefs( + ui64 startNodeId, + const TString& startCookie, + ui64 maxCount, + TVector& refs, + ui64& nextNodeId, + TString& nextCookie) +{ + using TTable = TIndexTabletSchema::NodeRefs; + + if (!startNodeId && startCookie.empty()) { + Table().Precharge(); + } + + auto it = Table().GreaterOrEqual(startNodeId, startCookie).Select(); + + if (!it.IsReady()) { + return false; // not ready + } + + while (it.IsValid() && maxCount > 0) { + refs.emplace_back(TNodeRef{ + it.GetValue(), + it.GetValue(), + it.GetValue(), + 
it.GetValue(), + it.GetValue(), + it.GetValue(), + InvalidCommitId}); + --maxCount; + + if (!it.Next()) { + return false; // not ready + } + } + + if (it.IsValid()) { + nextNodeId = it.GetValue(); + nextCookie = it.GetValue(); + } + + return true; +} + bool TIndexTabletDatabase::PrechargeNodeRefs( ui64 nodeId, const TString& cookie, @@ -2030,13 +2074,7 @@ bool TIndexTabletDatabaseProxy::ReadNodeRef( if (result && ref) { // If ReadNodeRef was successful, it is reasonable to update the cache // with the value that has just been read. - NodeUpdates.emplace_back(TInMemoryIndexState::TWriteNodeRefsRequest{ - .NodeRefsKey = {nodeId, name}, - .NodeRefsRow = { - .CommitId = ref->MinCommitId, - .ChildId = ref->ChildNodeId, - .ShardId = ref->ShardId, - .ShardName = ref->ShardName}}); + NodeUpdates.emplace_back(ExtractWriteNodeRefsFromNodeRef(*ref)); } return result; } @@ -2055,13 +2093,32 @@ bool TIndexTabletDatabaseProxy::ReadNodeRefs( // If ReadNodeRefs was successful, it is reasonable to update the cache // with the values that have just been read. for (const auto& ref: refs) { - NodeUpdates.emplace_back(TInMemoryIndexState::TWriteNodeRefsRequest{ - .NodeRefsKey = {nodeId, ref.Name}, - .NodeRefsRow = { - .CommitId = ref.MinCommitId, - .ChildId = ref.ChildNodeId, - .ShardId = ref.ShardId, - .ShardName = ref.ShardName}}); + NodeUpdates.emplace_back(ExtractWriteNodeRefsFromNodeRef(ref)); + } + } + return result; +} + +bool TIndexTabletDatabaseProxy::ReadNodeRefs( + ui64 startNodeId, + const TString& startCookie, + ui64 maxCount, + TVector& refs, + ui64& nextNodeId, + TString& nextCookie) +{ + auto result = TIndexTabletDatabase::ReadNodeRefs( + startNodeId, + startCookie, + maxCount, + refs, + nextNodeId, + nextCookie); + if (result) { + // If ReadNodeRefs was successful, it is reasonable to update the cache + // with the values that have just been read. 
+ for (const auto& ref: refs) { + NodeUpdates.emplace_back(ExtractWriteNodeRefsFromNodeRef(ref)); } } return result; @@ -2127,4 +2184,16 @@ void TIndexTabletDatabaseProxy::DeleteNodeRefVer( // TODO(#1146): _Ver tables not yet supported } +TInMemoryIndexState::TWriteNodeRefsRequest +TIndexTabletDatabaseProxy::ExtractWriteNodeRefsFromNodeRef(const TNodeRef& ref) +{ + return TInMemoryIndexState::TWriteNodeRefsRequest{ + .NodeRefsKey = {ref.NodeId, ref.Name}, + .NodeRefsRow = { + .CommitId = ref.MinCommitId, + .ChildId = ref.ChildNodeId, + .ShardId = ref.ShardId, + .ShardName = ref.ShardName}}; +} + } // namespace NCloud::NFileStore::NStorage diff --git a/cloud/filestore/libs/storage/tablet/tablet_database.h b/cloud/filestore/libs/storage/tablet/tablet_database.h index a8563de946..2c0a6f9d9a 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_database.h +++ b/cloud/filestore/libs/storage/tablet/tablet_database.h @@ -207,6 +207,14 @@ FILESTORE_FILESYSTEM_STATS(FILESTORE_DECLARE_STATS) ui32 maxBytes, TString* next = nullptr) override; + virtual bool ReadNodeRefs( + ui64 startNodeId, + const TString& startCookie, + ui64 maxCount, + TVector& refs, + ui64& nextNodeId, + TString& nextCookie) override; + virtual bool PrechargeNodeRefs( ui64 nodeId, const TString& cookie, @@ -611,6 +619,14 @@ class TIndexTabletDatabaseProxy: public TIndexTabletDatabase ui32 maxBytes, TString* next = nullptr) override; + bool ReadNodeRefs( + ui64 startNodeId, + const TString& startCookie, + ui64 maxCount, + TVector& refs, + ui64& nextNodeId, + TString& nextCookie) override; + void WriteNodeRef( ui64 nodeId, ui64 commitId, @@ -641,6 +657,9 @@ class TIndexTabletDatabaseProxy: public TIndexTabletDatabase private: TVector& NodeUpdates; + + static TInMemoryIndexState::TWriteNodeRefsRequest + ExtractWriteNodeRefsFromNodeRef(const TNodeRef& ref); }; } // namespace NCloud::NFileStore::NStorage diff --git a/cloud/filestore/libs/storage/tablet/tablet_private.h 
b/cloud/filestore/libs/storage/tablet/tablet_private.h index eed6fc0037..5d80a5c8e0 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_private.h +++ b/cloud/filestore/libs/storage/tablet/tablet_private.h @@ -37,7 +37,6 @@ namespace NCloud::NFileStore::NStorage { xxx(ReadBlob, __VA_ARGS__) \ xxx(WriteBlob, __VA_ARGS__) \ xxx(WriteBatch, __VA_ARGS__) \ - xxx(LoadCompactionMapChunk, __VA_ARGS__) \ // FILESTORE_TABLET_REQUESTS_PRIVATE_ASYNC #define FILESTORE_TABLET_REQUESTS_PRIVATE_SYNC(xxx, ...) \ @@ -50,6 +49,7 @@ namespace NCloud::NFileStore::NStorage { xxx(FilterAliveNodes, __VA_ARGS__) \ xxx(GenerateCommitId, __VA_ARGS__) \ xxx(SyncShardSessions, __VA_ARGS__) \ + xxx(LoadCompactionMapChunk, __VA_ARGS__) \ // FILESTORE_TABLET_REQUESTS_PRIVATE #define FILESTORE_TABLET_REQUESTS_PRIVATE(xxx, ...) \ @@ -84,7 +84,7 @@ namespace NCloud::NFileStore::NStorage { #define FILESTORE_IGNORE_COMPLETION(name, ns) \ IgnoreFunc(ns::TEv##name##Completed); \ -// FILESTORE_HANDLE_COMPLETION +// FILESTORE_IGNORE_COMPLETION //////////////////////////////////////////////////////////////////////////////// @@ -509,14 +509,12 @@ struct TEvIndexTabletPrivate struct TLoadCompactionMapChunkResponse { - }; + const ui32 FirstRangeId = 0; + const ui32 LastRangeId = 0; - struct TLoadCompactionMapChunkCompleted - { - const ui32 FirstRangeId; - const ui32 LastRangeId; + TLoadCompactionMapChunkResponse() = default; - TLoadCompactionMapChunkCompleted( + TLoadCompactionMapChunkResponse( ui32 firstRangeId, ui32 lastRangeId) : FirstRangeId(firstRangeId) diff --git a/cloud/filestore/libs/storage/tablet/tablet_state.h b/cloud/filestore/libs/storage/tablet/tablet_state.h index 22a9e54e42..809e2ce0e4 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_state.h +++ b/cloud/filestore/libs/storage/tablet/tablet_state.h @@ -530,6 +530,15 @@ FILESTORE_FILESYSTEM_STATS(FILESTORE_DECLARE_COUNTER) ui32 maxBytes, TString* next = nullptr); + bool ReadNodeRefs( + IIndexTabletDatabase& db, + ui64 startNodeId, + 
const TString& startCookie, + ui64 maxCount, + TVector& refs, + ui64& nextNodeId, + TString& nextCookie); + bool PrechargeNodeRefs( IIndexTabletDatabase& db, ui64 nodeId, @@ -1287,6 +1296,7 @@ FILESTORE_DUPCACHE_REQUESTS(FILESTORE_DECLARE_DUPCACHE) IIndexTabletDatabase& AccessInMemoryIndexState(); void UpdateInMemoryIndexState( TVector nodeUpdates); + void MarkNodeRefsLoadComplete(); }; } // namespace NCloud::NFileStore::NStorage diff --git a/cloud/filestore/libs/storage/tablet/tablet_state_cache.cpp b/cloud/filestore/libs/storage/tablet/tablet_state_cache.cpp index 9ad97ff4af..ed78fe802d 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_state_cache.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_state_cache.cpp @@ -21,6 +21,26 @@ void TInMemoryIndexState::Reset( NodeRefsCapacity = nodeRefsCapacity; } +void TInMemoryIndexState::LoadNodeRefs(const TVector& nodeRefs) +{ + for (const auto& nodeRef: nodeRefs) { + WriteNodeRef( + nodeRef.NodeId, + nodeRef.MinCommitId, + nodeRef.Name, + nodeRef.ChildNodeId, + nodeRef.ShardId, + nodeRef.ShardName); + } +} + +void TInMemoryIndexState::MarkNodeRefsLoadComplete() +{ + // If during the startup there were no evictions, then the cache should be + // complete upon the load completion. + IsNodeRefsExhaustive = !IsNodeRefsEvictionObserved; +} + // // Nodes // @@ -211,9 +231,60 @@ bool TInMemoryIndexState::ReadNodeRefs( ui32 maxBytes, TString* next) { - // TInMemoryIndexState is a preemptive cache, thus it is impossible to - // determine, whether the set of stored references is complete. - Y_UNUSED(nodeId, commitId, cookie, refs, maxBytes, next); + if (!IsNodeRefsExhaustive) { + // TInMemoryIndexState is a preemptive cache, thus it is impossible to + // determine, whether the set of stored references is complete. 
+ return false; + } + + auto it = NodeRefs.lower_bound(TNodeRefsKey(nodeId, cookie)); + + ui32 bytes = 0; + while (it != NodeRefs.end() && it->first.NodeId == nodeId) { + ui64 minCommitId = it->second.CommitId; + ui64 maxCommitId = InvalidCommitId; + + if (VisibleCommitId(commitId, minCommitId, maxCommitId)) { + refs.emplace_back(TNodeRef{ + nodeId, + it->first.Name, + it->second.ChildId, + it->second.ShardId, + it->second.ShardName, + minCommitId, + maxCommitId}); + + // FIXME: bytes should represent the size of the entire entry, not just + // the name + bytes += refs.back().Name.size(); + } + + ++it; + + if (maxBytes && bytes >= maxBytes) { + break; + } + } + + if (next && it != NodeRefs.end() && it->first.NodeId == nodeId) { + *next = it->first.Name; + } + + return true; +} + +bool TInMemoryIndexState::ReadNodeRefs( + ui64 startNodeId, + const TString& startCookie, + ui64 maxCount, + TVector& refs, + ui64& nextNodeId, + TString& nextCookie) +{ + Y_UNUSED(startNodeId, startCookie, maxCount, refs, nextNodeId, nextCookie); + // This method is supposed to be called only upon tablet load in order to + // populate the cache with data from localDb. Thus implementing it via + // in-memory cache is unnecessary. 
return false; } @@ -237,6 +308,9 @@ void TInMemoryIndexState::WriteNodeRef( const auto key = TNodeRefsKey(nodeId, name); if (NodeRefs.size() == NodeRefsCapacity && !NodeRefs.contains(key)) { NodeRefs.clear(); + + IsNodeRefsEvictionObserved = true; + IsNodeRefsExhaustive = false; } NodeRefs[key] = TNodeRefsRow{ .CommitId = commitId, diff --git a/cloud/filestore/libs/storage/tablet/tablet_state_cache.h b/cloud/filestore/libs/storage/tablet/tablet_state_cache.h index 230ac5a96a..f3e45be550 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_state_cache.h +++ b/cloud/filestore/libs/storage/tablet/tablet_state_cache.h @@ -24,6 +24,10 @@ class TInMemoryIndexState : public IIndexTabletDatabase ui64 nodeAttrsCapacity, ui64 nodeRefsCapacity); + void LoadNodeRefs(const TVector& nodeRefs); + + void MarkNodeRefsLoadComplete(); + // // Nodes // @@ -110,6 +114,14 @@ class TInMemoryIndexState : public IIndexTabletDatabase ui32 maxBytes, TString* next) override; + bool ReadNodeRefs( + ui64 startNodeId, + const TString& startCookie, + ui64 maxCount, + TVector& refs, + ui64& nextNodeId, + TString& nextCookie) override; + bool PrechargeNodeRefs( ui64 nodeId, const TString& cookie, @@ -220,17 +232,9 @@ class TInMemoryIndexState : public IIndexTabletDatabase ui64 NodeId = 0; TString Name; - bool operator==(const TNodeRefsKey& rhs) const - { - return std::tie(NodeId, Name) == std::tie(rhs.NodeId, rhs.Name); - } - }; - - struct TNodeRefsKeyHash - { - size_t operator()(const TNodeRefsKey& key) const + bool operator<(const TNodeRefsKey& rhs) const { - return MultiHash(key.NodeId, key.Name); + return std::tie(NodeId, Name) < std::tie(rhs.NodeId, rhs.Name); } }; @@ -242,7 +246,9 @@ class TInMemoryIndexState : public IIndexTabletDatabase TString ShardName; }; - THashMap NodeRefs; + TMap NodeRefs; + bool IsNodeRefsEvictionObserved = false; + bool IsNodeRefsExhaustive = false; public: struct TWriteNodeRequest diff --git a/cloud/filestore/libs/storage/tablet/tablet_state_iface.h 
b/cloud/filestore/libs/storage/tablet/tablet_state_iface.h index e7c4a8a6eb..ccd9b65d98 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_state_iface.h +++ b/cloud/filestore/libs/storage/tablet/tablet_state_iface.h @@ -115,6 +115,20 @@ class IIndexTabletDatabase ui32 maxBytes, TString* next) = 0; + /** + * @brief read at most maxCount node refs starting from key + * (startNodeId, startCookie). Populates refs with the nodeRefs that have + * been read. If there are more nodeRefs to read, nextNodeId and nextCookie + * will be populated with the key to continue reading from + */ + virtual bool ReadNodeRefs( + ui64 startNodeId, + const TString& startCookie, + ui64 maxCount, + TVector& refs, + ui64& nextNodeId, + TString& nextCookie) = 0; + virtual bool PrechargeNodeRefs( ui64 nodeId, const TString& cookie, diff --git a/cloud/filestore/libs/storage/tablet/tablet_state_nodes.cpp b/cloud/filestore/libs/storage/tablet/tablet_state_nodes.cpp index 367f140deb..7ccbdfc91d 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_state_nodes.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_state_nodes.cpp @@ -505,6 +505,24 @@ bool TIndexTabletState::ReadNodeRefs( return ready; } +bool TIndexTabletState::ReadNodeRefs( + IIndexTabletDatabase& db, + ui64 startNodeId, + const TString& startCookie, + ui64 maxCount, + TVector& refs, + ui64& nextNodeId, + TString& nextCookie) +{ + return db.ReadNodeRefs( + startNodeId, + startCookie, + maxCount, + refs, + nextNodeId, + nextCookie); +} + bool TIndexTabletState::PrechargeNodeRefs( IIndexTabletDatabase& db, ui64 nodeId, @@ -596,4 +614,9 @@ void TIndexTabletState::UpdateInMemoryIndexState( Impl->InMemoryIndexState.UpdateState(nodeUpdates); } +void TIndexTabletState::MarkNodeRefsLoadComplete() +{ + Impl->InMemoryIndexState.MarkNodeRefsLoadComplete(); +} + } // namespace NCloud::NFileStore::NStorage diff --git a/cloud/filestore/libs/storage/tablet/tablet_tx.h b/cloud/filestore/libs/storage/tablet/tablet_tx.h index 
bb61615683..affce6a8bc 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_tx.h +++ b/cloud/filestore/libs/storage/tablet/tablet_tx.h @@ -77,6 +77,8 @@ namespace NCloud::NFileStore::NStorage { xxx(ListNodeXAttr, __VA_ARGS__) \ \ xxx(UnsafeGetNode, __VA_ARGS__) \ + \ + xxx(LoadNodeRefs, __VA_ARGS__) \ // FILESTORE_TABLET_RO_TRANSACTIONS #define FILESTORE_TABLET_RW_TRANSACTIONS(xxx, ...) \ @@ -2099,6 +2101,44 @@ struct TTxIndexTablet Node.Clear(); } }; + + // + // LoadNodeRefs + // + + // The whole point of this transaction is to observe some data in the + // NodeRefs table and populate the contents of TIndexStateNodeUpdates with it + + struct TLoadNodeRefs: TIndexStateNodeUpdates + { + // actually unused, needed in tablet_tx.h to avoid sophisticated + // template tricks + const TRequestInfoPtr RequestInfo; + + const ui64 NodeId; + const TString Cookie; + const ui64 MaxNodeRefs; + + ui64 NextNodeId = 0; + TString NextCookie; + + TLoadNodeRefs( + ui64 nodeId, + TString cookie, + ui64 maxNodeRefs) + : NodeId(nodeId) + , Cookie(std::move(cookie)) + , MaxNodeRefs(maxNodeRefs) + {} + + void Clear() + { + TIndexStateNodeUpdates::Clear(); + + NextNodeId = 0; + NextCookie.clear(); + } + }; }; } // namespace NCloud::NFileStore::NStorage diff --git a/cloud/filestore/libs/storage/tablet/tablet_ut_cache.cpp b/cloud/filestore/libs/storage/tablet/tablet_ut_cache.cpp index b218dbae4f..296b0fb7c5 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_ut_cache.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_ut_cache.cpp @@ -821,8 +821,113 @@ Y_UNIT_TEST_SUITE(TIndexTabletTest_NodesCache) UNIT_ASSERT_VALUES_EQUAL( 1, statsAfter.ROCacheMissCount - statsBefore.ROCacheMissCount); - UNIT_ASSERT_VALUES_EQUAL(0, statsAfter.RWCount - - statsBefore.RWCount); + UNIT_ASSERT_VALUES_EQUAL(0, statsAfter.RWCount - statsBefore.RWCount); + } + + Y_UNIT_TEST(ShouldUseNodeRefsCacheIfOneIsExhaustive) + { + NProto::TStorageConfig storageConfig; + 
storageConfig.SetInMemoryIndexCacheEnabled(true); + storageConfig.SetInMemoryIndexCacheNodesCapacity(100); + storageConfig.SetInMemoryIndexCacheNodeRefsCapacity(100); + storageConfig.SetInMemoryIndexCacheLoadOnTabletStart(true); + TTestEnv env({}, storageConfig); + env.CreateSubDomain("nfs"); + + ui32 nodeIdx = env.CreateNode("nfs"); + ui64 tabletId = env.BootIndexTablet(nodeIdx); + + TIndexTabletClient tablet(env.GetRuntime(), nodeIdx, tabletId); + tablet.InitSession("client", "session"); + + auto id1 = + tablet.CreateNode(TCreateNodeArgs::File(RootNodeId, "test1")) + ->Record.GetNode() + .GetId(); + auto id2 = + tablet.CreateNode(TCreateNodeArgs::File(RootNodeId, "test2")) + ->Record.GetNode() + .GetId(); + + tablet.RebootTablet(); + tablet.InitSession("client", "session"); + + auto statsBefore = GetTxStats(env, tablet); + + // RO transactions, populate the cache. These calls are made to populate + // the Nodes cache, which is also used for ListNodes requests + tablet.GetNodeAttr(RootNodeId, ""); + tablet.GetNodeAttr(id1, ""); + tablet.GetNodeAttr(id2, ""); + + // The noderefs cache is exhaustive thus list nodes should be a cache + // hit + UNIT_ASSERT_VALUES_EQUAL( + 2, + tablet.ListNodes(RootNodeId)->Record.NodesSize()); + + auto statsAfter = GetTxStats(env, tablet); + + UNIT_ASSERT_VALUES_EQUAL( + 1, + statsAfter.ROCacheHitCount - statsBefore.ROCacheHitCount); + UNIT_ASSERT_VALUES_EQUAL( + 3, + statsAfter.ROCacheMissCount - statsBefore.ROCacheMissCount); + UNIT_ASSERT_VALUES_EQUAL(0, statsAfter.RWCount - statsBefore.RWCount); + + statsBefore = statsAfter; + + auto id3 = + tablet.CreateNode(TCreateNodeArgs::Directory(RootNodeId, "test3")) + ->Record.GetNode() + .GetId(); + tablet.CreateNode(TCreateNodeArgs::File(id3, "test4")); + tablet.CreateNode(TCreateNodeArgs::File(id3, "test5")); + tablet.CreateNode(TCreateNodeArgs::File(id3, "test6")); + + /* + |- test1 + |- test2 + |- test3 + |- test4 + |- test5 + |- test6 + */ + + // The NodeRefs cache is still 
exhaustive thus list nodes should be a + // cache hit + UNIT_ASSERT_VALUES_EQUAL(3, tablet.ListNodes(id3)->Record.NodesSize()); + UNIT_ASSERT_VALUES_EQUAL( + 3, + tablet.ListNodes(RootNodeId)->Record.NodesSize()); + + statsAfter = GetTxStats(env, tablet); + + UNIT_ASSERT_VALUES_EQUAL( + 2, + statsAfter.ROCacheHitCount - statsBefore.ROCacheHitCount); + UNIT_ASSERT_VALUES_EQUAL( + 0, + statsAfter.ROCacheMissCount - statsBefore.ROCacheMissCount); + + // Now let us ensure that the cache is evicted + for (int i = 0; i < 100; ++i) { + tablet.CreateNode(TCreateNodeArgs::File(RootNodeId, std::to_string(i))); + } + + statsBefore = statsAfter; + + tablet.ListNodes(RootNodeId); + + statsAfter = GetTxStats(env, tablet); + + UNIT_ASSERT_VALUES_EQUAL( + 0, + statsAfter.ROCacheHitCount - statsBefore.ROCacheHitCount); + UNIT_ASSERT_VALUES_EQUAL( + 1, + statsAfter.ROCacheMissCount - statsBefore.ROCacheMissCount); } } diff --git a/cloud/filestore/libs/storage/tablet/tablet_ut_data.cpp b/cloud/filestore/libs/storage/tablet/tablet_ut_data.cpp index bec020bb45..5554ca354c 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_ut_data.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_ut_data.cpp @@ -3374,11 +3374,11 @@ Y_UNIT_TEST_SUITE(TIndexTabletTest_Data) ++requests; break; } - case TEvIndexTabletPrivate::EvLoadCompactionMapChunkCompleted: { + case TEvIndexTabletPrivate::EvLoadCompactionMapChunkResponse: { lastCompactionMapRangeId = Max( event->Get< TEvIndexTabletPrivate - ::TEvLoadCompactionMapChunkCompleted>()->LastRangeId, + ::TEvLoadCompactionMapChunkResponse>()->LastRangeId, lastCompactionMapRangeId); break; } diff --git a/cloud/filestore/libs/storage/tablet/ya.make b/cloud/filestore/libs/storage/tablet/ya.make index 9ede6d55ac..a18b629ecd 100644 --- a/cloud/filestore/libs/storage/tablet/ya.make +++ b/cloud/filestore/libs/storage/tablet/ya.make @@ -47,6 +47,7 @@ SRCS( tablet_actor_listnodes.cpp tablet_actor_listnodexattr.cpp tablet_actor_loadstate.cpp + 
tablet_actor_loadstate_noderefs.cpp tablet_actor_monitoring.cpp tablet_actor_oplog.cpp tablet_actor_readblob.cpp