Skip to content
This repository was archived by the owner on Jan 7, 2022. It is now read-only.

Commit 218e006

Browse files
MohamedBassemfacebook-github-bot
authored andcommittedOct 9, 2019
Retry marking shards as PROVISIONED on VERSION_MISMATCH
Summary: Marking shards as provisioned can fail because of races with other shards during cluster startup. Let's harden it against that by retries with exponential backoffs and jitters. Reviewed By: AhmedSoliman Differential Revision: D17761397 fbshipit-source-id: 6382c0dbff52380a9832be4a1244505815902b39
1 parent 19daa59 commit 218e006

4 files changed

+55
-39
lines changed
 

‎logdevice/server/RebuildingCoordinator.cpp

-1
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,6 @@ int RebuildingCoordinator::checkMarkers() {
306306
RebuildingMarkerChecker checker(
307307
storage_mem->getShardStates(getMyNodeID().index()),
308308
getMyNodeID(),
309-
storage_mem->getVersion(),
310309
processor_->getNodesConfigurationManager(),
311310
shardedStore_);
312311

‎logdevice/server/RebuildingMarkerChecker.cpp

+55-33
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,17 @@
1010

1111
#include <tuple>
1212

13+
#include "logdevice/common/RetryHandler.h"
14+
1315
namespace facebook { namespace logdevice {
1416

1517
RebuildingMarkerChecker::RebuildingMarkerChecker(
1618
const std::unordered_map<shard_index_t, membership::ShardState>& shards,
1719
NodeID my_node_id,
18-
membership::MembershipVersion::Type storage_version,
1920
configuration::NodesConfigurationAPI* nc_api,
2021
ShardedLocalLogStore* sharded_store)
2122
: shards_(shards),
2223
my_node_id_(std::move(my_node_id)),
23-
storage_version_(storage_version),
2424
nc_api_(nc_api),
2525
sharded_store_(sharded_store) {
2626
ld_check(sharded_store_);
@@ -124,37 +124,59 @@ Status RebuildingMarkerChecker::markShardsAsProvisioned(
124124
return Status::OK;
125125
}
126126

127-
ld_info("Will mark shards %s as provisioned in the NodesConfiguration",
128-
toString(to_be_updated).c_str());
129-
130-
using namespace configuration::nodes;
131-
using namespace membership;
132-
133-
// Build the MARK_SHARD_PROVISIONED update
134-
NodesConfiguration::Update update;
135-
update.storage_config_update = std::make_unique<StorageConfig::Update>();
136-
update.storage_config_update->membership_update =
137-
std::make_unique<StorageMembership::Update>(storage_version_);
138-
for (auto shard : shards) {
139-
update.storage_config_update->membership_update->addShard(
140-
ShardID(my_node_id_.index(), shard),
141-
{
142-
StorageStateTransition::MARK_SHARD_PROVISIONED,
143-
Condition::EMPTY_SHARD | Condition::LOCAL_STORE_READABLE |
144-
Condition::NO_SELF_REPORT_MISSING_DATA,
145-
});
146-
}
147-
148-
// Apply the update
149-
Status update_status;
150-
folly::Baton<> b;
151-
nc_api_->update(std::move(update),
152-
[&](Status st, std::shared_ptr<const NodesConfiguration>) {
153-
update_status = st;
154-
b.post();
155-
});
156-
b.wait();
157-
return update_status;
127+
auto result = RetryHandler<Status>::syncRun(
128+
[&](size_t trial_num) {
129+
ld_info("Will mark shards %s as PROVISIONED in the NodesConfiguration "
130+
"(trial #%ld)",
131+
toString(to_be_updated).c_str(),
132+
trial_num);
133+
134+
using namespace configuration::nodes;
135+
using namespace membership;
136+
137+
auto storage_membership = nc_api_->getConfig()->getStorageMembership();
138+
auto storage_version = storage_membership->getVersion();
139+
140+
// Build the MARK_SHARD_PROVISIONED update
141+
NodesConfiguration::Update update;
142+
update.storage_config_update =
143+
std::make_unique<StorageConfig::Update>();
144+
update.storage_config_update->membership_update =
145+
std::make_unique<StorageMembership::Update>(storage_version);
146+
for (auto shard : shards) {
147+
auto shard_id = ShardID(my_node_id_.index(), shard);
148+
if (storage_membership->getShardState(shard_id)->storage_state !=
149+
StorageState::PROVISIONING) {
150+
continue;
151+
}
152+
update.storage_config_update->membership_update->addShard(
153+
shard_id,
154+
{
155+
StorageStateTransition::MARK_SHARD_PROVISIONED,
156+
Condition::EMPTY_SHARD | Condition::LOCAL_STORE_READABLE |
157+
Condition::NO_SELF_REPORT_MISSING_DATA,
158+
});
159+
}
160+
161+
// Apply the update
162+
Status update_status;
163+
folly::Baton<> b;
164+
nc_api_->update(
165+
std::move(update),
166+
[&](Status st, std::shared_ptr<const NodesConfiguration>) {
167+
update_status = st;
168+
b.post();
169+
});
170+
b.wait();
171+
return update_status;
172+
},
173+
[](const Status& st) { return st == Status::VERSION_MISMATCH; },
174+
/* num_tries */ 10,
175+
/* backoff_min */ std::chrono::seconds(1),
176+
/* backoff_max */ std::chrono::seconds(60),
177+
/* jitter_param */ 0.25);
178+
179+
return result.hasValue() ? result.value() : result.error();
158180
}
159181

160182
}} // namespace facebook::logdevice

‎logdevice/server/RebuildingMarkerChecker.h

-2
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,6 @@ class RebuildingMarkerChecker {
5757
RebuildingMarkerChecker(
5858
const std::unordered_map<shard_index_t, membership::ShardState>& shards,
5959
NodeID my_node_id,
60-
membership::MembershipVersion::Type storage_version,
6160
configuration::NodesConfigurationAPI* nc_api,
6261
ShardedLocalLogStore* sharded_store);
6362

@@ -84,7 +83,6 @@ class RebuildingMarkerChecker {
8483
// Shards to checkm
8584
std::unordered_map<shard_index_t, membership::ShardState> shards_;
8685
NodeID my_node_id_;
87-
membership::MembershipVersion::Type storage_version_;
8886
// The NCAPI to apply the NodesConfiguration update. The update is ignored
8987
// if it's a nullptr (meaning that the NCM is disabled).
9088
// If the NCAPI is set, it has to outlive the RebuildingMarkerChecker.

‎logdevice/server/test/RebuildingMarkerCheckerTest.cpp

-3
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ TEST(RebuildingMarkerCheckerTest, testAllProvisioning) {
4848
RebuildingMarkerChecker checker(
4949
nc.getStorageMembership()->getShardStates(node_idx),
5050
NodeID(node_idx, generation),
51-
nc.getStorageMembership()->getVersion(),
5251
nc_api.get(),
5352
store.get());
5453

@@ -93,7 +92,6 @@ TEST(RebuildingMarkerCheckerTest, testFirstGeneration) {
9392
RebuildingMarkerChecker checker(
9493
nc.getStorageMembership()->getShardStates(node_idx),
9594
NodeID(node_idx, generation),
96-
nc.getStorageMembership()->getVersion(),
9795
nullptr /* simulating a disabled NCM */,
9896
store.get());
9997

@@ -145,7 +143,6 @@ TEST(RebuildingMarkerCheckerTest, testMissingData) {
145143
RebuildingMarkerChecker checker(
146144
nc.getStorageMembership()->getShardStates(node_idx),
147145
NodeID(node_idx, generation),
148-
nc.getStorageMembership()->getVersion(),
149146
nc_api.get(),
150147
store.get());
151148

0 commit comments

Comments
 (0)
This repository has been archived.