
Commit 0058029

AhmedSoliman authored and facebook-github-bot committed
Support removing PROVISIONING nodes and apply DRAINED maintenance on them
Summary: This addresses the following cases:

- A node that is PROVISIONING should be removable (removeNodes shouldn't reject it).
- Applying a MAY_DISAPPEAR or DRAINED maintenance on a node/shard that is PROVISIONING should appear as COMPLETED (MaintenanceProgress).
- The shard workflow treats the PROVISIONING and NONE states as equivalent: both count as already completed, and no further actions are needed.

Reviewed By: MohamedBassem

Differential Revision: D17817955

fbshipit-source-id: b56df01bbc5046d166a314ff1edcf589f24cef54
1 parent 5fae8d7 commit 0058029
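
To make the summary concrete, here is a minimal, self-contained sketch of the target-check semantics described above: a shard that is still PROVISIONING holds no data, so it already satisfies both the MAY_DISAPPEAR and DRAINED targets. This is not the LogDevice code itself; the enum and the `isTargetAchieved` helper below are simplified stand-ins (plain `std::unordered_set` and `assert` instead of `folly::F14FastSet` and `ld_assert`), mirroring the MaintenanceManager.cpp change further down.

```cpp
#include <cassert>
#include <unordered_set>

// Simplified stand-in for logdevice's ShardOperationalState enum.
enum class ShardOperationalState {
  ENABLED,
  MAY_DISAPPEAR,
  MIGRATING_DATA,
  PROVISIONING,
  DRAINED,
};

// Sketch of the target check: a shard that is still PROVISIONING holds no
// data, so it already satisfies both the MAY_DISAPPEAR and DRAINED targets.
bool isTargetAchieved(ShardOperationalState current,
                      ShardOperationalState target) {
  static const std::unordered_set<ShardOperationalState> may_disappear_states{
      ShardOperationalState::MAY_DISAPPEAR,
      ShardOperationalState::MIGRATING_DATA,
      ShardOperationalState::PROVISIONING,
      ShardOperationalState::DRAINED};
  // States considered at or above DRAINED.
  static const std::unordered_set<ShardOperationalState> drained_states{
      ShardOperationalState::PROVISIONING, ShardOperationalState::DRAINED};

  if (target == ShardOperationalState::MAY_DISAPPEAR) {
    return may_disappear_states.count(current) > 0;
  }
  if (target == ShardOperationalState::DRAINED) {
    return drained_states.count(current) > 0;
  }
  return false; // no other targets are expected in this sketch
}

int main() {
  // A PROVISIONING shard immediately satisfies both maintenance targets.
  assert(isTargetAchieved(ShardOperationalState::PROVISIONING,
                          ShardOperationalState::DRAINED));
  assert(isTargetAchieved(ShardOperationalState::PROVISIONING,
                          ShardOperationalState::MAY_DISAPPEAR));
  // An ENABLED shard still has to be drained first.
  assert(!isTargetAchieved(ShardOperationalState::ENABLED,
                           ShardOperationalState::DRAINED));
  return 0;
}
```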

File tree (4 files changed: +63 −2)

- logdevice/admin/cluster_membership/RemoveNodesHandler.cpp
- logdevice/admin/maintenance/MaintenanceManager.cpp
- logdevice/admin/maintenance/ShardWorkflow.cpp
- logdevice/admin/test/ClusterMembershipAPIHandlerTest.cpp

logdevice/admin/cluster_membership/RemoveNodesHandler.cpp (+3 −1)

```diff
@@ -74,7 +74,9 @@ check_is_disabled(const NodesConfiguration& nodes_configuration,
         nodes_configuration.getStorageMembership()->getShardStates(idx);
     for (const auto& state : states) {
       storage_disabled &=
-          state.second.storage_state == membership::StorageState::NONE;
+          (state.second.storage_state == membership::StorageState::NONE ||
+           state.second.storage_state ==
+               membership::StorageState::PROVISIONING);
     }
   }
 
```
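
The removability rule this hunk implements, shown as a standalone sketch under simplifying assumptions: the `allShardsDisabled` helper and the bare `StorageState` enum below are hypothetical stand-ins (the real check iterates the NodesConfiguration storage membership, and `membership::StorageState` has more values). A node's storage counts as disabled only when every one of its shards is NONE or still PROVISIONING.

```cpp
#include <vector>

// Simplified stand-in for membership::StorageState.
enum class StorageState { PROVISIONING, NONE, READ_ONLY, READ_WRITE };

// A node's storage counts as disabled (and the node as removable) only if
// every one of its shards is NONE or still PROVISIONING.
bool allShardsDisabled(const std::vector<StorageState>& shard_states) {
  bool storage_disabled = true;
  for (const auto state : shard_states) {
    storage_disabled &=
        (state == StorageState::NONE || state == StorageState::PROVISIONING);
  }
  return storage_disabled;
}
```

With this rule, removeNodes no longer rejects nodes whose shards never left PROVISIONING.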

logdevice/admin/maintenance/MaintenanceManager.cpp (+9 −1)

```diff
@@ -601,6 +601,9 @@ MaintenanceManager::getShardOperationalStateInternal(ShardID shard) const {
         result = ShardOperationalState::ENABLED;
       }
       break;
+    case membership::StorageState::PROVISIONING:
+      result = ShardOperationalState::PROVISIONING;
+      break;
     default:
       // This should never happen. All storage state
       // cases are handled above
@@ -1896,12 +1899,17 @@ bool MaintenanceManager::isTargetAchieved(ShardOperationalState current,
   static folly::F14FastSet<ShardOperationalState> may_disappear_states{
       {ShardOperationalState::MAY_DISAPPEAR,
        ShardOperationalState::MIGRATING_DATA,
+       ShardOperationalState::PROVISIONING,
        ShardOperationalState::DRAINED}};
 
+  // Any of these states are considered higher or equals the DRAINED state.
+  static folly::F14FastSet<ShardOperationalState> drained_states{
+      {ShardOperationalState::PROVISIONING, ShardOperationalState::DRAINED}};
+
   if (target == ShardOperationalState::MAY_DISAPPEAR) {
     return may_disappear_states.count(current) > 0;
   } else if (target == ShardOperationalState::DRAINED) {
-    return current == ShardOperationalState::DRAINED;
+    return drained_states.count(current) > 0;
   } else {
     // we don't know any other targets.
     ld_assert(false);
```
logdevice/admin/maintenance/ShardWorkflow.cpp (+4)

```diff
@@ -125,6 +125,10 @@ void ShardWorkflow::computeMaintenanceStatusForDrain() {
 
   switch (current_storage_state_) {
     case membership::StorageState::NONE:
+    // If the node is provisioning, we can consider it drained as well. Any
+    // maintenance that needs this node to be drained will appear completed
+    // immediately.
+    case membership::StorageState::PROVISIONING:
       // We have reached the target already, there is no further transitions
       // needed to declare the shard as DRAINED.
       updateStatus(MaintenanceStatus::COMPLETED);
```
logdevice/admin/test/ClusterMembershipAPIHandlerTest.cpp (+47)

```diff
@@ -165,6 +165,53 @@ TEST_F(ClusterMemebershipAPIIntegrationTest, TestRemoveAliveNodes) {
   }
 }
 
+TEST_F(ClusterMemebershipAPIIntegrationTest, TestRemoveProvisioningNodes) {
+  ASSERT_EQ(0, cluster_->start({0, 1, 2, 3}));
+  cluster_->getNode(0).waitUntilNodeStateReady();
+  auto admin_client = cluster_->getNode(0).createAdminClient();
+
+  {
+    // Add two nodes with 2 shards each. They will get added as PROVISIONING.
+    thrift::AddNodesResponse resp;
+    admin_client->sync_addNodes(resp, buildAddNodesRequest({100, 101}));
+    ASSERT_EQ(2, resp.added_nodes.size());
+
+    wait_until("AdminServer's NC picks the additions", [&]() {
+      thrift::NodesConfigResponse nc;
+      admin_client->sync_getNodesConfig(nc, thrift::NodesFilter{});
+      return nc.version >= resp.new_nodes_configuration_version;
+    });
+  }
+
+  thrift::RemoveNodesResponse resp;
+  admin_client->sync_removeNodes(resp, buildRemoveNodesRequest({100, 101}));
+  EXPECT_EQ(2, resp.removed_nodes.size());
+}
+
+TEST_F(ClusterMemebershipAPIIntegrationTest, TestApplyDrainOnProvisioning) {
+  ASSERT_EQ(0, cluster_->start({0, 1, 2, 3}));
+  cluster_->getNode(0).waitUntilNodeStateReady();
+  auto admin_client = cluster_->getNode(0).createAdminClient();
+
+  {
+    // Add two nodes with 2 shards each. They will get added as PROVISIONING.
+    thrift::AddNodesResponse resp;
+    admin_client->sync_addNodes(resp, buildAddNodesRequest({100, 101}));
+    ASSERT_EQ(2, resp.added_nodes.size());
+
+    wait_until("AdminServer's NC picks the additions", [&]() {
+      thrift::NodesConfigResponse nc;
+      admin_client->sync_getNodesConfig(nc, thrift::NodesFilter{});
+      return nc.version >= resp.new_nodes_configuration_version;
+    });
+  }
+
+  // We didn't provision the shards, let's apply a DRAINED maintenance
+  // immediately.
+  disableAndWait(
+      {mkShardID(100, -1), mkShardID(101, -1)}, {mkNodeID(100), mkNodeID(101)});
+}
+
 TEST_F(ClusterMemebershipAPIIntegrationTest, TestRemoveNonExistentNode) {
   ASSERT_EQ(0, cluster_->start({0, 1, 2, 3}));
   auto admin_client = cluster_->getNode(0).createAdminClient();
```
