Skip to content

Commit

Permalink
Host tablets needing recovery
Browse files Browse the repository at this point in the history
Modified the TabletManagementIterator to return NEEDS_RECOVERY
when the tablet has WALs and is not being deleted. Modified
TabletGoalState to return a goal of HOSTED when the tablet
has WALs and its tablet availability is UNHOSTED or ONDEMAND.

Closes apache#3663
  • Loading branch information
dlmarion committed Feb 5, 2024
1 parent 5f5cbd3 commit b7bcb71
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,12 @@ public class TabletManagement {
private static final Text EMPTY = new Text("");

public static enum ManagementAction {
BAD_STATE, NEEDS_COMPACTING, NEEDS_LOCATION_UPDATE, NEEDS_SPLITTING, NEEDS_VOLUME_REPLACEMENT;
BAD_STATE,
NEEDS_COMPACTING,
NEEDS_LOCATION_UPDATE,
NEEDS_RECOVERY,
NEEDS_SPLITTING,
NEEDS_VOLUME_REPLACEMENT;
}

public static void addActions(final SortedMap<Key,Value> decodedRow,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
*/
package org.apache.accumulo.server.manager.state;

import org.apache.accumulo.core.client.admin.TabletAvailability;
import org.apache.accumulo.core.data.TabletId;
import org.apache.accumulo.core.dataImpl.KeyExtent;
import org.apache.accumulo.core.dataImpl.TabletIdImpl;
Expand Down Expand Up @@ -84,6 +85,15 @@ public static TabletGoalState compute(TabletMetadata tm, TabletState currentStat
return TabletGoalState.UNASSIGNED;
}

// When the tablet has WALs but would not normally be hosted, force it to be
// hosted so that recovery can occur. When tablet availability is ONDEMAND or
// UNHOSTED, the tablet will eventually become unhosted again after recovery occurs.
// This could cause a little bit of churn on the cluster with respect to balancing,
// but it is necessary.
if (!tm.getLogs().isEmpty() && tm.getTabletAvailability() != TabletAvailability.HOSTED) {
return TabletGoalState.HOSTED;
}

if (!params.isTableOnline(tm.getTableId())) {
return UNASSIGNED;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.SuspendLocationColumn;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.TabletColumnFamily;
import org.apache.accumulo.core.metadata.schema.TabletMetadata;
import org.apache.accumulo.core.metadata.schema.TabletOperationType;
import org.apache.accumulo.core.spi.balancer.SimpleLoadBalancer;
import org.apache.accumulo.core.spi.balancer.TabletBalancer;
import org.apache.accumulo.core.spi.compaction.CompactionKind;
Expand Down Expand Up @@ -259,6 +260,11 @@ private void computeTabletManagementActions(final TabletMetadata tm,
reasonsToReturnThisTablet.add(ManagementAction.NEEDS_LOCATION_UPDATE);
}

if (!tm.getLogs().isEmpty() && (tm.getOperationId() == null
|| tm.getOperationId().getType() != TabletOperationType.DELETING)) {
reasonsToReturnThisTablet.add(ManagementAction.NEEDS_RECOVERY);
}

if (tm.getOperationId() == null) {
try {
final long splitThreshold =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,11 @@ private TableMgmtStats manageTablets(Iterator<TabletManagement> iter,
final TabletGoalState goal =
TabletGoalState.compute(tm, state, manager.tabletBalancer, tableMgmtParams);

if (actions.contains(ManagementAction.NEEDS_RECOVERY) && goal != TabletGoalState.HOSTED) {
LOG.warn("Tablet has wals, but goal is not hosted. Tablet: {}, goal:{}", tm.getExtent(),
goal);
}

if (actions.contains(ManagementAction.NEEDS_VOLUME_REPLACEMENT)) {
tableMgmtStats.totalVolumeReplacements++;
if (state == TabletState.UNASSIGNED || state == TabletState.SUSPENDED) {
Expand Down Expand Up @@ -513,7 +518,8 @@ private TableMgmtStats manageTablets(Iterator<TabletManagement> iter,
}

if (actions.contains(ManagementAction.NEEDS_SPLITTING)
&& !actions.contains(ManagementAction.NEEDS_VOLUME_REPLACEMENT)) {
&& !actions.contains(ManagementAction.NEEDS_VOLUME_REPLACEMENT)
&& !actions.contains(ManagementAction.NEEDS_RECOVERY)) {
LOG.debug("{} may need splitting.", tm.getExtent());
if (manager.getSplitter().isSplittable(tm)) {
if (manager.getSplitter().addSplitStarting(tm.getExtent())) {
Expand All @@ -529,7 +535,8 @@ private TableMgmtStats manageTablets(Iterator<TabletManagement> iter,
}

if (actions.contains(ManagementAction.NEEDS_COMPACTING)
&& !actions.contains(ManagementAction.NEEDS_VOLUME_REPLACEMENT)) {
&& !actions.contains(ManagementAction.NEEDS_VOLUME_REPLACEMENT)
&& !actions.contains(ManagementAction.NEEDS_RECOVERY)) {
var jobs = compactionGenerator.generateJobs(tm,
TabletManagementIterator.determineCompactionKinds(actions));
LOG.debug("{} may need compacting adding {} jobs", tm.getExtent(), jobs.size());
Expand All @@ -542,14 +549,19 @@ private TableMgmtStats manageTablets(Iterator<TabletManagement> iter,
// entries from the queue because we see nothing here for that case. After a full
// metadata scan could remove any tablets that were not updated during the scan.

if (actions.contains(ManagementAction.NEEDS_LOCATION_UPDATE)) {
if (actions.contains(ManagementAction.NEEDS_LOCATION_UPDATE)
|| actions.contains(ManagementAction.NEEDS_RECOVERY)) {

if (tm.getLocation() != null) {
filteredServersToShutdown.remove(tm.getLocation().getServerInstance());
}

if (goal == TabletGoalState.HOSTED) {
if ((state != TabletState.HOSTED && !tm.getLogs().isEmpty())

// RecoveryManager.recoverLogs will return false when all of the logs
// have been sorted so that recovery can occur. Delay the hosting of
// the Tablet until the sorting is finished.
if ((state != TabletState.HOSTED && actions.contains(ManagementAction.NEEDS_RECOVERY))
&& manager.recoveryManager.recoverLogs(tm.getExtent(), tm.getLogs())) {
LOG.debug("Not hosting {} as it needs recovery, logs: {}", tm.getExtent(),
tm.getLogs().size());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ public void test() throws AccumuloException, AccumuloSecurityException, TableExi

try (AccumuloClient client = Accumulo.newClient().from(getClientProps()).build()) {

String[] tables = getUniqueNames(8);
String[] tables = getUniqueNames(9);
final String t1 = tables[0];
final String t2 = tables[1];
final String t3 = tables[2];
Expand All @@ -119,6 +119,7 @@ public void test() throws AccumuloException, AccumuloSecurityException, TableExi
final String metaCopy2 = tables[5];
final String metaCopy3 = tables[6];
final String metaCopy4 = tables[7];
final String metaCopy5 = tables[8];

// create some metadata
createTable(client, t1, true);
Expand Down Expand Up @@ -152,6 +153,7 @@ public void test() throws AccumuloException, AccumuloSecurityException, TableExi
copyTable(client, metaCopy1, metaCopy2);
copyTable(client, metaCopy1, metaCopy3);
copyTable(client, metaCopy1, metaCopy4);
copyTable(client, metaCopy1, metaCopy5);

// t1 is unassigned, setting its availability to HOSTED will generate a change to host tablets
setTabletAvailability(client, metaCopy1, t1, TabletAvailability.HOSTED.name());
Expand All @@ -177,6 +179,18 @@ public void test() throws AccumuloException, AccumuloSecurityException, TableExi
assertEquals(1, findTabletsNeedingAttention(client, metaCopy2, tabletMgmtParams),
"Only 1 of 2 tablets in table t1 should be returned");

// Test the recovery cases
createLogEntry(client, metaCopy5, t1);
setTabletAvailability(client, metaCopy5, t1, TabletAvailability.UNHOSTED.name());
assertEquals(1, findTabletsNeedingAttention(client, metaCopy5, tabletMgmtParams),
"Only 1 of 2 tablets in table t1 should be returned");
setTabletAvailability(client, metaCopy5, t1, TabletAvailability.ONDEMAND.name());
assertEquals(1, findTabletsNeedingAttention(client, metaCopy5, tabletMgmtParams),
"Only 1 of 2 tablets in table t1 should be returned");
setTabletAvailability(client, metaCopy5, t1, TabletAvailability.HOSTED.name());
assertEquals(2, findTabletsNeedingAttention(client, metaCopy5, tabletMgmtParams),
"2 tablets in table t1 should be returned");

// Remove location and set merge operation id on both tablets
// These tablets should not need attention as they have no WALs
setTabletAvailability(client, metaCopy4, t4, TabletAvailability.HOSTED.name());
Expand Down Expand Up @@ -225,7 +239,7 @@ public void test() throws AccumuloException, AccumuloSecurityException, TableExi
"Should have one tablet that needs a volume replacement");

// clean up
dropTables(client, t1, t2, t3, t4, metaCopy1, metaCopy2, metaCopy3, metaCopy4);
dropTables(client, t1, t2, t3, t4, metaCopy1, metaCopy2, metaCopy3, metaCopy4, metaCopy5);
}
}

Expand Down

0 comments on commit b7bcb71

Please sign in to comment.