From b7bcb718d73b1278510805f71dd2750a8ab40c6e Mon Sep 17 00:00:00 2001 From: Dave Marion Date: Mon, 5 Feb 2024 19:50:17 +0000 Subject: [PATCH] Host tablets needing recovery Modified the TabletManagementIterator to return NEEDS_RECOVERY when the tablet has wals and is not being deleted. Modified TabletGoalState to return the goal of HOSTED when the tablet has wals and tablet availability is UNHOSTED or ONDEMAND. Closes #3663 --- .../core/manager/state/TabletManagement.java | 7 ++++++- .../server/manager/state/TabletGoalState.java | 10 ++++++++++ .../state/TabletManagementIterator.java | 6 ++++++ .../accumulo/manager/TabletGroupWatcher.java | 20 +++++++++++++++---- .../TabletManagementIteratorIT.java | 18 +++++++++++++++-- 5 files changed, 54 insertions(+), 7 deletions(-) diff --git a/core/src/main/java/org/apache/accumulo/core/manager/state/TabletManagement.java b/core/src/main/java/org/apache/accumulo/core/manager/state/TabletManagement.java index 94c41af7ffe..04d956dd8c9 100644 --- a/core/src/main/java/org/apache/accumulo/core/manager/state/TabletManagement.java +++ b/core/src/main/java/org/apache/accumulo/core/manager/state/TabletManagement.java @@ -53,7 +53,12 @@ public class TabletManagement { private static final Text EMPTY = new Text(""); public static enum ManagementAction { - BAD_STATE, NEEDS_COMPACTING, NEEDS_LOCATION_UPDATE, NEEDS_SPLITTING, NEEDS_VOLUME_REPLACEMENT; + BAD_STATE, + NEEDS_COMPACTING, + NEEDS_LOCATION_UPDATE, + NEEDS_RECOVERY, + NEEDS_SPLITTING, + NEEDS_VOLUME_REPLACEMENT; } public static void addActions(final SortedMap decodedRow, diff --git a/server/base/src/main/java/org/apache/accumulo/server/manager/state/TabletGoalState.java b/server/base/src/main/java/org/apache/accumulo/server/manager/state/TabletGoalState.java index 17779a4a7f9..e4a30df8b43 100644 --- a/server/base/src/main/java/org/apache/accumulo/server/manager/state/TabletGoalState.java +++ b/server/base/src/main/java/org/apache/accumulo/server/manager/state/TabletGoalState.java @@ -18,6 +18,7 @@ */ package org.apache.accumulo.server.manager.state; +import org.apache.accumulo.core.client.admin.TabletAvailability; import org.apache.accumulo.core.data.TabletId; import org.apache.accumulo.core.dataImpl.KeyExtent; import org.apache.accumulo.core.dataImpl.TabletIdImpl; @@ -84,6 +85,15 @@ public static TabletGoalState compute(TabletMetadata tm, TabletState currentStat return TabletGoalState.UNASSIGNED; } + // When the tablet has wals and it will not be hosted normally, then cause it to + // be hosted so that recovery can occur. When tablet availability is ONDEMAND or + // UNHOSTED, then this tablet will eventually become unhosted after recovery occurs. + // This could cause a little bit of churn on the cluster w/r/t balancing, but it's + // necessary. + if (!tm.getLogs().isEmpty() && tm.getTabletAvailability() != TabletAvailability.HOSTED) { + return TabletGoalState.HOSTED; + } + if (!params.isTableOnline(tm.getTableId())) { return UNASSIGNED; } diff --git a/server/base/src/main/java/org/apache/accumulo/server/manager/state/TabletManagementIterator.java b/server/base/src/main/java/org/apache/accumulo/server/manager/state/TabletManagementIterator.java index ae0027c200b..af03968e4b8 100644 --- a/server/base/src/main/java/org/apache/accumulo/server/manager/state/TabletManagementIterator.java +++ b/server/base/src/main/java/org/apache/accumulo/server/manager/state/TabletManagementIterator.java @@ -55,6 +55,7 @@ import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.SuspendLocationColumn; import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.TabletColumnFamily; import org.apache.accumulo.core.metadata.schema.TabletMetadata; +import org.apache.accumulo.core.metadata.schema.TabletOperationType; import org.apache.accumulo.core.spi.balancer.SimpleLoadBalancer; import org.apache.accumulo.core.spi.balancer.TabletBalancer; import org.apache.accumulo.core.spi.compaction.CompactionKind; @@ -259,6 +260,11 @@ private void computeTabletManagementActions(final TabletMetadata tm, reasonsToReturnThisTablet.add(ManagementAction.NEEDS_LOCATION_UPDATE); } + if (!tm.getLogs().isEmpty() && (tm.getOperationId() == null + || tm.getOperationId().getType() != TabletOperationType.DELETING)) { + reasonsToReturnThisTablet.add(ManagementAction.NEEDS_RECOVERY); + } + if (tm.getOperationId() == null) { try { final long splitThreshold = diff --git a/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java b/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java index b467d495519..e6782ad7413 100644 --- a/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java +++ b/server/manager/src/main/java/org/apache/accumulo/manager/TabletGroupWatcher.java @@ -460,6 +460,11 @@ private TableMgmtStats manageTablets(Iterator iter, final TabletGoalState goal = TabletGoalState.compute(tm, state, manager.tabletBalancer, tableMgmtParams); + if (actions.contains(ManagementAction.NEEDS_RECOVERY) && goal != TabletGoalState.HOSTED) { + LOG.warn("Tablet has wals, but goal is not hosted. Tablet: {}, goal:{}", tm.getExtent(), + goal); + } + if (actions.contains(ManagementAction.NEEDS_VOLUME_REPLACEMENT)) { tableMgmtStats.totalVolumeReplacements++; if (state == TabletState.UNASSIGNED || state == TabletState.SUSPENDED) { @@ -513,7 +518,8 @@ private TableMgmtStats manageTablets(Iterator iter, } if (actions.contains(ManagementAction.NEEDS_SPLITTING) - && !actions.contains(ManagementAction.NEEDS_VOLUME_REPLACEMENT)) { + && !actions.contains(ManagementAction.NEEDS_VOLUME_REPLACEMENT) + && !actions.contains(ManagementAction.NEEDS_RECOVERY)) { LOG.debug("{} may need splitting.", tm.getExtent()); if (manager.getSplitter().isSplittable(tm)) { if (manager.getSplitter().addSplitStarting(tm.getExtent())) { @@ -529,7 +535,8 @@ private TableMgmtStats manageTablets(Iterator iter, } if (actions.contains(ManagementAction.NEEDS_COMPACTING) - && !actions.contains(ManagementAction.NEEDS_VOLUME_REPLACEMENT)) { + && !actions.contains(ManagementAction.NEEDS_VOLUME_REPLACEMENT) + && !actions.contains(ManagementAction.NEEDS_RECOVERY)) { var jobs = compactionGenerator.generateJobs(tm, TabletManagementIterator.determineCompactionKinds(actions)); LOG.debug("{} may need compacting adding {} jobs", tm.getExtent(), jobs.size()); @@ -542,14 +549,19 @@ private TableMgmtStats manageTablets(Iterator iter, // entries from the queue because we see nothing here for that case. After a full // metadata scan could remove any tablets that were not updated during the scan. - if (actions.contains(ManagementAction.NEEDS_LOCATION_UPDATE)) { + if (actions.contains(ManagementAction.NEEDS_LOCATION_UPDATE) + || actions.contains(ManagementAction.NEEDS_RECOVERY)) { if (tm.getLocation() != null) { filteredServersToShutdown.remove(tm.getLocation().getServerInstance()); } if (goal == TabletGoalState.HOSTED) { - if ((state != TabletState.HOSTED && !tm.getLogs().isEmpty()) + + // RecoveryManager.recoverLogs will return false when all of the logs + // have been sorted so that recovery can occur. Delay the hosting of + // the Tablet until the sorting is finished. + if ((state != TabletState.HOSTED && actions.contains(ManagementAction.NEEDS_RECOVERY)) && manager.recoveryManager.recoverLogs(tm.getExtent(), tm.getLogs())) { LOG.debug("Not hosting {} as it needs recovery, logs: {}", tm.getExtent(), tm.getLogs().size()); diff --git a/test/src/main/java/org/apache/accumulo/test/functional/TabletManagementIteratorIT.java b/test/src/main/java/org/apache/accumulo/test/functional/TabletManagementIteratorIT.java index 8d4a354a2d4..ff6f4d34cb5 100644 --- a/test/src/main/java/org/apache/accumulo/test/functional/TabletManagementIteratorIT.java +++ b/test/src/main/java/org/apache/accumulo/test/functional/TabletManagementIteratorIT.java @@ -110,7 +110,7 @@ public void test() throws AccumuloException, AccumuloSecurityException, TableExi try (AccumuloClient client = Accumulo.newClient().from(getClientProps()).build()) { - String[] tables = getUniqueNames(8); + String[] tables = getUniqueNames(9); final String t1 = tables[0]; final String t2 = tables[1]; final String t3 = tables[2]; @@ -119,6 +119,7 @@ public void test() throws AccumuloException, AccumuloSecurityException, TableExi final String metaCopy2 = tables[5]; final String metaCopy3 = tables[6]; final String metaCopy4 = tables[7]; + final String metaCopy5 = tables[8]; // create some metadata createTable(client, t1, true); @@ -152,6 +153,7 @@ public void test() throws AccumuloException, AccumuloSecurityException, TableExi copyTable(client, metaCopy1, metaCopy2); copyTable(client, metaCopy1, metaCopy3); copyTable(client, metaCopy1, metaCopy4); + copyTable(client, metaCopy1, metaCopy5); // t1 is unassigned, setting to always will generate a change to host tablets setTabletAvailability(client, metaCopy1, t1, TabletAvailability.HOSTED.name()); @@ -177,6 +179,18 @@ public void test() throws AccumuloException, AccumuloSecurityException, TableExi assertEquals(1, findTabletsNeedingAttention(client, metaCopy2, tabletMgmtParams), "Only 1 of 2 tablets in table t1 should be returned"); + // Test the recovery cases + createLogEntry(client, metaCopy5, t1); + setTabletAvailability(client, metaCopy5, t1, TabletAvailability.UNHOSTED.name()); + assertEquals(1, findTabletsNeedingAttention(client, metaCopy5, tabletMgmtParams), + "Only 1 of 2 tablets in table t1 should be returned"); + setTabletAvailability(client, metaCopy5, t1, TabletAvailability.ONDEMAND.name()); + assertEquals(1, findTabletsNeedingAttention(client, metaCopy5, tabletMgmtParams), + "Only 1 of 2 tablets in table t1 should be returned"); + setTabletAvailability(client, metaCopy5, t1, TabletAvailability.HOSTED.name()); + assertEquals(2, findTabletsNeedingAttention(client, metaCopy5, tabletMgmtParams), + "2 tablets in table t1 should be returned"); + // Remove location and set merge operation id on both tablets // These tablets should not need attention as they have no WALs setTabletAvailability(client, metaCopy4, t4, TabletAvailability.HOSTED.name()); @@ -225,7 +239,7 @@ public void test() throws AccumuloException, AccumuloSecurityException, TableExi "Should have one tablet that needs a volume replacement"); // clean up - dropTables(client, t1, t2, t3, t4, metaCopy1, metaCopy2, metaCopy3, metaCopy4); + dropTables(client, t1, t2, t3, t4, metaCopy1, metaCopy2, metaCopy3, metaCopy4, metaCopy5); } }