From d53c1c65fe6bf89d63fdd01fa4f42e639e6ff455 Mon Sep 17 00:00:00 2001 From: "Dom G." Date: Fri, 13 Dec 2024 16:54:07 -0500 Subject: [PATCH 01/31] Improve VerifySerialRecoveryITgst (#5182) Improves the logging and assert messages for this flaky IT so it will be easier to debug next time it fails. --- .../accumulo/test/VerifySerialRecoveryIT.java | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/test/src/main/java/org/apache/accumulo/test/VerifySerialRecoveryIT.java b/test/src/main/java/org/apache/accumulo/test/VerifySerialRecoveryIT.java index 3dc8b272c56..a6c2046db24 100644 --- a/test/src/main/java/org/apache/accumulo/test/VerifySerialRecoveryIT.java +++ b/test/src/main/java/org/apache/accumulo/test/VerifySerialRecoveryIT.java @@ -46,9 +46,13 @@ import org.apache.hadoop.fs.RawLocalFileSystem; import org.apache.hadoop.io.Text; import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class VerifySerialRecoveryIT extends ConfigurableMacBase { + private static final Logger log = LoggerFactory.getLogger(VerifySerialRecoveryIT.class); + private static final byte[] HEXCHARS = {0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66}; @@ -114,32 +118,31 @@ public void testSerializedRecovery() throws Exception { assertEquals(0, cluster.exec(Admin.class, "stopAll").getProcess().waitFor()); ts.getProcess().waitFor(); String result = ts.readStdOut(); - for (String line : result.split("\n")) { - System.out.println(line); - } + log.info(result); + // walk through the output, verifying that only a single normal recovery was running at one // time - boolean started = false; + boolean ongoingRecovery = false; int recoveries = 0; var pattern = Pattern.compile(".*recovered \\d+ mutations creating \\d+ entries from \\d+ walogs.*"); for (String line : result.split("\n")) { - // ignore metadata tables + // ignore metadata and root tables if (line.contains("!0") || line.contains("+r")) { continue; } if (line.contains("recovering data from walogs")) { - assertFalse(started); - started = true; + assertFalse(ongoingRecovery, "Saw recovery start before previous recovery finished"); + ongoingRecovery = true; recoveries++; } if (pattern.matcher(line).matches()) { - assertTrue(started); - started = false; + assertTrue(ongoingRecovery, "Saw recovery end without recovery start"); + ongoingRecovery = false; } } - assertFalse(started); - assertTrue(recoveries > 0); + assertFalse(ongoingRecovery, "Expected no ongoing recovery at end of test"); + assertTrue(recoveries > 0, "Expected at least one recovery to have occurred"); } } } From 78cbdbe5560272028b93401b9f2f4f792c625798 Mon Sep 17 00:00:00 2001 From: Keith Turner Date: Sat, 14 Dec 2024 10:32:27 -0500 Subject: [PATCH 02/31] Only read data for fate lock from zookeeper when its used (#5180) --- .../java/org/apache/accumulo/core/fate/zookeeper/FateLock.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/FateLock.java b/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/FateLock.java index fa0e4db7edd..9ac21a8d31c 100644 --- a/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/FateLock.java +++ b/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/FateLock.java @@ -103,9 +103,9 @@ public SortedMap getEarlierEntries(long entry) { for (String name : children) { // this try catch must be done inside the loop because some subset of the children may exist try { - byte[] data = zoo.getData(path + "/" + name); long order = Long.parseLong(name.substring(PREFIX.length())); if (order <= entry) { + byte[] data = zoo.getData(path + "/" + name); result.put(order, data); } } catch (KeeperException.NoNodeException ex) { From 7b8b21335ecd6adc168e9fb99aae58cbff2b75dd Mon Sep 17 00:00:00 2001 From: Kevin Rathbun Date: Mon, 16 Dec 2024 10:37:57 -0500 Subject: [PATCH 03/31] Adds new accumulo command (#5073) * Adds new accumulo command: Adds new `accumulo check-accumulo-properties` command which checks the provided Accumulo configuration file for errors. Only checks the file contents, and not any running instance, so useful for verifying config prior to init. Performs a subset of the checks that `accumulo check-server-config` does. Useful for (at least mostly) validating the `accumulo.properties` file without a running instance. Co-authored-by: Christopher Tubbs --- .../server/conf/CheckAccumuloProperties.java | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 server/base/src/main/java/org/apache/accumulo/server/conf/CheckAccumuloProperties.java diff --git a/server/base/src/main/java/org/apache/accumulo/server/conf/CheckAccumuloProperties.java b/server/base/src/main/java/org/apache/accumulo/server/conf/CheckAccumuloProperties.java new file mode 100644 index 00000000000..e7c7394fece --- /dev/null +++ b/server/base/src/main/java/org/apache/accumulo/server/conf/CheckAccumuloProperties.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.accumulo.server.conf; + +import java.io.File; +import java.io.IOException; + +import org.apache.accumulo.core.conf.SiteConfiguration; +import org.apache.accumulo.server.ServerDirs; +import org.apache.accumulo.server.fs.VolumeManagerImpl; +import org.apache.accumulo.start.spi.KeywordExecutable; +import org.apache.hadoop.conf.Configuration; + +import com.google.auto.service.AutoService; +import com.google.common.base.Preconditions; + +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; + +@AutoService(KeywordExecutable.class) +public class CheckAccumuloProperties implements KeywordExecutable { + + @SuppressFBWarnings(value = "PATH_TRAVERSAL_IN", justification = "intentional user-provided path") + public static void main(String[] args) throws IOException { + Preconditions.checkArgument(args.length == 1, + "Expected 1 argument (the properties file path), got " + args.length); + var hadoopConfig = new Configuration(); + var siteConfig = SiteConfiguration.fromFile(new File(args[0])).build(); + + VolumeManagerImpl.get(siteConfig, hadoopConfig); + new ServerDirs(siteConfig, hadoopConfig); + } + + @Override + public String keyword() { + return "check-accumulo-properties"; + } + + @Override + public String description() { + return "Checks the provided Accumulo configuration file for errors. " + + "This only checks the contents of the file and not any running Accumulo system, " + + "so it can be used prior to init, but only performs a subset of the checks done by " + + (new CheckServerConfig().keyword()); + } + + @Override + public void execute(String[] args) throws Exception { + main(args); + } +} From 79d972f83b345adbf4cd4b65470d53b8b4f13f98 Mon Sep 17 00:00:00 2001 From: Kevin Rathbun Date: Mon, 16 Dec 2024 10:39:50 -0500 Subject: [PATCH 04/31] Fix Fate pool watcher bug (#5171) Was incorrectly calculating the number of TransactionRunners to execute --- core/src/main/java/org/apache/accumulo/core/fate/Fate.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/accumulo/core/fate/Fate.java b/core/src/main/java/org/apache/accumulo/core/fate/Fate.java index 280922a332b..8287edbb334 100644 --- a/core/src/main/java/org/apache/accumulo/core/fate/Fate.java +++ b/core/src/main/java/org/apache/accumulo/core/fate/Fate.java @@ -250,7 +250,7 @@ public void startTransactionRunners(AccumuloConfiguration conf) { // resize the pool if the property changed ThreadPools.resizePool(pool, conf, Property.MANAGER_FATE_THREADPOOL_SIZE); // If the pool grew, then ensure that there is a TransactionRunner for each thread - int needed = conf.getCount(Property.MANAGER_FATE_THREADPOOL_SIZE) - pool.getQueue().size(); + int needed = conf.getCount(Property.MANAGER_FATE_THREADPOOL_SIZE) - pool.getActiveCount(); if (needed > 0) { for (int i = 0; i < needed; i++) { try { From 7b746cd08fc18ea3e75ea5f5180c1a6661367db5 Mon Sep 17 00:00:00 2001 From: Christopher Tubbs Date: Tue, 17 Dec 2024 12:53:22 -0500 Subject: [PATCH 05/31] Fix KeywordStartIT broken by #5073 * Remove main method and move implementation to the execute method in the new CheckAccumuloProperties command * Add the new KeywordExecutable class to the IT --- .../server/conf/CheckAccumuloProperties.java | 23 ++++++++----------- .../accumulo/test/start/KeywordStartIT.java | 2 ++ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/server/base/src/main/java/org/apache/accumulo/server/conf/CheckAccumuloProperties.java b/server/base/src/main/java/org/apache/accumulo/server/conf/CheckAccumuloProperties.java index e7c7394fece..08684a52085 100644 --- a/server/base/src/main/java/org/apache/accumulo/server/conf/CheckAccumuloProperties.java +++ b/server/base/src/main/java/org/apache/accumulo/server/conf/CheckAccumuloProperties.java @@ -35,17 +35,6 @@ @AutoService(KeywordExecutable.class) public class CheckAccumuloProperties implements KeywordExecutable { - @SuppressFBWarnings(value = "PATH_TRAVERSAL_IN", justification = "intentional user-provided path") - public static void main(String[] args) throws IOException { - Preconditions.checkArgument(args.length == 1, - "Expected 1 argument (the properties file path), got " + args.length); - var hadoopConfig = new Configuration(); - var siteConfig = SiteConfiguration.fromFile(new File(args[0])).build(); - - VolumeManagerImpl.get(siteConfig, hadoopConfig); - new ServerDirs(siteConfig, hadoopConfig); - } - @Override public String keyword() { return "check-accumulo-properties"; @@ -59,8 +48,16 @@ public String description() { + (new CheckServerConfig().keyword()); } + @SuppressFBWarnings(value = "PATH_TRAVERSAL_IN", justification = "intentional user-provided path") @Override - public void execute(String[] args) throws Exception { - main(args); + public void execute(String[] args) throws IOException { + Preconditions.checkArgument(args.length == 1, + "Expected 1 argument (the properties file path), got " + args.length); + var hadoopConfig = new Configuration(); + var siteConfig = SiteConfiguration.fromFile(new File(args[0])).build(); + + VolumeManagerImpl.get(siteConfig, hadoopConfig); + new ServerDirs(siteConfig, hadoopConfig); } + } diff --git a/test/src/main/java/org/apache/accumulo/test/start/KeywordStartIT.java b/test/src/main/java/org/apache/accumulo/test/start/KeywordStartIT.java index 97b75467e59..5579510723b 100644 --- a/test/src/main/java/org/apache/accumulo/test/start/KeywordStartIT.java +++ b/test/src/main/java/org/apache/accumulo/test/start/KeywordStartIT.java @@ -53,6 +53,7 @@ import org.apache.accumulo.miniclusterImpl.MiniClusterExecutable; import org.apache.accumulo.monitor.Monitor; import org.apache.accumulo.monitor.MonitorExecutable; +import org.apache.accumulo.server.conf.CheckAccumuloProperties; import org.apache.accumulo.server.conf.CheckCompactionConfig; import org.apache.accumulo.server.conf.CheckServerConfig; import org.apache.accumulo.server.conf.util.ConfigPropertyUpgrader; @@ -131,6 +132,7 @@ public void testExpectedClasses() { expectSet.put("admin", Admin.class); expectSet.put("check-compaction-config", CheckCompactionConfig.class); expectSet.put("check-server-config", CheckServerConfig.class); + expectSet.put("check-accumulo-properties", CheckAccumuloProperties.class); expectSet.put("compaction-coordinator", CoordinatorExecutable.class); expectSet.put("compactor", CompactorExecutable.class); expectSet.put("config-upgrade", ConfigPropertyUpgrader.class); From 9e146f95a455e3d760fd50bc468373ffc5b3b021 Mon Sep 17 00:00:00 2001 From: Christopher Tubbs Date: Sat, 21 Dec 2024 20:58:22 -0500 Subject: [PATCH 06/31] Backport and improve the cluster scripts to 2.1 (#5174) This backports the improvements to the accumulo-cluster and accumulo-service scripts to 2.1 to improve script maintenance and functionality across the branches. It also adds additional improvements to merge up to the newer branches for 3.1.0 and 4.0.0. Notable differences with the main branch: * This version includes deprecated commands that are replaced by newer versions: start-non-tservers, start-servers, stop-servers, start-tservers, and stop-tservers * This version includes the ability to control starting a compaction-coordinator, which was removed in the main branch for 4.0 * This version does not allow specifying tserver groups (unsupported by its cluster.yaml version) and uses a hard-coded value of "default" * As before these changes, this version checks for old deprecated config files present in conf/ * As before these changes, this version parses the cluster.yaml that was supported for the 2.1 and 3.1 branches * This version is modified to translate the variables emitted from its version of the ClusterConfigParser to the newer names used in the main branch * This version does not require compactor groups to be configured (since external compactions are optional) * This version uses `-a ` instead of `-o general.process.bind.address=` for the bind address * This version uses `-q` and `-g` for the group parameter This includes relevant changes from #4966 to standardize the service names, #5066 to reduce ssh connections to hosts, #5114 to support starting/stopping groups of servers by type, #5126 to support resource group overrides, #5116 to refactor and simplify the script argument parsing using getopt, and the bugfixes and improvements from #5167 Additional improvements: * Give the temporary file a better name, in case it doesn't get cleaned up, it will be easier to figure out what created it * Fix bug with invalid input that stopped processing options after detecting one that was invalid, and failed to honor the remaining good options, leading to serious issues affecting a larger set of servers than intended; this was fixed by checking the exit code from getopt * Also fixes an issue where `stop-here` would communicate with the accumulo manager and gracefully stop all tservers. * Make sure we have the correct getopt tool (and not the older getopt) * Verify the number of non-option parameters is exactly 1 (the command to execute) * Quote the variable containing the accumulo command since it could contain spaces * Remove --all (hasn't been included in any release yet, so that's fine) When I was updating the usage, I realized I couldn't write up a description of this that demonstrated that it made sense to exist; it can only be used when none of the other service selection options are used, but if none of the other service selection options are used, then that's already the behavior, so the flag does nothing other than introduce an error when it's misused; if we add it to be required, then we introduce an error when no options are used, but that's the most readable and intuitive case, so that would be weird; overall, it seemed best to remove the flag; this simplifies the implementation slightly, where we assume all service types, until that assumption is corrected by the use of any other service type flag * add more debugging output for original arguments vs. the parsed ones * This fixes #5196 by moving the parsing of the command into the `parse_args` method after getopt and strictly verifying that only one non-option argument exists (also relaxes the constraint that it has to be the first argument to the script; now it can be anywhere) * add a quote function that uses getopt to do the quoting; as explained in the inline comments, it produces more readable quoted output * standardize function definition syntax (keeps both the optional keyword "function" and the parens that become optional when the keyword is used) * Fix some grammar/spelling of script messages (fixes the typo of "stoping tablet servers", because "stop" command requires a second "p" when putting "ing" on the end) * parse args before doing anything else in the script, since parsing args doesn't depend on anything else, so if there's a problem parsing, that should be noticed before doing anything else * Add colors and improve messages * Fix description and behavior of using server options multiple times (it might be better to restrict being able to call it multiple times) --------- Co-authored-by: Daniel Roberts --- assemble/bin/accumulo-cluster | 1140 ++++++++--------- assemble/bin/accumulo-service | 112 +- assemble/conf/accumulo-env.sh | 32 +- assemble/conf/accumulo.properties | 5 + .../conf/cluster/ClusterConfigParser.java | 7 +- 5 files changed, 674 insertions(+), 622 deletions(-) diff --git a/assemble/bin/accumulo-cluster b/assemble/bin/accumulo-cluster index 130f7adf466..ab39fcec626 100755 --- a/assemble/bin/accumulo-cluster +++ b/assemble/bin/accumulo-cluster @@ -18,648 +18,649 @@ # under the License. # -function print_usage { +# +# Environment variables that can be set to influence the behavior +# of this script +# +# ACCUMULO_LOCALHOST_ADDRESSES - set to a space delimited string of localhost names +# and addresses to override the default lookups +# + +function print_usage() { cat < ( ...) [